diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b595682..4c7b267 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,25 +20,41 @@ stages:
- lint
- test
-format_python:
+python_format:
stage: lint
tags:
- python-lint
script:
- black --check .
-static_check_python:
+python_sort_imports:
+ stage: lint
+ tags:
+ - python-lint
+ script:
+ - isort --check test
+
+python_static_check:
stage: lint
tags:
- python-lint
script:
- pyright .
-run_test0:
+run_ne16_test:
stage: test
tags:
- gap9-sdk
artifacts:
untracked: true
script:
- - cd test && pytest test.py --test-dir tests --recursive
+ - cd test && pytest test.py --test-dir tests --recursive -A ne16
+
+run_neureka_test:
+ stage: test
+ tags:
+ - siracusa-sdk
+ artifacts:
+ untracked: true
+ script:
+ - cd test && pytest test.py --test-dir tests --recursive -A neureka
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48a4461..84b516f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,25 @@
# Changelog
+## [Unreleased]
+
+### Added
+
+- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels
+- Support for kernels without normalization and quantization for NE16
+- isort check
+- publication citation
+
+### Changed
+
+- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
+- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
+- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
+
+### Removed
+
+- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2`
+- `mode` attribute from `ne16_quant_t` structure
+
## [0.3.0] - 2024-01-14
### Added
diff --git a/README.md b/README.md
index be8c9be..1671dc7 100644
--- a/README.md
+++ b/README.md
@@ -39,51 +39,22 @@ _Note: The accelerator can provide additional helper functions if needed._
## Accelerators
-### NE16
-
-Github repo [link](https://github.com/pulp-platform/ne16).
-
-#### Implemented features
-
-- [x] Convolution w/ kernel shape 1x1
-- [x] Convolution w/ kernel shape 3x3
-- [x] Depthwise convolution w/ kernel shape 3x3
-- [x] Stride 1x1
-- [x] Stride 2x2
-- [ ] Normalization and quantization
- - [x] With
- - [ ] Without
- - [x] Relu (w/ and w/o)
- - [x] Bias (w/ and w/o)
- - [ ] Per-channel shift
- - [x] Per-layer shift
- - [ ] Rounding
-- [ ] Input type
- - [x] uint8
- - [ ] uint16
-- [ ] Output type
- - [x] int8
- - [x] uint8 (only w/ Relu)
- - [ ] int32
- - [ ] uint32 (only w/ Relu)
-- [ ] Scale type
- - [x] uint8
- - [ ] uint16
- - [ ] uint32
-- [x] Bias type
- - [x] int32
-- [ ] Weight type
- - [x] int8
- - [ ] int2-7
-
-### Neureka
-
-**Untested and considered broken.**
+- [NE16](ne16/README.md)
+- [Neureka](neureka/README.md)
## Testing
You can find information about testing in the dedicated [README](test/README.md).
+### Environment
+
+The library was tested with following pairs of SDKs and compilers:
+
+| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash |
+| --- | --------------- | -------- | -------------------- |
+| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 |
+| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |
+
## Contributing
Bug reports and feature requests should be reported through issues.
@@ -93,15 +64,38 @@ All the development should be done through forks and merged onto the `dev` branc
The library will follow the [Semantic Versioning](https://semver.org/).
-## Citing
+## Publication
+
+
+If you use PULP-NNX in your work, you can cite us:
+
+```
+@inproceedings{10.1145/3607889.3609092,
+ author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco},
+ title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study},
+ year = {2024},
+ isbn = {9798400702907},
+ publisher = {Association for Computing Machinery},
+ address = {New York, NY, USA},
+ url = {https://doi.org/10.1145/3607889.3609092},
+ doi = {10.1145/3607889.3609092},
+ abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.},
+ booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems},
+ pages = {9–10},
+ numpages = {2},
+ keywords = {TinyML, MCUs, deep learning, HW accelerators},
+ location = {, Hamburg, Germany, },
+ series = {CASES '23 Companion}
+}
+```
-*TBA*
+
## Contributors
* Luka Macan <[luka.macan@unibo.it](mailto:luka.macan@unibo.it)>
* Francesco Conti <[fconti@unibo.it](mailto:fconti@unibo.it)>
-* Arpan Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>
+* Arpan Suravi Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>
## License
diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h
index eff9a60..97e6e2e 100644
--- a/inc/pulp_nnx_ne16.h
+++ b/inc/pulp_nnx_ne16.h
@@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev);
/** ne16_nnx_dispatch
*
* Dispatch a task to the accelerator.
- * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns
+ * 0.
*/
int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task);
@@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task);
*/
void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
-
/* Additional helper functions */
/** ne16_nnx_dispatch_stride2x2
@@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
* tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
* Works only if the k_out is divisible by 2.
*/
-void ne16_nnx_dispatch_stride2x2(
- ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
- const uint32_t w_in_stride, const uint32_t k_in_stride,
- const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t h_ker, const uint8_t w_ker);
+void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
+ const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out, const uint8_t h_ker,
+ const uint8_t w_ker);
diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h
new file mode 100644
index 0000000..25ef4a8
--- /dev/null
+++ b/inc/pulp_nnx_neureka.h
@@ -0,0 +1,61 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface */
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
+void neureka_nnx_term(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int neureka_nnx_dispatch_check(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns
+ * 0.
+ */
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_wait
+ *
+ * Block until you can resolve the task.
+ */
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
diff --git a/ne16/README.md b/ne16/README.md
new file mode 100644
index 0000000..9f05956
--- /dev/null
+++ b/ne16/README.md
@@ -0,0 +1,36 @@
+# NE16
+
+## Docs
+
+- Github repo [link](https://github.com/pulp-platform/ne16).
+
+## Implemented features
+
+- [x] Convolution w/ kernel shape 1x1
+- [x] Convolution w/ kernel shape 3x3
+- [x] Depthwise convolution w/ kernel shape 3x3
+- [x] Stride 2x2
+- [ ] Normalization and quantization
+ - [x] With
+ - [x] Without
+ - [x] Relu (w/ and w/o)
+ - [x] Bias (w/ and w/o)
+ - [ ] Per-channel shift
+ - [x] Per-layer shift
+ - [ ] Rounding
+- [ ] Input type
+ - [x] uint8
+ - [ ] uint16
+- [ ] Output type
+ - [x] int8
+ - [x] uint8 (only w/ Relu)
+ - [x] int32
+- [ ] Scale type
+ - [x] uint8
+ - [ ] uint16
+ - [ ] uint32
+- [x] Bias type
+ - [x] int32
+- [ ] Weight type
+ - [x] int8
+ - [ ] int2-7
diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c
index 97859b4..d92a7d5 100644
--- a/ne16/hal/ne16.c
+++ b/ne16/hal/ne16.c
@@ -23,8 +23,6 @@
#define NE16_STATUS_EMPTY (0x000)
#define NE16_STATUS_FULL (0x101)
-inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; }
-
inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) {
uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
return (status & 0x1) + ((status >> 8) & 0x1);
diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h
index c4c3a19..88ebee7 100644
--- a/ne16/hal/ne16.h
+++ b/ne16/hal/ne16.h
@@ -24,11 +24,12 @@
#include "hwpe.h"
#include <stdint.h>
+#define NE16_TASK_QUEUE_SIZE (2)
+
typedef struct ne16_dev_t {
hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
} ne16_dev_t;
-int ne16_task_queue_size(ne16_dev_t *dev);
int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev);
int ne16_task_queue_empty(ne16_dev_t *dev);
int ne16_task_queue_full(ne16_dev_t *dev);
diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c
index 0ba54d5..f8408da 100644
--- a/ne16/hal/ne16_task.c
+++ b/ne16/hal/ne16_task.c
@@ -22,9 +22,9 @@
#include "ne16_task_defs.h"
#include "pulp_nnx_util.h"
-inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
- uint32_t i_width, uint32_t n_height,
- uint32_t n_width) {
+uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width) {
uint32_t tile_padding = padding;
if (i_height > 0) {
tile_padding &= ~(0xf << 28);
@@ -41,41 +41,65 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
return tile_padding;
}
-void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
- const uint8_t depthwise, const uint8_t input_bits,
- const uint8_t output_bits, const uint8_t weights_bits,
- const ne16_weight_offset_mode_e weights_offset_mode,
- const uint32_t weights_offset_factor, ne16_quant_t quant,
- ne16_norm_t norm, const uint8_t stride) {
- const uint32_t flag_mode16 =
- input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC;
-
- *task = (ne16_task_t){
- .outbytes = output_bits / 8,
- .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16
- : NE16_WEIGHT_D0_STRIDE_MODE8,
- .qw = weights_bits,
- .stride_shift = stride == 2 ? 1 : 0,
- .output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT
- : NE16_OUTPUT_CHANNEL_THROUGHPUT,
- .kernel_shape = kernel_shape,
- .depthwise = depthwise,
- .data = {0}};
-
- const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0;
+void ne16_task_init(ne16_task_t *task) { *task = (ne16_task_t){.data = {0}}; }
+void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride) {
+ task->depthwise = depthwise;
+ task->kernel_shape = kernel_shape;
+ task->subtile_output_channel =
+ depthwise ? NE16_SUBTILE_INPUT_CHANNEL : NE16_SUBTILE_OUTPUT_CHANNEL;
const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1
: depthwise == 1 ? NE16_FLAG_MODE_3x3_DW
: NE16_FLAG_MODE_3x3;
+ const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0;
+
+ task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE | NE16_MASK_FLAG_STRIDE_2x2);
+ task->data.cfg.conf0 |= flag_mode | flag_stride2x2;
+}
+
+void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits, const uint8_t weight_bits) {
+ const uint32_t flag_mode16 =
+ input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC;
+
+ ne16_quant_mode_e quantMode;
+ if (output_bits == 16) {
+ quantMode = quantMode16Bit;
+ } else if (output_bits == 8) {
+ quantMode = quantMode8Bit;
+ } else {
+ quantMode = quantMode32Bit;
+ }
+
+ task->weight_d0_stride =
+ flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8;
+ task->qw = weight_bits;
+ task->data.cfg.conf0 &= ~(NE16_MASK_QUANT_MODE | NE16_MASK_FLAG_MODE16 |
+ NE16_MASK_FLAG_WEIGHT_BITS);
+ task->data.cfg.conf0 |= quantMode | flag_mode16 | (weight_bits - 1);
+}
+
+void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant,
+ ne16_norm_t norm) {
+ task->data.cfg.conf0 &=
+ ~(NE16_MASK_QUANT_FUNCTION | NE16_MASK_SHIFT_AMOUNT |
+ NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE |
+ NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT);
task->data.cfg.conf0 |=
- NE16_FLAG_NORM_QUANT | quant.function | quant.mode |
- (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING |
- norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS |
- norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode |
- flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2;
+ NE16_FLAG_NORM_QUANT | quant.function | (quant.shift_amount << 16) |
+ quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | norm.mode |
+ norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS |
+ norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT;
+}
- task->data.cfg.weight_offset_factor = weights_offset_factor;
+void ne16_task_set_weight_offset(ne16_task_t *task,
+ ne16_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset) {
+ task->data.cfg.conf0 &= ~NE16_MASK_WEIGHT_OFFSET_MODE;
+ task->data.cfg.conf0 |= weight_offset_mode;
+ task->data.cfg.weight_offset_factor = weight_offset;
}
/** ne16_pad_ptr
@@ -84,21 +108,18 @@ void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
* it was the start to the padded data.
* Necessary for input pointer when it's padded.
*/
-inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
- const uint32_t channel, const uint8_t bits,
- const uint8_t padding_top,
- const uint8_t padding_left) {
- return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
+ const uint8_t padding_top, const uint8_t padding_left) {
+ return ptr - (padding_top * width + padding_left) * width_stride;
}
-inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr,
- uint32_t w_in, uint32_t k_in, uint8_t bits_in,
- uint8_t padding_top, uint8_t padding_left,
- uint32_t output_ptr, uint32_t weights_ptr,
- uint32_t scale_ptr, uint32_t shift_ptr,
- uint32_t bias_ptr) {
+void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
+ uint32_t w_in_stride, uint8_t padding_top,
+ uint8_t padding_left, uint32_t output_ptr,
+ uint32_t weights_ptr, uint32_t scale_ptr,
+ uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.infeat_ptr =
- ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+ ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
task->data.scale_ptr = scale_ptr;
@@ -107,100 +128,101 @@ inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr,
}
void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
const uint32_t w_in_stride,
- const uint32_t k_in_stride,
- const uint32_t w_out_stride,
- const uint32_t k_out_stride) {
- const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride) {
+ const uint32_t num_k_in =
+ nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
const ne16_stride_t input_stride = {
- .d0 = k_in_stride,
- .d1 = k_in_stride * w_in_stride,
- .d2 = task->depthwise ? 0
- : k_in_stride * NE16_FILTER_BUFFER_SIZE *
- NE16_FILTER_BUFFER_SIZE};
+ .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
task->data.cfg.input_stride = input_stride;
- // WARNING: Stride works only for even output channel sizes (divisible by 2)
- const ne16_stride_t output_stride = {
- .d0 = 32,
- .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
- .d2 =
- (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift};
+ const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES,
+ .d1 = w_out_stride,
+ .d2 = h_out_stride};
task->data.cfg.output_stride = output_stride;
if (task->kernel_shape == 1) {
task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw;
task->data.cfg.weights_stride.d1 =
task->weight_d0_stride * task->qw * num_k_in;
- task->data.cfg.weights_stride.d2 = 0;
} else if (!task->depthwise) {
task->data.cfg.weights_stride.d0 =
NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride;
task->data.cfg.weights_stride.d1 = NE16_FILTER_SIZE * NE16_FILTER_SIZE *
task->weight_d0_stride * task->qw *
num_k_in;
- task->data.cfg.weights_stride.d2 = 0;
} else {
task->data.cfg.weights_stride.d0 =
NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride;
task->data.cfg.weights_stride.d1 = 0;
- task->data.cfg.weights_stride.d2 = 0;
}
+ task->data.cfg.weights_stride.d2 = 0;
}
void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
const uint32_t h_out, const uint32_t w_out,
const uint32_t k_out, const uint8_t padding_bottom,
const uint8_t padding_right) {
- const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput);
- const uint16_t num_Ki = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
- const uint16_t num_Ho = divnceil(h_out, NE16_FILTER_SIZE);
- const uint16_t num_Wo = divnceil(w_out, NE16_FILTER_SIZE);
-
- const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
- const uint16_t rem_Ki = remainder(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
- const uint16_t rem_Ho = remainder(h_out, NE16_FILTER_SIZE);
- const uint16_t rem_Wo = remainder(w_out, NE16_FILTER_SIZE);
+ const uint16_t num_Ko =
+ nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+ const uint16_t num_Ki =
+ nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+ const uint16_t num_Ho =
+ nnx_calculate_number_of_tiles(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t num_Wo =
+ nnx_calculate_number_of_tiles(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
+
+ const uint16_t rem_Ko =
+ nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+ const uint16_t rem_Ki =
+ nnx_calculate_last_tile_size(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+ const uint16_t rem_Ho =
+ nnx_calculate_last_tile_size(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t rem_Wo =
+ nnx_calculate_last_tile_size(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
const uint16_t rem_Hi =
(task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
const uint16_t rem_Wi =
(task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
const ne16_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
+ .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+ .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+ .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+ .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+ .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
task->data.cfg.subtile = subtile;
}
-inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
- const uint8_t bottom, const uint8_t left,
- const uint8_t right, const uint8_t value) {
+void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value) {
task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
((bottom & 0xf) << 20) | ((left & 0xf) << 16) |
(value & 0xff);
}
-inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
- const uint8_t right, const uint8_t bottom,
- const uint8_t left) {
+void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride,
- const uint32_t k_in_stride, const uint32_t h_out,
+ const uint32_t k_in, const uint32_t h_in_stride,
+ const uint32_t w_in_stride, const uint32_t h_out,
const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride, const uint8_t padding_top,
+ const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left) {
- ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
- k_out_stride);
+ ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
+ w_out_stride);
ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
padding_right);
ne16_task_set_padding(task, padding_top, padding_bottom, padding_left,
@@ -209,18 +231,20 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
void ne16_task_set_dims_stride2x2(
ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+ const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride,
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t stride = 2;
- ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
- k_out_stride);
+ // WARNING: works only for even output channel stride (divisible by 2)
+ ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride >> 1,
+ w_out_stride >> 1);
ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
- k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0);
+ k_out, h_in + padding_top >= 5 ? 0 : padding_bottom,
+ 0);
const uint8_t padding_bottom_new =
(h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h
index df16b6c..69bc78c 100644
--- a/ne16/hal/ne16_task.h
+++ b/ne16/hal/ne16_task.h
@@ -60,7 +60,6 @@ typedef enum ne16_quant_function_e {
typedef struct ne16_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
- ne16_quant_mode_e mode;
ne16_quant_function_e function;
int flag_rounding;
} ne16_quant_t;
@@ -110,38 +109,46 @@ typedef struct ne16_task_data_t {
typedef struct ne16_task_t {
ne16_task_data_t data;
- uint8_t outbytes;
uint8_t weight_d0_stride;
uint8_t qw;
- uint8_t stride_shift;
- uint8_t output_channel_throughput;
+ uint8_t subtile_output_channel;
uint8_t kernel_shape;
uint8_t depthwise;
uint8_t id;
} ne16_task_t;
-void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
- const uint8_t depthwise, const uint8_t input_bits,
- const uint8_t output_bits, const uint8_t weights_bits,
- const ne16_weight_offset_mode_e weights_offset_mode,
- const uint32_t weights_offset_factor, ne16_quant_t quant,
- ne16_norm_t norm, const uint8_t stride);
+void ne16_task_init(ne16_task_t *task);
+void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride);
+void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits, const uint8_t weight_bits);
+void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant,
+ ne16_norm_t norm);
+void ne16_task_set_weight_offset(ne16_task_t *task,
+ ne16_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset);
uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t i_width, uint32_t n_height,
uint32_t n_width);
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
- const uint32_t channel, const uint8_t bits,
- const uint8_t padding_top, const uint8_t padding_left);
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left);
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
- uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+ uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
+/** ne16_task_set_strides
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
const uint32_t w_in_stride,
- const uint32_t k_in_stride,
- const uint32_t w_out_stride,
- const uint32_t k_out_stride);
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride);
void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
const uint32_t h_out, const uint32_t w_out,
const uint32_t k_out, const uint8_t padding_bottom,
@@ -152,19 +159,32 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
+/** ne16_task_set_dims
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride,
- const uint32_t k_in_stride, const uint32_t h_out,
+ const uint32_t k_in, const uint32_t h_in_stride,
+ const uint32_t w_in_stride, const uint32_t h_out,
const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride, const uint8_t padding_top,
+ const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left);
+/** ne16_task_set_dims_stride2x2
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_dims_stride2x2(
ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+ const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride,
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left);
diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h
index 803e30e..d3d7297 100644
--- a/ne16/hal/ne16_task_defs.h
+++ b/ne16/hal/ne16_task_defs.h
@@ -25,8 +25,13 @@
#define NE16_FILTER_SIZE (3)
#define NE16_FILTER_BUFFER_SIZE (5)
-#define NE16_INPUT_CHANNEL_THROUGHPUT (16)
-#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32)
+#define NE16_SUBTILE_INPUT_HEIGHT (5)
+#define NE16_SUBTILE_INPUT_WIDTH (5)
+#define NE16_SUBTILE_INPUT_CHANNEL (16)
+#define NE16_SUBTILE_OUTPUT_HEIGHT (3)
+#define NE16_SUBTILE_OUTPUT_WIDTH (3)
+#define NE16_SUBTILE_OUTPUT_CHANNEL (32)
+#define NE16_OUTPUT_BANDWIDTH_BYTES (32)
#define NE16_WEIGHT_D0_STRIDE_MODE8 (2)
#define NE16_WEIGHT_D0_STRIDE_MODE16 (1)
@@ -59,12 +64,6 @@
#define NE16_REG_FILTER_MASKING 22
#define NE16_REG_CONF0 23
-/* SHIFT */
-
-#define NE16_SHIFT_FLAG_NORM_BIAS (25)
-#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
-#define NE16_SHIFT_ROUNDING (11)
-
/* CONF0 FLAGS */
#define NE16_FLAG_NORM_BIAS (1 << 25)
@@ -81,7 +80,7 @@
#define NE16_NORM_MODE_8BIT (0 << 12)
#define NE16_NORM_MODE_16BIT (1 << 12)
#define NE16_NORM_MODE_32BIT (2 << 12)
-#define NE16_FLAG_ROUND (1 << 11)
+#define NE16_FLAG_ROUNDING (1 << 11)
#define NE16_FLAG_STRIDE_2x2 (1 << 8)
#define NE16_FLAG_LINEAR_MODE (1 << 7)
#define NE16_FLAG_MODE_3x3 (0 << 5)
@@ -91,10 +90,26 @@
#define NE16_FLAG_MODE_BASIC (0 << 3)
#define NE16_FLAG_MODE16 (1 << 3)
+/* SHIFT */
+
+#define NE16_SHIFT_FLAG_NORM_BIAS (25)
+#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
+#define NE16_SHIFT_FLAG_ROUNDING (11)
+
/* Masks */
-#define NE16_MASK_QUANT_FUNCTION (1 << 23)
-#define NE16_MASK_QUANT_MODE (3 << 21)
+#define NE16_MASK_FLAG_NORM_BIAS (0x1 << 25)
+#define NE16_MASK_FLAG_NORM_SHIFT (0x1 << 24)
+#define NE16_MASK_QUANT_FUNCTION (0x1 << 23)
+#define NE16_MASK_QUANT_MODE (0x3 << 21)
+#define NE16_MASK_SHIFT_AMOUNT (0x1f << 16)
+#define NE16_MASK_WEIGHT_OFFSET_MODE (0x1 << 15)
+#define NE16_MASK_NORM_MODE (0x3 << 12)
+#define NE16_MASK_FLAG_ROUNDING (0x1 << 11)
+#define NE16_MASK_FLAG_STRIDE_2x2 (0x1 << 8)
+#define NE16_MASK_FLAG_MODE (0x3 << 5)
+#define NE16_MASK_FLAG_MODE16 (0x1 << 3)
+#define NE16_MASK_FLAG_WEIGHT_BITS (0x7 << 0)
/* PADDING */
diff --git a/neureka/README.md b/neureka/README.md
new file mode 100644
index 0000000..9c83f4e
--- /dev/null
+++ b/neureka/README.md
@@ -0,0 +1,34 @@
+# Neureka
+
+## Docs
+
+Github repo [link](https://github.com/siracusa-soc/ne).
+
+## Implemented features
+
+- [x] Convolution w/ kernel shape 1x1
+- [x] Convolution w/ kernel shape 3x3
+- [x] Depthwise convolution w/ kernel shape 3x3
+- [ ] Normalization and quantization
+ - [x] With
+ - [x] Without
+ - [x] Relu (w/ and w/o)
+ - [x] Bias (w/ and w/o)
+ - [ ] Per-channel shift
+ - [x] Per-layer shift
+ - [ ] Rounding
+- [x] Input type
+ - [x] uint8
+ - [x] int8
+- [x] Output type
+ - [x] int8
+ - [x] uint8 (only w/ Relu)
+ - [x] int32
+- [ ] Scale type
+ - [x] uint8
+ - [ ] uint32
+- [x] Bias type
+ - [x] int32
+- [ ] Weight type
+ - [x] int8
+ - [ ] int2-7
diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
new file mode 100644
index 0000000..57136fd
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -0,0 +1,78 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_siracusa_bsp.h"
+#include <pmsis.h>
+
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \
+ (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \
+ NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NEUREKA_SIRACUSA_MAX_STALL (8)
+#define NEUREKA_SIRACUSA_EVENT (1 << 12)
+#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
+
+void neureka_siracusa_hci_setpriority_neureka() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+ NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+void neureka_siracusa_hci_setpriority_core() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+ ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+void neureka_siracusa_hci_reset_max_stall() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+ ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+ max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf) {
+ neureka_siracusa_hci_setpriority_neureka();
+ neureka_siracusa_hci_set_max_stall(conf->max_stall);
+}
+
+void neureka_siracusa_close() {
+ neureka_siracusa_hci_reset_max_stall();
+ neureka_siracusa_hci_setpriority_core();
+}
+
+void neureka_siracusa_event_wait_and_clear() {
+ eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT);
+}
+
+static const neureka_dev_t neureka_siracusa_dev = {
+ .hwpe_dev = (struct hwpe_dev_t){
+ .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}};
+
+const neureka_dev_t *neureka_siracusa_get_dev() {
+ return &neureka_siracusa_dev;
+}
diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h
new file mode 100644
index 0000000..be75a20
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.h
@@ -0,0 +1,67 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_siracusa_BSP_H__
+#define __NEUREKA_siracusa_BSP_H__
+
+#include "neureka.h"
+#include <stdint.h>
+
+/**
+ * neureka_siracusa_hci_setpriority_neureka
+ *
+ * Set HCI interconnect bus priority to prioritize neureka.
+ */
+void neureka_siracusa_hci_setpriority_neureka();
+
+/**
+ * neureka_siracusa_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void neureka_siracusa_hci_setpriority_core();
+
+/**
+ * neureka_siracusa_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: check whether this also disables maxstall or only resets it.
+ */
+void neureka_siracusa_hci_reset_max_stall();
+
+/**
+ * neureka_siracusa_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines how many cycles
+ * the HCI bus will stall the lower-priority master, i.e. neureka or core,
+ * before letting it do a transaction.
+ */
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall);
+
+typedef struct neureka_siracusa_conf_t {
+ int max_stall;
+} neureka_siracusa_conf_t;
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf);
+void neureka_siracusa_close();
+void neureka_siracusa_event_wait_and_clear();
+const neureka_dev_t *neureka_siracusa_get_dev();
+
+#endif // !__NEUREKA_siracusa_BSP_H__
diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h
new file mode 100644
index 0000000..37eeab0
--- /dev/null
+++ b/neureka/gvsoc/neureka_gvsoc.h
@@ -0,0 +1,54 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_GVSOC_H__
+#define __NEUREKA_GVSOC_H__
+
+#include "neureka.h"
+#include "neureka_task.h"
+
+#define NEUREKA_REG_GVSOC_LOG_LEVEL 24
+#define NEUREKA_REG_GVSOC_LOG_FORMAT 25
+
+typedef enum neureka_gvsoc_log_format_e {
+ NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0,
+ NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3
+} neureka_gvsoc_log_format_e;
+
+typedef enum neureka_gvsoc_log_level_e {
+ NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0,
+ NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1,
+ NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2,
+ NEUREKA_GVSOC_LOG_LEVEL_ALL = 3
+} neureka_gvsoc_log_level_e;
+
+static void neureka_gvsoc_log_activate(neureka_dev_t *dev,
+ neureka_gvsoc_log_level_e log_level,
+ neureka_gvsoc_log_format_e format) {
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level);
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format);
+}
+
+static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) {
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL,
+ NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END);
+}
+
+#endif // __NEUREKA_GVSOC_H__
diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c
similarity index 56%
rename from neureka/inc/pulp_nnx_error_codes.h
rename to neureka/hal/neureka.c
index dc71575..dc829d9 100644
--- a/neureka/inc/pulp_nnx_error_codes.h
+++ b/neureka/hal/neureka.c
@@ -18,15 +18,20 @@
* SPDX-License-Identifier: Apache-2.0
*/
-#ifndef __NE16_ERROR_CODES_H__
-#define __NE16_ERROR_CODES_H__
+#include "neureka.h"
-typedef enum {
- success = 0,
- weightBitwidthOutOfBounds,
- unsupportedWeightOffsetMode,
- unsupportedFeatureBitwidth,
- dimensionMismatch
-} nnx_error_code;
+#define NEUREKA_STATUS_EMPTY (0x000)
+#define NEUREKA_STATUS_FULL (0x101)
-#endif // __NE16_ERROR_CODES_H__
\ No newline at end of file
+inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) {
+ uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
+ return (status & 0x1) + ((status >> 8) & 0x1);
+}
+
+inline int neureka_task_queue_empty(neureka_dev_t *dev) {
+ return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY;
+}
+
+inline int neureka_task_queue_full(neureka_dev_t *dev) {
+ return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL;
+}
diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h
similarity index 62%
rename from neureka/src/pulp_nnx_util.c
rename to neureka/hal/neureka.h
index daaaf2b..eae77a1 100644
--- a/neureka/src/pulp_nnx_util.c
+++ b/neureka/hal/neureka.h
@@ -18,13 +18,20 @@
* SPDX-License-Identifier: Apache-2.0
*/
-#include "pulp_nnx_util.h"
-#include "pulp_nnx_hal.h"
+#ifndef __NEUREKA_H__
+#define __NEUREKA_H__
-void nnx_activate_gvsoc_logging(int log_level) {
- NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level);
-}
+#include "hwpe.h"
+#include <stdint.h>
-void nnx_deactivate_gvsoc_logging() {
- NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0);
-}
+#define NEUREKA_TASK_QUEUE_SIZE (2)
+
+typedef struct neureka_dev_t {
+ hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} neureka_dev_t;
+
+int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
+int neureka_task_queue_empty(neureka_dev_t *dev);
+int neureka_task_queue_full(neureka_dev_t *dev);
+
+#endif // __NEUREKA_H__
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
new file mode 100644
index 0000000..501b2b9
--- /dev/null
+++ b/neureka/hal/neureka_task.c
@@ -0,0 +1,239 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_task.h"
+#include "neureka_task_defs.h"
+#include "pulp_nnx_util.h"
+
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width) {
+ uint32_t tile_padding = padding;
+ if (i_height > 0) {
+ tile_padding &= ~(0xf << 28);
+ }
+ if (i_width < n_width - 1) {
+ tile_padding &= ~(0xf << 24);
+ }
+ if (i_height < n_height - 1) {
+ tile_padding &= ~(0xf << 20);
+ }
+ if (i_width > 0) {
+ tile_padding &= ~(0xf << 16);
+ }
+ return tile_padding;
+}
+
+void neureka_task_init(neureka_task_t *task) {
+ *task = (neureka_task_t){.data = {0}};
+}
+
+void neureka_task_set_op_to_conv(neureka_task_t *task,
+ const uint8_t kernel_shape,
+ const uint8_t depthwise,
+ const uint8_t stride) {
+ task->depthwise = depthwise;
+ task->kernel_shape = kernel_shape;
+ task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
+ : NEUREKA_SUBTILE_OUTPUT_CHANNEL;
+ task->subtile_input_channel = kernel_shape == 3
+ ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
+ : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1;
+
+ const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1
+ : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW
+ : NEUREKA_FLAG_MODE_3x3;
+
+ task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE);
+ task->data.cfg.conf0 |= flag_mode;
+}
+
+void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits,
+ const uint8_t weight_bits) {
+ neureka_quant_mode_e quantMode;
+ if (output_bits == 8) {
+ quantMode = quantMode8Bit;
+ } else {
+ quantMode = quantMode32Bit;
+ }
+
+ task->qw = weight_bits;
+ task->data.cfg.conf0 &=
+ ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS);
+ task->data.cfg.conf0 |= quantMode | (weight_bits - 1);
+}
+
+void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant,
+ neureka_norm_t norm) {
+ task->data.cfg.conf0 &=
+ ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT |
+ NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS |
+ NEUREKA_MASK_FLAG_NORM_SHIFT);
+ task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function |
+ (quant.shift_amount << 16) | norm.mode |
+ norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
+ norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT;
+}
+
+void neureka_task_set_weight_offset(
+ neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset) {
+ task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE;
+ task->data.cfg.conf0 |= weight_offset_mode;
+ task->data.cfg.weight_offset_factor = weight_offset;
+}
+
+void neureka_task_set_input_signed(neureka_task_t *task) {
+ task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED;
+}
+
+void neureka_task_set_input_unsigned(neureka_task_t *task) {
+ task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED;
+}
+
+void neureka_task_set_weight_source(neureka_task_t *task,
+ neureka_weight_source_e weight_source) {
+ task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE;
+ task->data.cfg.conf0 |= weight_source;
+}
+
+/** neureka_pad_ptr
+ *
+ * Calculate the pointer as if it pointed to the start
+ * of the padded data rather than the unpadded data.
+ * Necessary for the input pointer when the input is padded.
+ */
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left) {
+ return ptr - (padding_top * width + padding_left) * width_stride;
+}
+
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+ uint32_t w_in, uint32_t w_in_stride,
+ uint8_t padding_top, uint8_t padding_left,
+ uint32_t output_ptr, uint32_t weights_ptr,
+ uint32_t scale_ptr, uint32_t shift_ptr,
+ uint32_t bias_ptr) {
+ task->data.infeat_ptr =
+ neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
+ task->data.outfeat_ptr = output_ptr;
+ task->data.weights_ptr = weights_ptr;
+ task->data.scale_ptr = scale_ptr;
+ task->data.scale_shift_ptr = shift_ptr;
+ task->data.scale_bias_ptr = bias_ptr;
+}
+
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
+ const uint32_t w_in_stride,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride) {
+ const uint32_t num_k_in =
+ nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
+
+ const neureka_stride_t input_stride = {
+ .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
+ task->data.cfg.input_stride = input_stride;
+
+ const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES,
+ .d1 = w_out_stride,
+ .d2 = h_out_stride};
+ task->data.cfg.output_stride = output_stride;
+
+ task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES;
+ if (task->kernel_shape == 1) { // 1x1
+ task->data.cfg.weights_stride.d1 =
+ NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in;
+ } else if (!task->depthwise) { // 3x3
+ task->data.cfg.weights_stride.d1 =
+ NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in;
+ } else { // 3x3 depthwise
+ task->data.cfg.weights_stride.d1 = 0;
+ }
+ task->data.cfg.weights_stride.d2 = 0;
+}
+
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out,
+ const uint8_t padding_bottom,
+ const uint8_t padding_right) {
+ const uint16_t num_Ko =
+ nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+ const uint16_t num_Ki =
+ nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
+ const uint16_t num_Ho =
+ nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t num_Wo =
+ nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+
+ const uint16_t rem_Ko =
+ nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+ const uint16_t rem_Ki =
+ nnx_calculate_last_tile_size(k_in, task->subtile_input_channel);
+ const uint16_t rem_Ho =
+ nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t rem_Wo =
+ nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+ const uint16_t rem_Hi =
+ (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
+ const uint16_t rem_Wi =
+ (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
+
+ const neureka_subtile_t subtile = {
+ .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+ .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+ .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+ .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+ .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
+ task->data.cfg.subtile = subtile;
+}
+
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value) {
+ task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
+ ((bottom & 0xf) << 20) | ((left & 0xf) << 16) |
+ (value & 0xff);
+}
+
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left) {
+ task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
+ ((bottom & 0xff) << 8) | ((left & 0xff) << 0);
+}
+
+void neureka_task_set_dims(
+ neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_in_stride, const uint32_t w_in_stride,
+ const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
+ const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint8_t padding_right, const uint8_t padding_left) {
+ neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
+ w_out_stride);
+ neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+ padding_right);
+ neureka_task_set_padding(task, padding_top, padding_bottom, padding_left,
+ padding_right, 0);
+}
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
new file mode 100644
index 0000000..2d06468
--- /dev/null
+++ b/neureka/hal/neureka_task.h
@@ -0,0 +1,187 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_TASK_H__
+#define __NEUREKA_TASK_H__
+
+#include "neureka_task_defs.h"
+#include <stdint.h>
+
+typedef enum neureka_task_flag_e {
+ neurekaTaskFlagFalse = 0,
+ neurekaTaskFlagTrue = 1
+} neureka_task_flag_e;
+
+typedef enum neureka_weight_source_e {
+ neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM,
+ neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM
+} neureka_weight_source_e;
+
+typedef enum neureka_weight_offset_mode_e {
+ weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
+ weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
+} neureka_weight_offset_mode_e;
+
+typedef enum {
+ normMode8Bit = NEUREKA_NORM_MODE_8BIT,
+ normMode32Bit = NEUREKA_NORM_MODE_32BIT
+} neureka_norm_mode_e;
+
+typedef struct neureka_norm_t {
+ neureka_norm_mode_e mode;
+ int flag_bias;
+ int flag_shift;
+} neureka_norm_t;
+
+typedef enum neureka_quant_mode_e {
+ quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
+ quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
+} neureka_quant_mode_e;
+
+typedef enum neureka_quant_function_e {
+ quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
+ quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
+} neureka_quant_function_e;
+
+typedef struct neureka_quant_t {
+ // Shift amount must be in range 0x00-0x1F
+ unsigned shift_amount;
+ neureka_quant_function_e function;
+ int flag_rounding;
+} neureka_quant_t;
+
+typedef struct neureka_stride_t {
+ uint32_t d0;
+ uint32_t d1;
+ uint32_t d2;
+} neureka_stride_t;
+
+typedef struct neureka_subtile_remainder_t {
+ uint32_t KoKi;
+ uint32_t HoWo;
+ uint32_t HiWi;
+} neureka_subtile_remainder_t;
+
+typedef struct neureka_subtile_number_t {
+ uint32_t KoKi;
+ uint32_t HoWo;
+} neureka_subtile_number_t;
+
+typedef struct neureka_subtile_t {
+ neureka_subtile_remainder_t remainder;
+ neureka_subtile_number_t number;
+} neureka_subtile_t;
+
+typedef struct neureka_cfg_t {
+ neureka_stride_t input_stride;
+ neureka_stride_t output_stride;
+ neureka_stride_t weights_stride;
+ neureka_subtile_t subtile;
+ uint32_t padding;
+ uint32_t weight_offset_factor;
+ uint32_t filter_mask;
+ uint32_t conf0;
+} neureka_cfg_t;
+
+typedef struct neureka_task_data_t {
+ uint32_t weights_ptr;
+ uint32_t infeat_ptr;
+ uint32_t outfeat_ptr;
+ uint32_t scale_ptr;
+ uint32_t scale_shift_ptr;
+ uint32_t scale_bias_ptr;
+ neureka_cfg_t cfg;
+} neureka_task_data_t;
+
+typedef struct neureka_task_t {
+ neureka_task_data_t data;
+ uint8_t qw;
+ uint8_t subtile_output_channel;
+ uint8_t subtile_input_channel;
+ uint8_t kernel_shape;
+ uint8_t depthwise;
+ uint8_t id;
+} neureka_task_t;
+
+void neureka_task_init(neureka_task_t *task);
+void neureka_task_set_op_to_conv(neureka_task_t *task,
+ const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride);
+void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits,
+ const uint8_t weight_bits);
+void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant,
+ neureka_norm_t norm);
+void neureka_task_set_weight_offset(
+ neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset);
+void neureka_task_set_input_signed(neureka_task_t *task);
+void neureka_task_set_input_unsigned(neureka_task_t *task);
+void neureka_task_set_weight_source(neureka_task_t *task,
+ neureka_weight_source_e weight_source);
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width);
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left);
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+ uint32_t w_in, uint32_t w_in_stride,
+ uint8_t padding_top, uint8_t padding_left,
+ uint32_t output_ptr, uint32_t weights_ptr,
+ uint32_t scale_ptr, uint32_t shift_ptr,
+ uint32_t bias_ptr);
+/** neureka_task_set_strides
+ *
+ * All the stride variables are strides between consecutive elements along
+ * that dimension, expressed in bytes. There is no stride variable for the channel
+ * dimension because the N-EUREKA requires the channels to be contiguous.
+ */
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
+ const uint32_t w_in_stride,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride);
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out,
+ const uint8_t padding_bottom,
+ const uint8_t padding_right);
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value);
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left);
+/** neureka_task_set_dims
+ *
+ * All the stride variables are strides between consecutive elements along
+ * that dimension, expressed in bytes. There is no stride variable for the channel
+ * dimension because the N-EUREKA requires the channels to be contiguous.
+ */
+void neureka_task_set_dims(
+ neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_in_stride, const uint32_t w_in_stride,
+ const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
+ const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint8_t padding_right, const uint8_t padding_left);
+
+#endif // !__NEUREKA_TASK_H__
diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h
new file mode 100644
index 0000000..fa08289
--- /dev/null
+++ b/neureka/hal/neureka_task_defs.h
@@ -0,0 +1,124 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_DEFS_H__
+#define __NEUREKA_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6)
+#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6)
+#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32)
+
+#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8)
+#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8)
+#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28)
+
+#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6)
+#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6)
+#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32)
+
+#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32)
+#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32)
+
+/* TASK REGISTERS */
+
+// job configuration
+#define NEUREKA_REG_WEIGHTS_PTR 0
+#define NEUREKA_REG_INFEAT_PTR 1
+#define NEUREKA_REG_OUTFEAT_PTR 2
+#define NEUREKA_REG_SCALE_PTR 3
+#define NEUREKA_REG_SCALE_SHIFT_PTR 4
+#define NEUREKA_REG_SCALE_BIAS_PTR 5
+#define NEUREKA_REG_INFEAT_D0_STRIDE 6
+#define NEUREKA_REG_INFEAT_D1_STRIDE 7
+#define NEUREKA_REG_INFEAT_D2_STRIDE 8
+#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9
+#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10
+#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11
+#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12
+#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13
+#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14
+#define NEUREKA_REG_SUBTILE_REMAINDER_0 15
+#define NEUREKA_REG_SUBTILE_REMAINDER_1 16
+#define NEUREKA_REG_SUBTILE_REMAINDER_2 17
+#define NEUREKA_REG_SUBTILE_NUMBER_0 18
+#define NEUREKA_REG_SUBTILE_NUMBER_1 19
+#define NEUREKA_REG_PADDING 20
+#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21
+#define NEUREKA_REG_FILTER_MASKING 22
+#define NEUREKA_REG_CONF0 23
+
+/* SHIFT */
+
+#define NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26)
+#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
+#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
+#define NEUREKA_SHIFT_QUANT_SHIFT (16)
+
+/* CONF0 FLAGS */
+
+#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26)
+#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
+#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
+#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
+#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \
+ (1 << 15) // Unimplemented in gvsoc
+#define NEUREKA_FLAG_STREAMIN (1 << 14)
+#define NEUREKA_NORM_MODE_8BIT (0 << 12)
+#define NEUREKA_NORM_MODE_32BIT (2 << 12)
+#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
+#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9)
+#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9)
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested
+#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
+#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
+#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
+#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
+
+/* Masks */
+
+#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26)
+#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25)
+#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24)
+#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23)
+#define NEUREKA_MASK_QUANT_MODE (0x3 << 21)
+#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16)
+#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15)
+#define NEUREKA_MASK_NORM_MODE (0x3 << 12)
+#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10)
+#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9)
+#define NEUREKA_MASK_FLAG_MODE (0x3 << 5)
+#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0)
+
+/* PADDING */
+
+#define NEUREKA_DONT_PAD (0)
+#define NEUREKA_MAX_PAD (2)
+
+/* NORM */
+#define NEUREKA_NORM_MAX_LEN (32)
+
+#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h
deleted file mode 100644
index e8ecba5..0000000
--- a/neureka/inc/pulp_nnx_defs.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_DEFS_H__
-#define __NEUREKA_DEFS_H__
-
-/* ARHITECTURE */
-
-#define NEUREKA_FILTER_SIZE (6)
-#define NEUREKA_FILTER_BUFFER_SIZE (8)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
-#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_CONTEXT_SIZE (2)
-#define NEUREKA_WEIGHT_BANDWIDTH (256)
-
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
-
-/* REGISTER MAP */
-
-#define NEUREKA_EVT0 12
-#define NEUREKA_EVT1 13
-#define NEUREKA_BASE_ADDR 0x00201000
-#define WEIGHT_MEM_BASE 0x10400000
-#define SRAM_OFFSET 0x00400000
-#define MRAM_OFFSET 0x00000000
-
-// Cluster
-#define CLUSTER_CTRL_BASE_ADDR 0x00200000
-#define CLUSTER_CTRL_HWPE_OFFS 0x18
-#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800
-
-/* REGISTER OFFSETS */
-
-// commands
-#define NEUREKA_TRIGGER 0x00
-#define NEUREKA_ACQUIRE 0x04
-#define NEUREKA_FINISHED 0x08
-#define NEUREKA_STATUS 0x0C
-#define NEUREKA_RUNNING_JOB 0x10
-#define NEUREKA_SOFT_CLEAR 0x14
-#define NEUREKA_SWSYNC 0x18
-#define NEUREKA_URISCY_IMEM 0x1C
-
-// job configuration
-#define NEUREKA_REGISTER_OFFSET 0x20
-
-#define NEUREKA_REG_WEIGHTS_PTR 0x00
-#define NEUREKA_REG_INFEAT_PTR 0x04
-#define NEUREKA_REG_OUTFEAT_PTR 0x08
-#define NEUREKA_REG_SCALE_PTR 0x0C
-#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10
-#define NEUREKA_REG_SCALE_BIAS_PTR 0x14
-#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18
-#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C
-#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20
-#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24
-#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28
-#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C
-#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30
-#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34
-#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38
-#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C
-#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40
-#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44
-#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48
-#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C
-#define NEUREKA_REG_PADDING 0x50
-#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54
-#define NEUREKA_REG_FILTER_MASKING 0x58
-#define NEUREKA_REG_CONF0 0x5C
-
-// Simulation only
-#define NEUREKA_REG_GVSOC_TRACE 0x60
-
-/* SHIFT */
-
-#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
-#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
-#define NEUREKA_SHIFT_QUANT_SHIFT (16)
-#define NEUREKA_SHIFT_ROUNDING (11)
-
-/* CONF0 FLAGS */
-
-#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
-#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
-#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
-#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
-#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
-#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
-#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
-// conf0[20:16] - quantization shift amount
-#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
-#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
-#define NEUREKA_FLAG_STREAMIN (1 << 14)
-#define NEUREKA_NORM_MODE_8BIT (0 << 12)
-#define NEUREKA_NORM_MODE_16BIT (1 << 12)
-#define NEUREKA_NORM_MODE_32BIT (2 << 12)
-#define NEUREKA_FLAG_ROUND (1 << 11)
-#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
-#define NEUREKA_FLAG_USE_WMEM (1 << 9)
-#define NEUREKA_FLAG_USE_TCDM (0 << 9)
-#define NEUREKA_FLAG_STRIDED_MODE (1 << 8)
-#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
-#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
-#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
-#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
-#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
-#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
-#define NEUREKA_FLAG_MODE16 (1 << 3)
-
-/* Masks */
-
-#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
-#define NEUREKA_MASK_QUANT_MODE (3 << 21)
-
-/* Miscellaneous */
-
-// Padding
-#define MAX_PAD (0xf)
-
-// Normalization
-#define NEUREKA_NORM_MAX_LEN (32)
-#define NO_NORM(length) \
- { \
- .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \
- .length = length, .mode = normMode32Bit \
- }
-
-// Quantization
-#define NO_QUANT \
- { \
- .shift_amount = 0, .mode = quantMode32Bit, \
- .function = quantFunctionIdentity \
- }
-
-// GVSOC trace levels
-#define NEUREKA_TRACE_LEVEL_JOB_START_END 0
-#define NEUREKA_TRACE_LEVEL_CONFIG 1
-#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2
-#define NEUREKA_TRACE_LEVEL_ALL 3
-
-// null
-#define NEUREKA_NULL ((void *)0)
-#define NEUREKA_STATUS_FULL (0x101)
-
-#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h
deleted file mode 100644
index 40bcec0..0000000
--- a/neureka/inc/pulp_nnx_hal.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Luka Macan
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_H__
-#define __NEUREKA_H__
-
-#include <stdint.h>
-
-#include "pulp_nnx_defs.h"
-#include "pulp_nnx_error_codes.h"
-
-#define NEUREKA_CG_ENABLE() \
- *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \
- CLUSTER_CTRL_HWPE_CG_EN_MASK
-#define NEUREKA_CG_DISABLE() \
- *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \
- ~CLUSTER_CTRL_HWPE_CG_EN_MASK
-
-#define NEUREKA_WRITE(offset, value) \
- *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value)
-#define NEUREKA_WRITE_BE(offset, value, be) \
- *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value)
-#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset))
-
-#define NEUREKA_WRITE_IO_REG(offset, value) \
- NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value))
-#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \
- NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be))
-#define NEUREKA_READ_IO_REG(offset) \
- NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset))
-
-#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0)
-#define NEUREKA_BARRIER() \
- do { \
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \
- } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BUSYWAIT() \
- do { \
- } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BARRIER_ACQUIRE(job_id) \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- while (job_id < 0) { \
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- };
-#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- while (job_id < 0) { \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- };
-
-#define DIVNCEIL(A, B) (((A - 1) / B) + 1)
-#define REMAINDER(A, B) (((A - 1) % B) + 1)
-#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff))
-
-#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE
-
-#define FLAG_USED (1)
-#define FLAG_UNUSED (0)
-
-typedef enum {
- weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
- weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
-} nnx_weight_offset_mode_e;
-
-typedef struct {
- void *data;
- uint16_t height;
- uint16_t width;
- uint16_t depth;
- uint16_t n_weights;
- uint32_t bitwidth;
- int32_t offset_factor;
- nnx_weight_offset_mode_e offset_mode;
-} nnx_weights_t;
-
-typedef enum {
- featureBitwidth8Bit = 8,
- featureBitwidth16Bit = 16,
- featureBitwidth32Bit = 32
-} nnx_feature_bitwidth_e;
-
-typedef struct {
- void *data;
- uint16_t height;
- uint16_t width;
- uint16_t depth;
- nnx_feature_bitwidth_e bitwidth;
-} nnx_feature_t;
-
-typedef enum {
- normMode8Bit = NEUREKA_NORM_MODE_8BIT,
- normMode16Bit = NEUREKA_NORM_MODE_16BIT,
- normMode32Bit = NEUREKA_NORM_MODE_32BIT
-} nnx_norm_mode_e;
-
-typedef struct {
- nnx_norm_mode_e mode;
- int flag_bias;
- int flag_shift;
-} nnx_norm_t;
-
-typedef enum {
- quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
- quantMode16Bit = NEUREKA_QUANT_MODE_16BIT,
- quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
-} nnx_quant_mode_e;
-
-typedef enum {
- quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
- quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
-} nnx_quant_function_e;
-
-// TODO: add rounding to quant. Should also be an enum? Best boolean...
-typedef struct {
- // Shift amount must be in range 0x00-0x1F
- unsigned shift_amount;
- nnx_quant_mode_e mode;
- nnx_quant_function_e function;
- int flag_rounding;
-} nnx_quant_t;
-
-typedef struct {
- uint32_t d0;
- uint32_t d1;
- uint32_t d2;
-} nnx_stride_t;
-
-typedef struct {
- uint32_t KoKi;
- uint32_t HoWo;
- uint32_t HiWi;
-} nnx_subtile_remainder_t;
-
-typedef struct {
- uint32_t KoKi;
- uint32_t HoWo;
-} nnx_subtile_number_t;
-
-typedef struct {
- nnx_subtile_remainder_t remainder;
- nnx_subtile_number_t number;
-} nnx_subtile_t;
-
-typedef struct {
- nnx_stride_t input_stride;
- nnx_stride_t output_stride;
- nnx_stride_t weights_stride;
- nnx_subtile_t subtile;
- uint32_t padding;
- uint32_t weight_offset_factor;
- uint32_t filter_mask;
- uint32_t conf0;
-} nnx_cfg_t;
-
-typedef struct {
- uint32_t weights_ptr;
- uint32_t infeat_ptr;
- uint32_t outfeat_ptr;
- uint32_t scale_ptr;
- uint32_t scale_shift_ptr;
- uint32_t scale_bias_ptr;
- nnx_cfg_t cfg;
-} nnx_task_t;
-
-int nnx_job_id();
-int nnx_empty();
-int nnx_full();
-void nnx_soft_clear();
-int nnx_acquire();
-void nnx_offload(nnx_task_t *task);
-void nnx_offload_ptr(nnx_task_t *task);
-void nnx_run_async();
-void nnx_run_blocking();
-void nnx_commit();
-void nnx_wait_empty();
-void nnx_wait_not_full();
-void nnx_wait_on_id(int id);
-void nnx_busywait();
-
-void nnx_task_init(nnx_task_t *task);
-int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom,
- uint32_t left, uint16_t value);
-int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant);
-void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom,
- uint8_t left);
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-
-#endif /* __NEUREKA_H__ */
diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h
deleted file mode 100644
index f29ff3e..0000000
--- a/neureka/inc/pulp_nnx_util.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Luka Macan
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __PULP_NNX_UTIL__
-#define __PULP_NNX_UTIL__
-
-void nnx_activate_gvsoc_logging(int use_dec);
-void nnx_deactivate_gvsoc_logging();
-
-#endif /* __PULP_NNX_UTIL__ */
diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c
deleted file mode 100644
index 1d99691..0000000
--- a/neureka/src/pulp_nnx_hal.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "pulp_nnx_hal.h"
-#include "pmsis.h"
-
-static int qw, weight_d0_stride, outbytes;
-
-// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and
-// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise
-// the compiler is not able to correctly factorize the NEUREKA base in case
-// several accesses are done, ending up with twice more code
-
-// __builtin_pulp_OffsetedX not defined - needs further investigation... (too
-// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK...
-
-int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); }
-
-int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; }
-
-int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); }
-
-void nnx_soft_clear() {
- NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0);
- for (volatile int i = 0; i < 10; i++)
- ;
-}
-
-int nnx_acquire() {
- int job_id = -1;
- NEUREKA_BARRIER_ACQUIRE(job_id);
- return job_id;
-}
-
-void nnx_offload(nnx_task_t *task) {
- int *task_data = (int *)task;
- for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) {
- NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
- }
-}
-
-void nnx_offload_ptr(nnx_task_t *task) {
- int *task_data = (int *)task;
- for (int i = 0; i < 6; ++i) {
- NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
- }
-}
-
-void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); }
-
-void nnx_run_blocking() {
- nnx_run_async();
- nnx_wait_empty();
-}
-
-void nnx_commit() {
- NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger
-}
-
-void nnx_busywait() { NEUREKA_BUSYWAIT(); }
-
-void nnx_wait_empty() {
- while (!nnx_empty())
- NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_not_full() {
- while (nnx_full())
- NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_on_id(const int id) {
- while (nnx_job_id() <= id) {
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0);
- };
-}
-
-void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); }
-
-int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right,
- const uint32_t bottom, const uint32_t left,
- const uint16_t value) {
- uint32_t padding = 0;
- uint32_t flags = 0;
-
- if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) {
- return 1;
- }
-
- cfg->padding =
- (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value;
-
- return 0;
-}
-
-int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm,
- const nnx_quant_t quant) {
- if (quant.shift_amount > 31) {
- printf("ERROR! quant.shift_amount > 31\n");
- return 1;
- }
-
- if (quant.mode == quantMode16Bit) {
- printf("ERROR! quant.mode == quantMode16Bit\n");
- return 1;
- }
-
- BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
- (quant.shift_amount << 16) |
- quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
- norm.mode |
- norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
- norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT);
-
- return 0;
-}
-
-void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right,
- const uint8_t bottom, const uint8_t left) {
- cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) |
- ((uint32_t)bottom << 8) | ((uint32_t)left << 0);
-}
-
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho;
- const int rem_Wi = rem_Wo;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {
- .d0 = k_in,
- .d1 = k_in * w_out,
- .d2 = k_in * 3 * 3 // copying arpan
- };
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = weight_d0_stride * qw,
- .d1 = weight_d0_stride * qw * num_Ki,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height != output.height || input.width != output.width ||
- input.depth != weights.depth || output.depth != weights.n_weights) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho + 2;
- const int rem_Wi = rem_Wo + 2;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {.d0 = k_in,
- .d1 = k_in * (w_out + 2),
- .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE *
- NEUREKA_FILTER_BUFFER_SIZE};
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3,
- .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height - 2 != output.height || input.width - 2 != output.width ||
- input.depth != weights.depth || output.depth != weights.n_weights) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int num_Ki = num_Ko;
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int rem_Ki = rem_Ko;
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho + 2;
- const int rem_Wi = rem_Wo + 2;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {
- .d0 = k_out,
- .d1 = k_out * (w_out + 2),
- .d2 = 0 // Unused
- };
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride,
- .d1 = 0,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height - 2 != output.height || input.width - 2 != output.width ||
- input.depth != output.depth) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c
index 7ab0e99..f9799fc 100644
--- a/src/pulp_nnx_ne16.c
+++ b/src/pulp_nnx_ne16.c
@@ -79,25 +79,20 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
uint32_t size_j, uint32_t size_k,
uint32_t stride_j, uint32_t stride_k,
uint32_t overlap_i, uint32_t overlap_j,
- uint32_t offset_i, uint32_t offset_j,
- uint8_t data_size) {
- return ptr +
- (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k *
- data_size / 8 +
- (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8;
+ uint32_t offset_i, uint32_t offset_j) {
+ return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j +
+ (j * (size_j - overlap_j) - offset_j) * stride_k;
}
-void ne16_nnx_dispatch_stride2x2(
- ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
- const uint32_t w_in_stride, const uint32_t k_in_stride,
- const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t h_ker, const uint8_t w_ker) {
+void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
+ const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out, const uint8_t h_ker,
+ const uint8_t w_ker) {
const uint8_t stride = 2;
- const uint8_t bits = 8;
- const uint32_t n_h = divnceil(h_out, stride);
- const uint32_t n_w = divnceil(w_out, stride);
+ const uint32_t n_h = nnx_calculate_number_of_tiles(h_out, stride);
+ const uint32_t n_w = nnx_calculate_number_of_tiles(w_out, stride);
const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0;
const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0;
const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0;
@@ -109,15 +104,15 @@ void ne16_nnx_dispatch_stride2x2(
for (int i = 0; i < n_h; i++) {
for (int j = 0; j < n_w; j++) {
- task->data.infeat_ptr =
- _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
- w_in_stride, k_in_stride, h_ker - stride,
- w_ker - stride, i == 0 ? 0 : input_height_offset,
- j == 0 ? 0 : input_width_offset, bits);
- task->data.outfeat_ptr =
- _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride,
- k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset,
- j == 0 ? 0 : output_width_offset, bits);
+ task->data.infeat_ptr = _get_tile_ptr(
+ input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
+ task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0,
+ h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset,
+ j == 0 ? 0 : input_width_offset);
+ task->data.outfeat_ptr = _get_tile_ptr(
+ output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1,
+ task->data.cfg.output_stride.d1 << 1, 0, 0,
+ i == 0 ? 0 : output_height_offset, j == 0 ? 0 : output_width_offset);
task->data.cfg.padding =
ne16_get_tile_padding(tile_padding, i, j, n_h, n_w);
diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c
new file mode 100644
index 0000000..0abb845
--- /dev/null
+++ b/src/pulp_nnx_neureka.c
@@ -0,0 +1,76 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "pulp_nnx_neureka.h"
+#include "hwpe.h"
+#include "neureka.h"
+#include "pulp_nnx_util.h"
+#include <pmsis.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf) {
+ neureka_siracusa_open(conf);
+ hwpe_soft_clear(&dev->hwpe_dev);
+}
+
+void neureka_nnx_term(neureka_dev_t *dev) {
+ hwpe_soft_clear(&dev->hwpe_dev);
+ neureka_siracusa_close();
+}
+
+int neureka_nnx_dispatch_check(neureka_dev_t *dev) {
+ return !neureka_task_queue_full(dev);
+}
+
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev) {
+ while (!neureka_nnx_dispatch_check(dev)) {
+ neureka_siracusa_event_wait_and_clear();
+ }
+}
+
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) {
+ if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) {
+ return 1;
+ }
+ hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data,
+ (int)(sizeof(neureka_task_data_t) / 4));
+ hwpe_task_queue_release_and_run(&dev->hwpe_dev);
+ return 0;
+}
+
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) {
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ // GVSOC model has a broken running_id so resolve_check
+ // conservatively looks if the task queue is empty.
+ return neureka_task_queue_empty(dev);
+#else
+ uint8_t prev_task_id = task->id - 1;
+ return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id ||
+ (hwpe_last_task_id(&dev->hwpe_dev) == task->id &&
+ !neureka_task_queue_empty(dev)));
+#endif
+}
+
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) {
+ while (!neureka_nnx_resolve_check(dev, task)) {
+ neureka_siracusa_event_wait_and_clear();
+ }
+}
diff --git a/test/.isort.cfg b/test/.isort.cfg
new file mode 100644
index 0000000..127bf37
--- /dev/null
+++ b/test/.isort.cfg
@@ -0,0 +1,4 @@
+[settings]
+profile=black
+line_length=88
+skip_gitignore=true
diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py
index 5abb204..07dc597 100644
--- a/test/HeaderWriter.py
+++ b/test/HeaderWriter.py
@@ -48,8 +48,9 @@ def define(self, name, expr):
if isinstance(expr, str):
expr = f'"{expr}"'
elif isinstance(expr, bool):
- expr = int(expr)
- expr = f"({expr})"
+ expr = f"({int(expr)})"
+ else:
+ expr = f"({expr})"
return f"#define {name.upper()} {expr}\n"
def vector_size(self, data):
@@ -158,7 +159,7 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None):
if golden is not None:
render += self.render_vector(
- "golden_" + name, "PI_L1 " + _type, size, init=golden
+ "golden_" + name, "PI_L2 " + _type, size, init=golden
)
render += self.check(name)
diff --git a/test/Ne16.py b/test/Ne16.py
deleted file mode 100644
index 6de5ab5..0000000
--- a/test/Ne16.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Luka Macan
-#
-# Copyright 2023 ETH Zurich and University of Bologna
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import numpy.typing as npt
-from TestClasses import IntegerType
-
-
-class Ne16:
- ACCUMULATOR_TYPE = IntegerType(name="int32")
-
- _CIN_SUBTILE = 16
-
- @staticmethod
- def weight_unroll(
- weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
- ) -> npt.NDArray[np.uint8]:
- """Unroll weight into expected memory format
-
- Expected weight shape is (Cout, Cin, H, W).
- The output shape is: (Cout, Cin_major, Bits, H x W, Cin_minor_bytes),
- where Cin_major is the ceil(Cin / CIN_SUBTILE) and Cin_minor has to be padded with 0 to CIN_SUBTILE.
- """
- if depthwise:
- weight = weight.transpose(1, 0, 2, 3) # Swap Cout and Cin
-
- Cout, Cin, H, W = weight.shape
-
- # Pad Cin to be divisible with CIN_SUBTILE
- if Cin % Ne16._CIN_SUBTILE != 0:
- Cin_pad = Ne16._CIN_SUBTILE - Cin % Ne16._CIN_SUBTILE
- weight = np.pad(
- weight,
- ((0, 0), (0, Cin_pad), (0, 0), (0, 0)),
- "constant",
- constant_values=0,
- )
-
- # Reshape into (Cout, Cin_major, Cin_minor, Flattened spatial, 1)
- # The 1 at the end is required by the unpacking
- Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE))
- Cin_minor = Ne16._CIN_SUBTILE
- weight = weight.reshape(Cout, Cin_major, Cin_minor, H * W, 1)
-
- # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
- # (Cout, Cin_major, Cin_minor, Flattened spatial, Bits)
- weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
-
- # Shuffle bits so that the final shape is:
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor)
- weight = weight.transpose(0, 1, 4, 3, 2)
-
- # Prepare for packing
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes, 8)
- Cin_minor_bytes = int(np.ceil(Cin_minor / 8))
- weight = np.stack(np.split(weight, Cin_minor_bytes, axis=-1), axis=-2)
-
- # Pack
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes)
- weight = np.packbits(weight, axis=-1, bitorder="little")
-
- return weight.flatten()
-
- @staticmethod
- def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: int):
- """Reverse of weight_roll"""
- Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE))
- Cin_minor = Ne16._CIN_SUBTILE
- Cin_minor_bytes = int(np.ceil(Cin_minor / 8))
-
- weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1)
- weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
- weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor)
- weight = weight.transpose(0, 1, 4, 3, 2)
- weight = np.packbits(weight, axis=-1, bitorder="little")
- weight = weight.reshape(Cout, Cin_major * Cin_minor, H, W)
- weight = weight[:, :Cin, :, :]
-
- return weight
diff --git a/test/Ne16MemoryLayout.py b/test/Ne16MemoryLayout.py
new file mode 100644
index 0000000..30729ab
--- /dev/null
+++ b/test/Ne16MemoryLayout.py
@@ -0,0 +1,99 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+
+
+class Ne16MemoryLayout:
+ _CIN_SUBTILE = 16
+
+ @staticmethod
+ def weightEncode(
+ weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+ ) -> npt.NDArray[np.uint8]:
+ """Unroll weight into expected memory format
+
+ Expected weight shape is (cout, cin, height, width).
+ The output shape is: (cout, cinMajor, Bits, height x width, cinMinorBytes),
+ where cinMajor is the ceil(cin / CIN_SUBTILE) and cinMinor has to be padded with 0 to CIN_SUBTILE.
+ """
+ if depthwise:
+ weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin
+
+ cout, cin, height, width = weight.shape
+
+ # Pad cin to be divisible with CIN_SUBTILE
+ if cin % Ne16MemoryLayout._CIN_SUBTILE != 0:
+ cinPad = Ne16MemoryLayout._CIN_SUBTILE - cin % Ne16MemoryLayout._CIN_SUBTILE
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, cinPad), (0, 0), (0, 0)),
+ "constant",
+ constant_values=0,
+ )
+ cin = cin + cinPad
+
+ # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1)
+ # The 1 at the end is required by the unpacking
+ cinMajor = cin // Ne16MemoryLayout._CIN_SUBTILE
+ cinMinor = Ne16MemoryLayout._CIN_SUBTILE
+ weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1)
+
+ # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
+ # (cout, cinMajor, cinMinor, flattened spatial, Bits)
+ weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
+
+ # Shuffle bits so that the final shape is:
+ # (cout, cinMajor, Bits, flattened spatial, cinMinor)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+
+ # Prepare for packing
+ # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes, 8)
+ cinMinorBytes = int(np.ceil(cinMinor / 8))
+ weight = np.stack(np.split(weight, cinMinorBytes, axis=-1), axis=-2)
+
+ # Pack
+ # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+
+ return weight.flatten()
+
+ @staticmethod
+ def weightDecode(
+ weight: npt.NDArray[np.uint8],
+ bits: int,
+ cout: int,
+ cin: int,
+ height: int,
+ width: int,
+ ) -> npt.NDArray[np.uint8]:
+ """Reverse of weightEncode"""
+ cinMajor = int(np.ceil(cin / Ne16MemoryLayout._CIN_SUBTILE))
+ cinMinor = Ne16MemoryLayout._CIN_SUBTILE
+ cinMinorBytes = int(np.ceil(cinMinor / 8))
+
+ weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1)
+ weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
+ weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+ weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
+ weight = weight[:, :cin, :, :]
+
+ return weight
diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py
new file mode 100644
index 0000000..f2e66ad
--- /dev/null
+++ b/test/Ne16TestConf.py
@@ -0,0 +1,111 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import List, Optional, Union
+
+from pydantic import field_validator, model_validator
+
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from NnxTestClasses import NnxTestConf
+from TestClasses import IntegerType, KernelShape, Stride, implies
+
+
+class Ne16TestConf(NnxTestConf):
+ @field_validator("kernel_shape")
+ @classmethod
+ def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
+ assert v == KernelShape(height=1, width=1) or v == KernelShape(
+ height=3, width=3
+ ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
+ return v
+
+ @field_validator("stride")
+ @classmethod
+ def check_valid_stride(cls, v: Stride) -> Stride:
+ assert v == Stride(height=1, width=1) or v == Stride(
+ height=2, width=2
+ ), f"Unsupported stride {v}. Supported 1x1 and 2x2."
+ return v
+
+ @staticmethod
+ def _check_type(
+ name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
+ ) -> None:
+ assert (
+ _type in allowed_types
+ ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
+
+ @field_validator("in_type")
+ @classmethod
+ def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("in_type", v, ["uint8"])
+ return v
+
+ @field_validator("out_type")
+ @classmethod
+ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("out_type", v, ["uint8", "int8", "int32"])
+ return v
+
+ @field_validator("weight_type")
+ @classmethod
+ def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("weight_type", v, ["int8"])
+ return v
+
+ @field_validator("scale_type")
+ @classmethod
+ def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"])
+ return v
+
+ @field_validator("bias_type")
+ @classmethod
+ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ Ne16TestConf._check_type("bias_type", v, ["int32"])
+ return v
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_channel_stride_with_stride_2x2(self) -> Ne16TestConf:
+ assert implies(
+ self.stride == Stride(height=2, width=2),
+ self.out_channel * (self.out_type._bits // 8) % 2 == 0,
+ ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf:
+ assert implies(
+ self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
+ ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf:
+ assert implies(
+ not self.has_norm_quant,
+ self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
+ ), (
+ f"Without quantization, the output type has to be equal to the "
+ f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ )
+ return self
diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py
new file mode 100644
index 0000000..08b3601
--- /dev/null
+++ b/test/NeuralEngineFunctionalModel.py
@@ -0,0 +1,123 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from TestClasses import IntegerType, Padding, Stride
+
+
+class NeuralEngineFunctionalModel:
+ ACCUMULATOR_TYPE = IntegerType(name="int32")
+
+ @staticmethod
+ def _cast(
+ tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
+ ) -> torch.Tensor:
+ if saturate:
+ return tensor.clamp(_type.min, _type.max)
+ else:
+ return tensor & ((1 << _type._bits) - 1)
+
+ def _norm_quant(
+ self,
+ tensor: torch.Tensor,
+ scale: torch.Tensor,
+ bias: Optional[torch.Tensor],
+ global_shift: torch.Tensor,
+ out_type: IntegerType,
+ bias_type: Optional[IntegerType],
+ has_bias: bool,
+ has_relu: bool,
+ ) -> torch.Tensor:
+ # Scale accumulators are in 48bit, so keeping the data in 64bit
+ tensor = tensor * scale
+ assert tensor.dtype == torch.int64
+
+ if has_bias:
+ assert bias is not None
+ assert bias_type is not None
+ # Saturating cast to int32
+ tensor = NeuralEngineFunctionalModel._cast(
+ tensor, bias_type, saturate=True
+ ).type(torch.int32)
+
+ tensor = tensor + bias
+ tensor = NeuralEngineFunctionalModel._cast(
+ tensor, bias_type, saturate=False
+ ).type(torch.int32)
+
+ if has_relu:
+ tensor = F.relu(tensor)
+
+ tensor = tensor >> global_shift
+
+ # Saturate into out_type
+ tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
+
+ return tensor
+
+ def convolution(
+ self,
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ scale: Optional[torch.Tensor],
+ bias: Optional[torch.Tensor],
+ global_shift: Optional[torch.Tensor],
+ padding: Padding,
+ stride: Stride,
+ depthwise: bool,
+ out_type: IntegerType,
+ bias_type: Optional[IntegerType],
+ has_norm_quant: bool,
+ has_bias: bool,
+ has_relu: bool,
+ verbose: bool = False,
+ **kwargs,
+ ) -> torch.Tensor:
+ _ = kwargs
+
+ input_padded = F.pad(
+ input,
+ (
+ padding.left,
+ padding.right,
+ padding.top,
+ padding.bottom,
+ ),
+ "constant",
+ 0,
+ )
+
+ # Accumulators are 32bit non-saturating.
+ # Calculate in higher precision (int64)
+ output = F.conv2d(
+ input=input_padded,
+ weight=weight,
+ stride=(stride.height, stride.width),
+ groups=weight.shape[0] if depthwise else 1,
+ ).type(torch.int64)
+
+ # Cast to accumulator type
+ output = NeuralEngineFunctionalModel._cast(
+ output, NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, saturate=False
+ ).type(torch.int32)
+
+ if verbose:
+ print("INTERMEDIATE RESULTS (pre-normalization/requant):")
+ print(output)
+
+ if has_norm_quant:
+ assert scale is not None
+ assert global_shift is not None
+ output = self._norm_quant(
+ output,
+ scale,
+ bias,
+ global_shift,
+ out_type,
+ bias_type,
+ has_bias,
+ has_relu,
+ )
+
+ return output
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py
new file mode 100644
index 0000000..80a2786
--- /dev/null
+++ b/test/NeurekaMemoryLayout.py
@@ -0,0 +1,158 @@
+# Luka Macan
+# Arpan Suravi Prasad
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+
+from TestClasses import IntegerType
+
+
+class NeurekaMemoryLayout:
+ _WEIGHT_BANDWIDTH = 256
+ _CIN_SUBTILE_1x1 = 32
+ _CIN_SUBTILE_3x3 = 28
+
+ @staticmethod
+ def weightEncode(
+ weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+ ) -> npt.NDArray[np.uint8]:
+ """Unroll weight into expected memory format
+
+ Expected weight shape is (cout, cin, H, W).
+ The produced memory layout depends on the weight kernel shape:
+ - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
+ - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
+ where cinMajor is ceil(cin / cinSubtile) and cinMinor has to be zero-padded up to cinSubtile.
+ """
+ if depthwise:
+ weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin
+
+ cout, cin, height, width = weight.shape
+ cinSubtile = (
+ NeurekaMemoryLayout._CIN_SUBTILE_3x3
+ if height == 3
+ else NeurekaMemoryLayout._CIN_SUBTILE_1x1
+ )
+
+ # Pad cin to be divisible with CIN_SUBTILE
+ if cin % cinSubtile != 0:
+ cinPad = cinSubtile - cin % cinSubtile
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, cinPad), (0, 0), (0, 0)),
+ "constant",
+ constant_values=0,
+ )
+
+ # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
+ # The 1 at the end is required by the unpacking
+ cinMajor = int(np.ceil(cin / cinSubtile))
+ weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)
+
+ # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
+ # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
+ weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
+
+ # Shuffle bits so that the final shape is:
+ # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+
+ # Pack dimensions to fit into weight bandwidth
+ if height == 3 and width == 3:
+ # (cout * cinMajor * Bits, H * W * cinSubtile)
+ weight = weight.reshape(-1, height * width * cinSubtile)
+ # Pad only the last dimension to weight bandwidth size
+ # (-1, Weight Bandwidth)
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])),
+ "constant",
+ constant_values=0,
+ )
+ elif height == 1 and width == 1:
+ # Tile cinSubtile into tiles of size 4
+ # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+ weight = weight.reshape(
+ cout, cinMajor, bits, height * width, cinSubtile // 4, 4
+ ) # cout, cinMajor, bits, 1, 8, 4
+ # Pad bits to 8
+ if bits < 8:
+ # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
+ mode="constant",
+ constant_values=0,
+ )
+ # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
+ weight = weight.transpose(0, 1, 3, 4, 2, 5)
+ # (-1, Weight Bandwidth)
+ weight = weight.reshape(
+ cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH
+ ) # cout*cinMajor, 256b
+
+ # Prepare for packing
+ # (-1, Weight Bandwidth Bytes, 8)
+ weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
+ weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)
+
+ # Pack bits
+ # (-1, Weight Bandwidth Bytes)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+
+ return weight.flatten()
+
+ @staticmethod
+ def weightDecode(
+ weight: npt.NDArray[np.uint8],
+ bits: int,
+ cout: int,
+ cin: int,
+ height: int,
+ width: int,
+ ) -> npt.NDArray[np.uint8]:
+ """Reverse of weightEncode"""
+ cinSubtile = (
+ NeurekaMemoryLayout._CIN_SUBTILE_3x3
+ if height == 3
+ else NeurekaMemoryLayout._CIN_SUBTILE_1x1
+ )
+ cinMajor = int(np.ceil(cin / cinSubtile))
+ cinMinor = cinSubtile
+ weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
+
+ weight = weight.reshape(-1, weightBandwidthBytes, 1)
+ weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
+ weight = weight.reshape(-1, NeurekaMemoryLayout._WEIGHT_BANDWIDTH)
+
+ if height == 3 and width == 3:
+ weight = weight[:, : height * width * cinMinor]
+ weight = weight.reshape(
+ cout, cinMajor, bits, height * width, cinMinor
+ ).transpose(0, 1, 4, 3, 2)
+ elif height == 1 and width == 1:
+ weight = weight[:, : height * width * cinMinor * 8]
+ weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
+ 0, 1, 2, 4, 3
+ )
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+ weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
+ weight = weight[:, :cin, :, :]
+
+ return weight
diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py
new file mode 100644
index 0000000..f878e68
--- /dev/null
+++ b/test/NeurekaTestConf.py
@@ -0,0 +1,101 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import List, Optional, Union
+
+from pydantic import field_validator, model_validator
+
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from NnxTestClasses import NnxTestConf
+from TestClasses import IntegerType, KernelShape, Stride, implies
+
+
+class NeurekaTestConf(NnxTestConf):
+ @field_validator("kernel_shape")
+ @classmethod
+ def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
+ assert v == KernelShape(height=1, width=1) or v == KernelShape(
+ height=3, width=3
+ ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
+ return v
+
+ @field_validator("stride")
+ @classmethod
+ def check_valid_stride(cls, v: Stride) -> Stride:
+ assert v == Stride(height=1, width=1), f"Unsupported stride {v}. Supported 1x1."
+ return v
+
+ @staticmethod
+ def _check_type(
+ name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
+ ) -> None:
+ assert (
+ _type in allowed_types
+ ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
+
+ @field_validator("in_type")
+ @classmethod
+ def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("in_type", v, ["uint8", "int8"])
+ return v
+
+ @field_validator("out_type")
+ @classmethod
+ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"])
+ return v
+
+ @field_validator("weight_type")
+ @classmethod
+ def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("weight_type", v, ["int8"])
+ return v
+
+ @field_validator("scale_type")
+ @classmethod
+ def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ NeurekaTestConf._check_type("scale_type", v, ["uint8", "uint32"])
+ return v
+
+ @field_validator("bias_type")
+ @classmethod
+ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ NeurekaTestConf._check_type("bias_type", v, ["int32"])
+ return v
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf:
+ assert implies(
+ self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
+ ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf:
+ assert implies(
+ not self.has_norm_quant,
+ self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
+ ), (
+ f"Without quantization, the output type has to be equal to the "
+ f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ )
+ return self
diff --git a/test/Ne16TestClasses.py b/test/NnxTestClasses.py
similarity index 53%
rename from test/Ne16TestClasses.py
rename to test/NnxTestClasses.py
index d99e829..a7aaa00 100644
--- a/test/Ne16TestClasses.py
+++ b/test/NnxTestClasses.py
@@ -17,18 +17,21 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
-from typing import List, Union, Optional, Set, Tuple
-import torch
-import numpy as np
-import torch.nn.functional as F
+
import os
-from Ne16 import Ne16
+from typing import Callable, Optional, Set, Tuple, Type, Union
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from pydantic import BaseModel, PositiveInt, field_validator, model_validator
+
from HeaderWriter import HeaderWriter
-from TestClasses import implies, KernelShape, Padding, Stride, IntegerType
-from pydantic import BaseModel, field_validator, model_validator, PositiveInt
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from TestClasses import IntegerType, KernelShape, Padding, Stride, implies
-class Ne16TestConf(BaseModel):
+class NnxTestConf(BaseModel):
in_height: PositiveInt
in_width: PositiveInt
in_channel: PositiveInt
@@ -46,74 +49,8 @@ class Ne16TestConf(BaseModel):
has_bias: bool
has_relu: bool
- @field_validator("kernel_shape")
- @classmethod
- def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
- assert v == KernelShape(height=1, width=1) or v == KernelShape(
- height=3, width=3
- ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
- return v
-
- @field_validator("stride")
- @classmethod
- def check_valid_stride(cls, v: Stride) -> Stride:
- assert v == Stride(height=1, width=1) or v == Stride(
- height=2, width=2
- ), f"Unsupported stride {v}. Supported 1x1 and 2x2."
- return v
-
- @staticmethod
- def _check_type(
- name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
- ) -> None:
- assert (
- _type in allowed_types
- ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
-
- @field_validator("in_type")
- @classmethod
- def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("in_type", v, ["uint8"])
- return v
-
- @field_validator("out_type")
- @classmethod
- def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("out_type", v, ["uint8", "int8"])
- return v
-
- @field_validator("weight_type")
- @classmethod
- def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("weight_type", v, ["int8"])
- return v
-
- @field_validator("scale_type")
- @classmethod
- def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
- if v is not None:
- Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"])
- return v
-
- @field_validator("bias_type")
- @classmethod
- def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
- if v is not None:
- Ne16TestConf._check_type("bias_type", v, ["int32"])
- return v
-
@model_validator(mode="after") # type: ignore
- def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf:
- assert implies(
- self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0
- ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
- return self
-
- @model_validator(mode="after") # type: ignore
- def check_valid_depthwise(self) -> Ne16TestConf:
- assert implies(
- self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
- ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ def check_valid_depthwise_channels(self) -> NnxTestConf:
assert implies(self.depthwise, self.in_channel == self.out_channel), (
f"Input and output channel should be the same in a depthwise layer. "
f"input channel: {self.in_channel}, output channel: {self.out_channel}"
@@ -121,21 +58,15 @@ def check_valid_depthwise(self) -> Ne16TestConf:
return self
@model_validator(mode="after") # type: ignore
- def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf:
+ def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf:
assert implies(
self.kernel_shape == KernelShape(height=1, width=1),
self.padding == Padding(top=0, bottom=0, left=0, right=0),
), f"No padding on 1x1 kernel. Given padding {self.padding}"
return self
- @field_validator("has_norm_quant")
- @classmethod
- def check_valid_has_norm_quant(cls, v: bool) -> bool:
- assert v == True, f"Untested without has_norm_quant."
- return v
-
@model_validator(mode="after") # type: ignore
- def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf:
+ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf:
if self.has_norm_quant:
assert self.scale_type is not None, "Scale type was not provided."
if self.has_bias:
@@ -143,25 +74,31 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf:
return self
@model_validator(mode="after") # type: ignore
- def check_valid_out_type_with_flags(self) -> Ne16TestConf:
- assert implies(
- not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE
- ), (
- f"Without quantization, the output type has to be equal to the "
- f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ def check_has_relu_with_norm_quant(self) -> NnxTestConf:
+ assert implies(self.has_relu, self.has_norm_quant), (
+ f"Relu flag can only be enabled when norm_quant is enabled. "
+ f"Given has_relu {self.has_relu} and has_norm_quant {self.has_norm_quant}"
)
- assert implies(
- self.has_norm_quant,
- (self.has_relu and not self.out_type._signed)
- or (not self.has_relu and self.out_type._signed),
- ), (
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_has_bias_with_norm_quant(self) -> NnxTestConf:
+ assert implies(self.has_bias, self.has_norm_quant), (
+ f"Bias flag can only be enabled when norm_quant is enabled. "
+ f"Given has_bias {self.has_bias} and has_norm_quant {self.has_norm_quant}"
+ )
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_relu(self) -> NnxTestConf:
+ assert self.has_relu ^ self.out_type._signed, (
f"Output type has to be unsigned when there is relu, otherwise signed. "
f"Given output type {self.out_type} and has_relu {self.has_relu}"
)
return self
-class Ne16Test:
+class NnxTest:
_CONF_NAME = "conf.json"
_INPUT_NAME = "input.pt"
_OUTPUT_NAME = "output.pt"
@@ -172,7 +109,7 @@ class Ne16Test:
def __init__(
self,
- conf: Ne16TestConf,
+ conf: NnxTestConf,
input: Optional[torch.Tensor],
output: Optional[torch.Tensor],
weight: Optional[torch.Tensor],
@@ -188,7 +125,7 @@ def __init__(
self.bias = bias
self.global_shift = global_shift
- def is_valid(self):
+ def is_valid(self) -> bool:
return all(
[
self.input is not None,
@@ -203,22 +140,22 @@ def is_valid(self):
def save_conf(self, path: Union[str, os.PathLike]) -> None:
os.makedirs(path, exist_ok=True)
- with open(os.path.join(path, Ne16Test._CONF_NAME), "w") as fp:
+ with open(os.path.join(path, NnxTest._CONF_NAME), "w") as fp:
fp.write(self.conf.model_dump_json(indent=4))
def save_data(self, path: Union[str, os.PathLike]) -> None:
os.makedirs(path, exist_ok=True)
- torch.save(self.input, os.path.join(path, Ne16Test._INPUT_NAME))
- torch.save(self.output, os.path.join(path, Ne16Test._OUTPUT_NAME))
- torch.save(self.weight, os.path.join(path, Ne16Test._WEIGHT_NAME))
+ torch.save(self.input, os.path.join(path, NnxTest._INPUT_NAME))
+ torch.save(self.output, os.path.join(path, NnxTest._OUTPUT_NAME))
+ torch.save(self.weight, os.path.join(path, NnxTest._WEIGHT_NAME))
if self.scale is not None:
- torch.save(self.scale, os.path.join(path, Ne16Test._SCALE_NAME))
+ torch.save(self.scale, os.path.join(path, NnxTest._SCALE_NAME))
if self.bias is not None:
- torch.save(self.bias, os.path.join(path, Ne16Test._BIAS_NAME))
+ torch.save(self.bias, os.path.join(path, NnxTest._BIAS_NAME))
if self.global_shift is not None:
torch.save(
- self.global_shift, os.path.join(path, Ne16Test._GLOBAL_SHIFT_NAME)
+ self.global_shift, os.path.join(path, NnxTest._GLOBAL_SHIFT_NAME)
)
def save(self, path: Union[str, os.PathLike]) -> None:
@@ -228,154 +165,111 @@ def save(self, path: Union[str, os.PathLike]) -> None:
@staticmethod
def is_test_dir(path: Union[str, os.PathLike]) -> bool:
fileset = set(os.listdir(path))
- required_fileset = set([Ne16Test._CONF_NAME])
+ required_fileset = set([NnxTest._CONF_NAME])
return required_fileset.issubset(fileset)
@classmethod
- def load(cls, path: Union[str, os.PathLike]) -> "Ne16Test":
- assert Ne16Test.is_test_dir(
+ def load(cls, confCls: Type[NnxTestConf], path: Union[str, os.PathLike]) -> NnxTest:
+ assert NnxTest.is_test_dir(
path
), f"ERROR: Test {path} does not contain the necessary files."
- with open(os.path.join(path, Ne16Test._CONF_NAME), "r") as fp:
- conf = Ne16TestConf.model_validate_json(fp.read())
+ with open(os.path.join(path, NnxTest._CONF_NAME), "r") as fp:
+ conf = confCls.model_validate_json(fp.read())
def load_if_exist(filename: str) -> Optional[torch.Tensor]:
filepath = os.path.join(path, filename)
return torch.load(filepath) if os.path.isfile(filepath) else None
- input = load_if_exist(Ne16Test._INPUT_NAME)
- output = load_if_exist(Ne16Test._OUTPUT_NAME)
- weight = load_if_exist(Ne16Test._WEIGHT_NAME)
- scale = load_if_exist(Ne16Test._SCALE_NAME)
- bias = load_if_exist(Ne16Test._BIAS_NAME)
- global_shift = load_if_exist(Ne16Test._GLOBAL_SHIFT_NAME)
+ input = load_if_exist(NnxTest._INPUT_NAME)
+ output = load_if_exist(NnxTest._OUTPUT_NAME)
+ weight = load_if_exist(NnxTest._WEIGHT_NAME)
+ scale = load_if_exist(NnxTest._SCALE_NAME)
+ bias = load_if_exist(NnxTest._BIAS_NAME)
+ global_shift = load_if_exist(NnxTest._GLOBAL_SHIFT_NAME)
return cls(conf, input, output, weight, scale, bias, global_shift)
-class Ne16TestGenerator:
+class NnxTestGenerator:
_DEFAULT_SEED = 0
@staticmethod
- def _global_shift(
- tensor: torch.Tensor, out_type: IntegerType, has_relu: bool
+ def _calculate_global_shift(
+ tensor: torch.Tensor, out_type: IntegerType
) -> torch.Tensor:
- if has_relu:
- # only adjust positive values
- tensor = tensor[tensor > 0]
-
+ """Calculate global shift so that the output values are in the range of out_type"""
s = tensor.type(torch.float64).std()
target_s = 2 ** (out_type._bits - 1)
- global_shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32)
-
- return global_shift
+ return torch.ceil(torch.log2(s / target_s)).type(torch.int32)
@staticmethod
- def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]):
+ def _random_data(_type: IntegerType, shape: Tuple):
return torch.randint(_type.min, _type.max, size=shape)
- @staticmethod
- def _cast(
- tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
- ) -> torch.Tensor:
- if saturate:
- return tensor.clamp(_type.min, _type.max)
- else:
- return tensor & ((1 << _type._bits) - 1)
-
@staticmethod
def from_conf(
- conf: Ne16TestConf,
+ conf: NnxTestConf,
input: Optional[torch.Tensor] = None,
weight: Optional[torch.Tensor] = None,
scale: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
global_shift: Optional[torch.Tensor] = None,
- ) -> Ne16Test:
- torch.manual_seed(Ne16TestGenerator._DEFAULT_SEED)
+ verbose: bool = False,
+ ) -> NnxTest:
+ torch.manual_seed(NnxTestGenerator._DEFAULT_SEED)
+
+ input_shape = (1, conf.in_channel, conf.in_height, conf.in_width)
+ weight_shape = (
+ conf.out_channel,
+ 1 if conf.depthwise else conf.in_channel,
+ conf.kernel_shape.height,
+ conf.kernel_shape.width,
+ )
+ scale_shape = (1, conf.out_channel, 1, 1)
+ bias_shape = (1, conf.out_channel, 1, 1)
if input is None:
- input = Ne16TestGenerator._random_data(
+ input = NnxTestGenerator._random_data(
_type=conf.in_type,
- shape=(1, conf.in_channel, conf.in_height, conf.in_width),
+ shape=input_shape,
)
- input_padded = F.pad(
- input,
- (
- conf.padding.left,
- conf.padding.right,
- conf.padding.top,
- conf.padding.bottom,
- ),
- "constant",
- 0,
- )
-
if weight is None:
- weight = Ne16TestGenerator._random_data(
+ weight = NnxTestGenerator._random_data(
_type=conf.weight_type,
- shape=(
- conf.out_channel,
- 1 if conf.depthwise else conf.in_channel,
- conf.kernel_shape.height,
- conf.kernel_shape.width,
- ),
+ shape=weight_shape,
)
- # Accumulators are 32bit non-saturating.
- # Calculate in higher precision (int64)
- output = F.conv2d(
- input=input_padded,
- weight=weight,
- stride=(conf.stride.height, conf.stride.width),
- groups=conf.in_channel if conf.depthwise else 1,
- ).type(torch.int64)
- # Use only the lower 32bits
- output = Ne16TestGenerator._cast(
- output, Ne16.ACCUMULATOR_TYPE, saturate=False
- ).type(torch.int32)
-
if conf.has_norm_quant:
if scale is None:
assert conf.scale_type is not None
- scale = Ne16TestGenerator._random_data(
- conf.scale_type, shape=(1, conf.out_channel, 1, 1)
+ scale = NnxTestGenerator._random_data(
+ conf.scale_type, shape=scale_shape
)
- # Scale accumulators are in 48bit, so keeping the data in 64bit
- output = scale * output
- assert output.dtype == torch.int64
-
- if conf.has_bias:
- # Saturating cast to int32
+ if conf.has_bias and bias is None:
assert conf.bias_type is not None
- output = Ne16TestGenerator._cast(
- output, conf.bias_type, saturate=True
- ).type(torch.int32)
-
- if bias is None:
- bias = Ne16TestGenerator._random_data(
- conf.bias_type, shape=(1, conf.out_channel, 1, 1)
- ).type(torch.int32)
- output = output + bias
- output = Ne16TestGenerator._cast(
- output, conf.bias_type, saturate=False
+ bias = NnxTestGenerator._random_data(
+ conf.bias_type, shape=bias_shape
).type(torch.int32)
-
- if conf.has_relu:
- output = F.relu(output)
-
if global_shift is None:
- global_shift = Ne16TestGenerator._global_shift(
- output, conf.out_type, conf.has_relu
+ global_shift = torch.Tensor([0]).type(torch.int32)
+ output = NeuralEngineFunctionalModel().convolution(
+ input,
+ weight,
+ scale,
+ bias,
+ global_shift,
+ verbose=verbose,
+ **conf.__dict__,
)
- output = output >> global_shift
+ NnxTestGenerator._calculate_global_shift(output, conf.out_type)
- # Saturate into out_type
- output = Ne16TestGenerator._cast(output, conf.out_type, saturate=True)
+ output = NeuralEngineFunctionalModel().convolution(
+ input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__
+ )
- return Ne16Test(
+ return NnxTest(
conf=conf,
input=input,
output=output,
@@ -386,28 +280,38 @@ def from_conf(
)
@staticmethod
- def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test:
+ def regenerate(test: NnxTest, regen_tensors: Set[str]) -> NnxTest:
test_tensors = set(["input", "output", "weight", "scale", "bias"])
load_tensors = test_tensors - regen_tensors
kwargs = {tensor: getattr(test, tensor) for tensor in load_tensors}
- return Ne16TestGenerator.from_conf(test.conf, **kwargs)
+ return NnxTestGenerator.from_conf(test.conf, **kwargs)
-class Ne16TestHeaderGenerator:
+class NnxTestHeaderGenerator:
DEFAULT_HEADERS_DIR = "app/gen"
- def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None):
+ def __init__(
+ self,
+ weightEncode: Callable[
+ [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8]
+ ],
+ headers_dir: Optional[Union[str, os.PathLike]] = None,
+ ):
if headers_dir is None:
- headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR
+ headers_dir = NnxTestHeaderGenerator.DEFAULT_HEADERS_DIR
self.header_writer = HeaderWriter(headers_dir)
+ # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag,
+ # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator
+ self.weightEncode = weightEncode
- def generate(self, test_name: str, test: Ne16Test):
+ def generate(self, test_name: str, test: NnxTest):
assert test.input is not None and test.output is not None
_, in_channel, in_height, in_width = test.input.shape
_, out_channel, out_height, out_width = test.output.shape
# Render input
in_ctype = test.conf.in_type.ctype()
+ in_signed = test.conf.in_type._signed
in_data = test.input.permute(0, 2, 3, 1).ravel()
self.header_writer.generate_vector_files(
"input", _type=in_ctype, size=in_data.numel(), init=in_data
@@ -431,10 +335,10 @@ def generate(self, test_name: str, test: Ne16Test):
weight_offset = -(2 ** (weight_bits - 1))
weight_out_ch, weight_in_ch, weight_ks_h, weight_ks_w = test.weight.shape
weight_data: np.ndarray = test.weight.numpy() - weight_offset
- weight_init = Ne16.weight_unroll(
+ weight_init = self.weightEncode(
weight_data.astype(np.uint8),
weight_type._bits,
- depthwise=test.conf.depthwise,
+ test.conf.depthwise,
)
self.header_writer.generate_vector_files(
"weight", _type="uint8_t", size=weight_init.size, init=weight_init
@@ -470,13 +374,14 @@ def generate(self, test_name: str, test: Ne16Test):
"height": in_height,
"width": in_width,
"channel": in_channel,
- "bits": 8,
+ "signed": in_signed,
+ "bits": test.conf.in_type._bits,
},
"output": {
"height": out_height,
"width": out_width,
"channel": out_channel,
- "bits": 8,
+ "bits": test.conf.out_type._bits,
},
"weight": {
"height": weight_ks_h,
@@ -486,8 +391,16 @@ def generate(self, test_name: str, test: Ne16Test):
"bits": weight_bits,
"offset": weight_offset,
},
- "scale": {"bits": 8},
- "bias": {"bits": 32},
+ "scale": {
+ "bits": test.conf.scale_type._bits
+ if test.conf.scale_type is not None
+ else 0
+ },
+ "bias": {
+ "bits": test.conf.bias_type._bits
+ if test.conf.bias_type is not None
+ else 0
+ },
"padding": {
"top": test.conf.padding.top,
"bottom": test.conf.padding.bottom,
diff --git a/test/README.md b/test/README.md
index c3d29c5..8442493 100644
--- a/test/README.md
+++ b/test/README.md
@@ -35,3 +35,9 @@ $ pytest test.py --help
- [testgen.py](testgen.py): collection of helper tools for individual tests
For more information you can run the script with the `-h` flag.
+
+## Application
+
+The Makefile in the `app/` directory uses a flag `ACCELERATOR` to decide which accelerator to use.
+The choices are _ne16_ or _neureka_.
+You can either export it or set it on the command line, e.g. `ACCELERATOR=ne16 make clean all run`.
diff --git a/test/TestClasses.py b/test/TestClasses.py
index c10641c..c6267d6 100644
--- a/test/TestClasses.py
+++ b/test/TestClasses.py
@@ -16,15 +16,16 @@
#
# SPDX-License-Identifier: Apache-2.0
-from functools import cached_property
import re
-from typing import Any, Dict, Literal, Optional, TYPE_CHECKING
+from functools import cached_property
+from typing import TYPE_CHECKING, Any, Dict, Literal, Optional
+
from pydantic import (
BaseModel,
- model_serializer,
- model_validator,
NonNegativeInt,
PositiveInt,
+ model_serializer,
+ model_validator,
)
diff --git a/test/app/Makefile b/test/app/Makefile
index 14f30fd..ca65892 100644
--- a/test/app/Makefile
+++ b/test/app/Makefile
@@ -40,6 +40,8 @@ INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp
INC_DIRS += gen/inc
INC_FLAGS += $(addprefix -I,$(INC_DIRS))
+APP_CFLAGS += $(INC_FLAGS)
+
# Source files
@@ -58,7 +60,9 @@ APP_SRCS += $(wildcard gen/src/*.c)
# Flags
-APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto
-APP_LDFLAGS += -flto
+ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:])
+APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE)
+
+APP_CFLAGS += -O2 -w -Wall -Werror
include $(RULES_DIR)/pmsis_rules.mk
diff --git a/test/app/src/main.c b/test/app/src/main.c
index cc67050..7cce4bf 100644
--- a/test/app/src/main.c
+++ b/test/app/src/main.c
@@ -29,8 +29,9 @@ int main() {
struct pi_cluster_conf cl_conf;
struct pi_cluster_task cl_task;
- printf("\n");
- printf("Test %s starting\n", TEST_NAME);
+ printf("\nTest " TEST_NAME " starting\n");
+
+ printf("\nAccelerator: " NNX_ACCELERATOR "\n");
printf("\n");
layer_info();
@@ -43,13 +44,13 @@ int main() {
}
pi_cluster_send_task_to_cl(
&cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL));
- pi_cluster_close(&cl_dev);
-
- printf("\n");
- printf("Test %s finished\n", TEST_NAME);
printf("\n");
check_output();
+ pi_cluster_close(&cl_dev);
+
+ printf("\nTest " TEST_NAME " finished\n");
+
return 0;
}
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c
index ffd93a1..0d98ff6 100644
--- a/test/app/src/nnx_layer.c
+++ b/test/app/src/nnx_layer.c
@@ -19,12 +19,89 @@
*/
#include "nnx_layer.h"
+#include
+
+#ifdef NNX_NE16
+
#include "ne16.h"
#include "ne16_gvsoc.h"
#include "ne16_pulp_bsp.h"
#include "ne16_task.h"
#include "pulp_nnx_ne16.h"
-#include
+
+typedef ne16_norm_mode_e nnx_norm_mode_e;
+typedef ne16_quant_t nnx_quant_t;
+typedef ne16_norm_t nnx_norm_t;
+typedef ne16_task_t nnx_task_t;
+typedef ne16_dev_t nnx_dev_t;
+typedef ne16_pulp_conf_t nnx_bsp_conf_t;
+
+#define nnxTaskFlagTrue ne16TaskFlagTrue
+#define nnxTaskFlagFalse ne16TaskFlagFalse
+
+#define nnx_task_init ne16_task_init
+#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv
+#define nnx_task_set_bits ne16_task_set_bits
+#define nnx_task_set_norm_quant ne16_task_set_norm_quant
+#define nnx_task_set_weight_offset ne16_task_set_weight_offset
+#define nnx_task_set_dims ne16_task_set_dims
+#define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2
+#define nnx_task_set_ptrs ne16_task_set_ptrs
+
+#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_ALL
+#define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL
+#define nnx_gvsoc_log_activate ne16_gvsoc_log_activate
+#define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate
+
+#define nnx_bsp_get_dev ne16_pulp_get_dev
+
+#define nnx_init ne16_nnx_init
+#define nnx_dispatch_wait ne16_nnx_dispatch_wait
+#define nnx_dispatch_stride2x2 ne16_nnx_dispatch_stride2x2
+#define nnx_dispatch ne16_nnx_dispatch
+#define nnx_resolve_wait ne16_nnx_resolve_wait
+#define nnx_term ne16_nnx_term
+
+#elif defined NNX_NEUREKA
+
+#include "neureka.h"
+#include "neureka_gvsoc.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include "pulp_nnx_neureka.h"
+
+typedef neureka_norm_mode_e nnx_norm_mode_e;
+typedef neureka_quant_t nnx_quant_t;
+typedef neureka_norm_t nnx_norm_t;
+typedef neureka_task_t nnx_task_t;
+typedef neureka_dev_t nnx_dev_t;
+typedef neureka_siracusa_conf_t nnx_bsp_conf_t;
+
+#define nnxTaskFlagTrue neurekaTaskFlagTrue
+#define nnxTaskFlagFalse neurekaTaskFlagFalse
+
+#define nnx_task_init neureka_task_init
+#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv
+#define nnx_task_set_bits neureka_task_set_bits
+#define nnx_task_set_norm_quant neureka_task_set_norm_quant
+#define nnx_task_set_weight_offset neureka_task_set_weight_offset
+#define nnx_task_set_dims neureka_task_set_dims
+#define nnx_task_set_ptrs neureka_task_set_ptrs
+
+#define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL
+#define NNX_GVSOC_LOG_FORMAT NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL
+#define nnx_gvsoc_log_activate neureka_gvsoc_log_activate
+#define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate
+
+#define nnx_bsp_get_dev neureka_siracusa_get_dev
+
+#define nnx_init neureka_nnx_init
+#define nnx_dispatch_wait neureka_nnx_dispatch_wait
+#define nnx_dispatch neureka_nnx_dispatch
+#define nnx_resolve_wait neureka_nnx_resolve_wait
+#define nnx_term neureka_nnx_term
+
+#endif // NNX_NE16 || NNX_NEUREKA
// Generated headers
#include "bias.h"
@@ -34,73 +111,109 @@
#include "scale.h"
#include "weight.h"
-static void task_prepare(ne16_task_t *task) {
- ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS,
- WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET,
- (ne16_quant_t){.shift_amount = OUTSHIFT,
- .mode = quantMode8Bit,
- .function = HAS_RELU ? quantFunctionRelu
- : quantFunctionIdentity,
- .flag_rounding = ne16TaskFlagFalse},
- (ne16_norm_t){.mode = normMode8Bit,
- .flag_bias = HAS_BIAS ? ne16TaskFlagTrue
- : ne16TaskFlagFalse,
- .flag_shift = ne16TaskFlagFalse},
- STRIDE_HEIGHT);
-
- if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) {
- ne16_task_set_dims_stride2x2(
- task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL,
- OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP,
- PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
- } else {
- ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
- OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP,
- PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
- }
-
- ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL,
- INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output,
- (uint32_t)weight, (uint32_t)scale, NULL,
+static void task_prepare(nnx_task_t *task) {
+ nnx_task_init(task);
+ nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT);
+ nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS);
+
+#if HAS_NORM_QUANT == 1
+#if SCALE_BITS == 8
+ const nnx_norm_mode_e normMode = normMode8Bit;
+#elif SCALE_BITS == 32
+ const nnx_norm_mode_e normMode = normMode32Bit;
+#endif
+
+ nnx_task_set_norm_quant(
+ task,
+ (nnx_quant_t){.shift_amount = OUTSHIFT,
+ .function =
+ HAS_RELU ? quantFunctionRelu : quantFunctionIdentity,
+ .flag_rounding = nnxTaskFlagFalse},
+ (nnx_norm_t){.mode = normMode,
+ .flag_bias = HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse,
+ .flag_shift = nnxTaskFlagFalse});
+#endif // HAS_NORM_QUANT
+
+ nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET);
+
+#ifdef NNX_NEUREKA
+#ifdef NEUREKA_WEIGHT_SOURCE_WMEM
+ neureka_task_set_weight_source(task, neurekaWeightSourceWmem);
+#else
+ neureka_task_set_weight_source(task, neurekaWeightSourceTcdm);
+#endif
+#if INPUT_SIGNED == 1
+ neureka_task_set_input_signed(task);
+#else
+ neureka_task_set_input_unsigned(task);
+#endif
+#endif
+
+ const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8;
+ const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride;
+ const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8;
+ const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride;
+
+#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2
+ nnx_task_set_dims_stride2x2(
+ task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride,
+ OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride,
+ WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT,
+ PADDING_LEFT);
+#else
+ nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride,
+ OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride,
+ w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT,
+ PADDING_LEFT);
+#endif
+
+ nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, w_in_stride,
+ PADDING_TOP, PADDING_LEFT, (uint32_t)output,
+ (uint32_t)weight,
+#if HAS_NORM_QUANT == 1
+ (uint32_t)scale, NULL,
#if HAS_BIAS == 1
- (uint32_t)bias
+ (uint32_t)bias
+#else
+ NULL
+#endif
#else
- NULL
+ NULL, NULL, NULL
#endif
);
}
-static void task_execute(ne16_task_t *task) {
- ne16_dev_t *dev = ne16_pulp_get_dev();
+static void task_execute(nnx_task_t *task) {
+ nnx_dev_t *dev = nnx_bsp_get_dev();
- ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG,
- NE16_GVSOC_LOG_FORMAT_HEXADECIMAL);
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, NNX_GVSOC_LOG_FORMAT);
+#endif
- ne16_pulp_conf_t conf = {.max_stall = 8};
- ne16_nnx_init(dev, &conf);
+ nnx_bsp_conf_t conf = {.max_stall = 8};
+ nnx_init(dev, &conf);
- ne16_nnx_dispatch_wait(dev);
+ nnx_dispatch_wait(dev);
- if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) {
- ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
- OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL,
- WEIGHT_HEIGHT, WEIGHT_WIDTH);
- } else {
- ne16_nnx_dispatch(dev, task);
- }
+#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2
+ nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT,
+ OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT,
+ WEIGHT_WIDTH);
+#else
+ nnx_dispatch(dev, task);
+#endif
- ne16_nnx_resolve_wait(dev, task);
+ nnx_resolve_wait(dev, task);
- ne16_nnx_term(dev);
+ nnx_term(dev);
- ne16_gvsoc_log_deactivate(dev);
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ nnx_gvsoc_log_deactivate(dev);
+#endif
}
void execute_nnx_layer(void *args) {
- ne16_task_t task;
+ nnx_task_t task;
task_prepare(&task);
task_execute(&task);
}
diff --git a/test/conf.toml b/test/conf.toml
index 1222f1d..c24055a 100644
--- a/test/conf.toml
+++ b/test/conf.toml
@@ -22,7 +22,7 @@
# Ne16TestClasses.py:Ne16TestConf().check_valid()
# Input dimensions
-in_height = 3
+in_height = 4
in_width = 3
in_channel = 8
diff --git a/test/conftest.py b/test/conftest.py
index 6c2c15b..3c0a316 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -18,7 +18,17 @@
import os
from typing import Union
-from Ne16TestClasses import Ne16Test, Ne16TestGenerator
+
+import pydantic
+import pytest
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from Ne16TestConf import Ne16TestConf
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NeurekaTestConf import NeurekaTestConf
+from NnxTestClasses import NnxTest, NnxTestGenerator
+
+_SUPPORTED_ACCELERATORS = ["ne16", "neureka"]
def pytest_addoption(parser):
@@ -39,6 +49,13 @@ def pytest_addoption(parser):
default=False,
help="Recursively search for tests in given test directories.",
)
+ parser.addoption(
+ "-A",
+ "--accelerator",
+ choices=_SUPPORTED_ACCELERATORS,
+ default="ne16",
+ help="Choose an accelerator to test. Default: ne16",
+ )
parser.addoption(
"--regenerate",
action="store_true",
@@ -54,7 +71,7 @@ def pytest_addoption(parser):
def _find_test_dirs(path: Union[str, os.PathLike]):
- return [dirpath for dirpath, _, _ in os.walk(path) if Ne16Test.is_test_dir(dirpath)]
+ return [dirpath for dirpath, _, _ in os.walk(path) if NnxTest.is_test_dir(dirpath)]
def pytest_generate_tests(metafunc):
@@ -62,6 +79,18 @@ def pytest_generate_tests(metafunc):
recursive = metafunc.config.getoption("recursive")
regenerate = metafunc.config.getoption("regenerate")
timeout = metafunc.config.getoption("timeout")
+ nnxName = metafunc.config.getoption("accelerator")
+
+ if nnxName == "ne16":
+ nnxMemoryLayoutCls = Ne16MemoryLayout
+ nnxTestConfCls = Ne16TestConf
+ elif nnxName == "neureka":
+ nnxMemoryLayoutCls = NeurekaMemoryLayout
+ nnxTestConfCls = NeurekaTestConf
+ else:
+ assert (
+ False
+ ), f"Given accelerator {nnxName} not supported. Supported accelerators: {_SUPPORTED_ACCELERATORS}"
if recursive:
tests_dirs = test_dirs
@@ -69,12 +98,28 @@ def pytest_generate_tests(metafunc):
for tests_dir in tests_dirs:
test_dirs.extend(_find_test_dirs(tests_dir))
- # (Re)Generate test data
+ # Load valid tests
+ nnxTestAndNames = []
for test_dir in test_dirs:
- test = Ne16Test.load(test_dir)
- if not test.is_valid() or regenerate:
- test = Ne16TestGenerator.from_conf(test.conf)
- test.save_data(test_dir)
+ try:
+ test = NnxTest.load(nnxTestConfCls, test_dir)
+ # (Re)generate data
+ if not test.is_valid() or regenerate:
+ test = NnxTestGenerator.from_conf(test.conf)
+ test.save_data(test_dir)
+ nnxTestAndNames.append((test, test_dir))
+ except pydantic.ValidationError as e:
+ _ = e
+ nnxTestAndNames.append(
+ pytest.param(
+ (None, test_dir),
+ marks=pytest.mark.skipif(
+ True, reason=f"Invalid test {test_dir}: {e.errors}"
+ ),
+ )
+ )
- metafunc.parametrize("path", test_dirs)
+ metafunc.parametrize("nnxTestAndName", nnxTestAndNames)
metafunc.parametrize("timeout", [timeout])
+ metafunc.parametrize("nnxName", [nnxName])
+ metafunc.parametrize("nnxMemoryLayoutCls", [nnxMemoryLayoutCls])
diff --git a/test/requirements-dev.txt b/test/requirements-dev.txt
index fa0a75a..0956e5e 100644
--- a/test/requirements-dev.txt
+++ b/test/requirements-dev.txt
@@ -1,2 +1,3 @@
pyright
black
+isort
diff --git a/test/test.py b/test/test.py
index 39709b6..1893cdf 100644
--- a/test/test.py
+++ b/test/test.py
@@ -16,13 +16,16 @@
#
# SPDX-License-Identifier: Apache-2.0
+import locale
import os
import re
-from typing import Union, Optional, Tuple
-import locale
import subprocess
-from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator
from pathlib import Path
+from typing import Dict, Optional, Tuple, Type, Union
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator
HORIZONTAL_LINE = "\n" + "-" * 100 + "\n"
@@ -49,17 +52,29 @@ def captured_output(
def execute_command(
- cmd: str, timeout: int = 30, cflags: Optional[str] = None
+ cmd: str,
+ timeout: int = 30,
+ cflags: Optional[str] = None,
+ envflags: Optional[Dict[str, str]] = None,
) -> Tuple[bool, str, str, Optional[str]]:
- app_cflags = 'APP_CFLAGS="' + " ".join(cflags) + '" ' if cflags else ""
- cmd = cmd + app_cflags
+ env = os.environ
+ if cflags:
+ env["APP_CFLAGS"] = '"' + " ".join(cflags) + '"'
+ if envflags:
+ for key, value in envflags.items():
+ env[key] = value
status = None
stdout = None
try:
proc = subprocess.run(
- cmd.split(), check=True, capture_output=True, text=True, timeout=timeout
+ cmd.split(),
+ check=True,
+ capture_output=True,
+ text=True,
+ timeout=timeout,
+ env=env,
)
status = True
msg = "OK"
@@ -94,28 +109,35 @@ def assert_message(
return retval
-def test(path: str, timeout: int):
- test_name = path
- test = Ne16Test.load(path)
-
- Ne16TestHeaderGenerator().generate(test_name, test)
+def test(
+ nnxTestAndName: Tuple[NnxTest, str],
+ timeout: int,
+ nnxName: str,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+):
+ nnxTest, nnxTestName = nnxTestAndName
+ NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate(
+ nnxTestName, nnxTest
+ )
Path("app/src/nnx_layer.c").touch()
cmd = f"make -C app all run platform=gvsoc"
- passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout)
+ passed, msg, stdout, stderr = execute_command(
+ cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName}
+ )
- assert passed, assert_message(msg, test_name, cmd, stdout, stderr)
+ assert passed, assert_message(msg, nnxTestName, cmd, stdout, stderr)
match_success = re.search(r"> Success! No errors found.", stdout)
match_fail = re.search(r"> Failure! Found (\d*)/(\d*) errors.", stdout)
assert match_success or match_fail, assert_message(
- "No regexes matched.", test_name, cmd, stdout
+ "No regexes matched.", nnxTestName, cmd, stdout
)
assert not match_fail, assert_message(
f"Errors found: {match_fail.group(1)}/{match_fail.group(2)}",
- test_name,
+ nnxTestName,
cmd,
stdout,
)
diff --git a/test/testgen.py b/test/testgen.py
index e748f2e..521aecc 100644
--- a/test/testgen.py
+++ b/test/testgen.py
@@ -16,28 +16,61 @@
#
# SPDX-License-Identifier: Apache-2.0
-import os
import argparse
import json
+import os
+from typing import Optional, Set, Type, Union
+
import toml
-from typing import Optional, Union, Set
-from Ne16TestClasses import (
- Ne16TestConf,
- Ne16TestGenerator,
- Ne16Test,
- Ne16TestHeaderGenerator,
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from Ne16TestConf import Ne16TestConf
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NeurekaTestConf import NeurekaTestConf
+from NnxTestClasses import (
+ NnxTest,
+ NnxTestConf,
+ NnxTestGenerator,
+ NnxTestHeaderGenerator,
)
-def headers_gen(args, test: Optional[Ne16Test] = None):
+def headers_gen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+ test: Optional[NnxTest] = None,
+):
if test is None:
- test = Ne16Test.load(args.test_dir)
+ test = NnxTest.load(nnxTestConfCls, args.test_dir)
+ assert test is not None
if not test.is_valid():
- test = Ne16TestGenerator.from_conf(test.conf)
- Ne16TestHeaderGenerator().generate(args.test_dir, test)
-
-
-def test_gen(args):
+ test = NnxTestGenerator.from_conf(test.conf)
+ NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate(
+ args.test_dir, test
+ )
+
+
+def print_tensors(test: NnxTest):
+ print("INPUT TENSOR:")
+ print(test.input)
+ print("WEIGHT TENSOR:")
+ print(test.weight)
+ print("SCALE TENSOR:")
+ print(test.scale)
+ print("BIAS TENSOR:")
+ print(test.bias)
+ print("GLOBAL SHIFT TENSOR:")
+ print(test.global_shift)
+ print("EXPECTED OUTPUT TENSOR:")
+ print(test.output)
+
+
+def test_gen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+):
if args.conf.endswith(".toml"):
test_conf_dict = toml.load(args.conf)
elif args.conf.endswith(".json"):
@@ -49,37 +82,71 @@ def test_gen(args):
)
exit(-1)
- test_conf = Ne16TestConf.model_validate(test_conf_dict)
- test = Ne16TestGenerator.from_conf(test_conf)
+ test_conf = nnxTestConfCls.model_validate(test_conf_dict)
+ test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors)
if not args.skip_save:
test.save(args.test_dir)
if args.headers:
- headers_gen(args, test)
-
-
-def _regen(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None:
- test = Ne16Test.load(path)
- test = Ne16TestGenerator.regenerate(test, regen_tensors)
+ headers_gen(args, nnxMemoryLayoutCls, nnxTestConfCls, test)
+ if args.print_tensors:
+ print_tensors(test)
+
+
+def _regen(
+ path: Union[str, os.PathLike],
+ regen_tensors: Set[str],
+ nnxTestConfCls: Type[NnxTestConf],
+) -> None:
+ test = NnxTest.load(nnxTestConfCls, path)
+ test = NnxTestGenerator.regenerate(test, regen_tensors)
test.save(path)
-def _regen_recursive(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None:
- if Ne16Test.is_test_dir(path):
- _regen(path, regen_tensors)
+def _regen_recursive(
+ path: Union[str, os.PathLike],
+ regen_tensors: Set[str],
+ nnxTestConfCls: Type[NnxTestConf],
+) -> None:
+ if NnxTest.is_test_dir(path):
+ _regen(path, regen_tensors, nnxTestConfCls)
return
for dirpath, _, _ in os.walk(path):
- _regen_recursive(dirpath, regen_tensors)
+ _regen_recursive(dirpath, regen_tensors, nnxTestConfCls)
-def test_regen(args):
+def test_regen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+):
+ _ = nnxMemoryLayoutCls
regen_tensors = set(args.tensors + ["output"])
for test_dir in args.test_dirs:
if args.recurse:
- _regen_recursive(test_dir, regen_tensors)
+ _regen_recursive(test_dir, regen_tensors, nnxTestConfCls)
else:
- _regen(test_dir, regen_tensors)
+ _regen(test_dir, regen_tensors, nnxTestConfCls)
+
+
+def add_common_arguments(parser: argparse.ArgumentParser):
+ parser.add_argument(
+ "-t",
+ "--test-dir",
+ type=str,
+ dest="test_dir",
+ required=True,
+ help="Path to the test.",
+ )
+
+ parser.add_argument(
+ "-a",
+ "--accelerator",
+ choices=["ne16", "neureka"],
+ default="ne16",
+ help="Choose an accelerator. Default: ne16",
+ )
parser = argparse.ArgumentParser(
@@ -91,14 +158,7 @@ def test_regen(args):
parser_header = subparsers.add_parser(
"headers", description="Generate headers for a single test."
)
-parser_header.add_argument(
- "-t",
- "--test-dir",
- type=str,
- dest="test_dir",
- required=True,
- help="Path to the test." "basename.",
-)
+add_common_arguments(parser_header)
parser_header.set_defaults(func=headers_gen)
parser_test = subparsers.add_parser(
@@ -112,14 +172,6 @@ def test_regen(args):
required=True,
help="Path to the configuration file.",
)
-parser_test.add_argument(
- "-t",
- "--test-dir",
- type=str,
- dest="test_dir",
- required=True,
- help="Path to the test. " "basename.",
-)
parser_test.add_argument(
"--headers", action="store_true", default=False, help="Generate headers."
)
@@ -130,6 +182,14 @@ def test_regen(args):
dest="skip_save",
help="Skip saving the test.",
)
+parser_test.add_argument(
+ "--print-tensors",
+ action="store_true",
+ default=False,
+ dest="print_tensors",
+ help="Print tensor values to stdout.",
+)
+add_common_arguments(parser_test)
parser_test.set_defaults(func=test_gen)
parser_regen = subparsers.add_parser("regen", description="Regenerate test tensors.")
@@ -138,25 +198,27 @@ def test_regen(args):
type=str,
nargs="?",
default=[],
- help="Tensors that should be regenerated. Output " "included by default.",
-)
-parser_regen.add_argument(
- "-t",
- "--test-dir",
- action="append",
- dest="test_dirs",
- required=True,
- help="Path to the test.",
+ help="Tensors that should be regenerated. Output included by default.",
)
parser_regen.add_argument(
"-r",
"--recursive",
action="store_true",
default=False,
- help="Recursively search for test directiories " "inside given test directories.",
+    help="Recursively search for test directories inside given test directories.",
)
+add_common_arguments(parser_regen)
parser_regen.set_defaults(func=test_regen)
args = parser.parse_args()
-args.func(args)
+if args.accelerator == "ne16":
+ nnxMemoryLayoutCls = Ne16MemoryLayout
+ nnxTestConfCls = Ne16TestConf
+elif args.accelerator == "neureka":
+ nnxMemoryLayoutCls = NeurekaMemoryLayout
+ nnxTestConfCls = NeurekaTestConf
+else:
+ assert False, f"Unsupported accelerator {args.accelerator}."
+
+args.func(args, nnxMemoryLayoutCls, nnxTestConfCls)
diff --git a/test/tests/test_102/conf.json b/test/tests/test_102/conf.json
new file mode 100644
index 0000000..d6d0c17
--- /dev/null
+++ b/test/tests/test_102/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 4,
+ "in_width": 3,
+ "in_channel": 8,
+ "out_channel": 8,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_103/conf.json b/test/tests/test_103/conf.json
new file mode 100644
index 0000000..3eff547
--- /dev/null
+++ b/test/tests/test_103/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 25,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_104/conf.json b/test/tests/test_104/conf.json
new file mode 100644
index 0000000..d6d00e4
--- /dev/null
+++ b/test/tests/test_104/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 25,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_105/conf.json b/test/tests/test_105/conf.json
new file mode 100644
index 0000000..0f34422
--- /dev/null
+++ b/test/tests/test_105/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 40,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_106/conf.json b/test/tests/test_106/conf.json
new file mode 100644
index 0000000..0b98f3a
--- /dev/null
+++ b/test/tests/test_106/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 17,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_107/conf.json b/test/tests/test_107/conf.json
new file mode 100644
index 0000000..2f8951c
--- /dev/null
+++ b/test/tests/test_107/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 17,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_108/conf.json b/test/tests/test_108/conf.json
new file mode 100644
index 0000000..7842aaa
--- /dev/null
+++ b/test/tests/test_108/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_109/conf.json b/test/tests/test_109/conf.json
new file mode 100644
index 0000000..a6b71c9
--- /dev/null
+++ b/test/tests/test_109/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_110/conf.json b/test/tests/test_110/conf.json
new file mode 100644
index 0000000..622efc4
--- /dev/null
+++ b/test/tests/test_110/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_111/conf.json b/test/tests/test_111/conf.json
new file mode 100644
index 0000000..d6714c4
--- /dev/null
+++ b/test/tests/test_111/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_112/conf.json b/test/tests/test_112/conf.json
new file mode 100644
index 0000000..1991c59
--- /dev/null
+++ b/test/tests/test_112/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 1,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_113/conf.json b/test/tests/test_113/conf.json
new file mode 100644
index 0000000..1dce097
--- /dev/null
+++ b/test/tests/test_113/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 1
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_114/conf.json b/test/tests/test_114/conf.json
new file mode 100644
index 0000000..c1ce5c3
--- /dev/null
+++ b/test/tests/test_114/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 1,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_115/conf.json b/test/tests/test_115/conf.json
new file mode 100644
index 0000000..19153ba
--- /dev/null
+++ b/test/tests/test_115/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 1,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/util/hwpe.c b/util/hwpe.c
index 53c1ace..0430081 100644
--- a/util/hwpe.c
+++ b/util/hwpe.c
@@ -31,11 +31,11 @@
#define HWPE_TASK_REG_OFFSET 8
inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
- *(dev->base_addr + reg) = value;
+ dev->base_addr[reg] = value;
}
inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) {
- return *(dev->base_addr + reg);
+ return dev->base_addr[reg];
}
inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
diff --git a/util/pulp_nnx_util.c b/util/pulp_nnx_util.c
index 34db512..0107fc1 100644
--- a/util/pulp_nnx_util.c
+++ b/util/pulp_nnx_util.c
@@ -20,14 +20,16 @@
#include "pulp_nnx_util.h"
-inline int divnceil(const int dividend, const int divisor) {
- return ((dividend - 1) / divisor) + 1;
+inline int nnx_calculate_number_of_tiles(const int dim_size,
+ const int tile_size) {
+ return ((dim_size - 1) / tile_size) + 1;
}
-inline int remainder(const int dividend, const int divisor) {
- return ((dividend - 1) % divisor) + 1;
+inline int nnx_calculate_last_tile_size(const int dim_size,
+ const int tile_size) {
+ return ((dim_size - 1) % tile_size) + 1;
}
-inline uint32_t concat_half(const uint16_t high, const uint16_t low) {
+inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) {
return ((uint32_t)high << 16) | low;
}
diff --git a/util/pulp_nnx_util.h b/util/pulp_nnx_util.h
index 638e5d9..d167f6d 100644
--- a/util/pulp_nnx_util.h
+++ b/util/pulp_nnx_util.h
@@ -24,26 +24,28 @@
#include <stdint.h>
/**
- * divnceil
+ * nnx_calculate_number_of_tiles
*
- * Does integer division and ceiling of it.
+ * Calculates the number of iterations to go through a dimension.
+ * It does it by dividing the dimension with the tile size and doing a ceiling
+ * the result.
*/
-int divnceil(const int dividend, const int divisor);
+int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size);
/**
- * remainder
+ * nnx_calculate_last_tile_size
*
- * Calculates the remainder but if the remainder should be 0,
- * returns divisor. Used for calculation of the last `remainding`
- * iteration of the tile.
+ * Calculates the size of the last executed tile by calculating the remainder of
+ * the dim_size and the tile_size. In case the remainder is 0, it returns the
+ * full tile_size.
*/
-int remainder(const int dividend, const int divisor);
+int nnx_calculate_last_tile_size(const int dim_size, const int tile_size);
/**
- * concat_half
+ * nnx_concat_half
*
* Concatenate 2 16-bit numbers into a 32-bit number.
*/
-uint32_t concat_half(const uint16_t high, const uint16_t low);
+uint32_t nnx_concat_half(const uint16_t high, const uint16_t low);
#endif // __NNX_UTIL_H__