diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b595682..4c7b267 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,25 +20,41 @@ stages:
- lint
- test
-format_python:
+python_format:
stage: lint
tags:
- python-lint
script:
- black --check .
-static_check_python:
+python_sort_imports:
+ stage: lint
+ tags:
+ - python-lint
+ script:
+ - isort --check test
+
+python_static_check:
stage: lint
tags:
- python-lint
script:
- pyright .
-run_test0:
+run_ne16_test:
stage: test
tags:
- gap9-sdk
artifacts:
untracked: true
script:
- - cd test && pytest test.py --test-dir tests --recursive
+ - cd test && pytest test.py --test-dir tests --recursive -A ne16
+
+run_neureka_test:
+ stage: test
+ tags:
+ - siracusa-sdk
+ artifacts:
+ untracked: true
+ script:
+ - cd test && pytest test.py --test-dir tests --recursive -A neureka
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48a4461..84b516f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,25 @@
# Changelog
+## [Unreleased]
+
+### Added
+
+- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels
+- Support for kernels without normalization and quantization for NE16
+- isort check
+- publication citation
+
+### Changed
+
+- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
+- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
+- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
+
+### Removed
+
+- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2`
+- `mode` attribute from `ne16_quant_t` structure
+
## [0.3.0] - 2024-01-14
### Added
diff --git a/README.md b/README.md
index be8c9be..1671dc7 100644
--- a/README.md
+++ b/README.md
@@ -39,51 +39,22 @@ _Note: The accelerator can provide additional helper functions if needed._
## Accelerators
-### NE16
-
-Github repo [link](https://github.com/pulp-platform/ne16).
-
-#### Implemented features
-
-- [x] Convolution w/ kernel shape 1x1
-- [x] Convolution w/ kernel shape 3x3
-- [x] Depthwise convolution w/ kernel shape 3x3
-- [x] Stride 1x1
-- [x] Stride 2x2
-- [ ] Normalization and quantization
- - [x] With
- - [ ] Without
- - [x] Relu (w/ and w/o)
- - [x] Bias (w/ and w/o)
- - [ ] Per-channel shift
- - [x] Per-layer shift
- - [ ] Rounding
-- [ ] Input type
- - [x] uint8
- - [ ] uint16
-- [ ] Output type
- - [x] int8
- - [x] uint8 (only w/ Relu)
- - [ ] int32
- - [ ] uint32 (only w/ Relu)
-- [ ] Scale type
- - [x] uint8
- - [ ] uint16
- - [ ] uint32
-- [x] Bias type
- - [x] int32
-- [ ] Weight type
- - [x] int8
- - [ ] int2-7
-
-### Neureka
-
-**Untested and considered broken.**
+- [NE16](ne16/README.md)
+- [Neureka](neureka/README.md)
## Testing
You can find information about testing in the dedicated [README](test/README.md).
+### Environment
+
+The library was tested with following pairs of SDKs and compilers:
+
+| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash |
+| --- | --------------- | -------- | -------------------- |
+| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 |
+| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |
+
## Contributing
Bug reports and feature requests should be reported through issues.
@@ -93,15 +64,38 @@ All the development should be done through forks and merged onto the `dev` branc
The library will follow the [Semantic Versioning](https://semver.org/).
-## Citing
+## Publication
+
+
+If you use PULP-NNX in your work, you can cite us:
+
+```
+@inproceedings{10.1145/3607889.3609092,
+ author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco},
+ title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study},
+ year = {2024},
+ isbn = {9798400702907},
+ publisher = {Association for Computing Machinery},
+ address = {New York, NY, USA},
+ url = {https://doi.org/10.1145/3607889.3609092},
+ doi = {10.1145/3607889.3609092},
+ abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.},
+ booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems},
+ pages = {9–10},
+ numpages = {2},
+ keywords = {TinyML, MCUs, deep learning, HW accelerators},
+ location = {, Hamburg, Germany, },
+ series = {CASES '23 Companion}
+}
+```
-*TBA*
+
## Contributors
* Luka Macan <[luka.macan@unibo.it](mailto:luka.macan@unibo.it)>
* Francesco Conti <[fconti@unibo.it](mailto:fconti@unibo.it)>
-* Arpan Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>
+* Arpan Suravi Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)>
## License
diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h
index eff9a60..97e6e2e 100644
--- a/inc/pulp_nnx_ne16.h
+++ b/inc/pulp_nnx_ne16.h
@@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev);
/** ne16_nnx_dispatch
*
* Dispatch a task to the accelerator.
- * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns
+ * 0.
*/
int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task);
@@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task);
*/
void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
-
/* Additional helper functions */
/** ne16_nnx_dispatch_stride2x2
@@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
* tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
* Works only if the k_out is divisible by 2.
*/
-void ne16_nnx_dispatch_stride2x2(
- ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
- const uint32_t w_in_stride, const uint32_t k_in_stride,
- const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t h_ker, const uint8_t w_ker);
+void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
+ const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out, const uint8_t h_ker,
+ const uint8_t w_ker);
diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h
new file mode 100644
index 0000000..25ef4a8
--- /dev/null
+++ b/inc/pulp_nnx_neureka.h
@@ -0,0 +1,61 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface */
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
+void neureka_nnx_term(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int neureka_nnx_dispatch_check(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns
+ * 0.
+ */
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_wait
+ *
+ * Block until you can resolve the task.
+ */
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
diff --git a/ne16/README.md b/ne16/README.md
new file mode 100644
index 0000000..9f05956
--- /dev/null
+++ b/ne16/README.md
@@ -0,0 +1,36 @@
+# NE16
+
+## Docs
+
+- Github repo [link](https://github.com/pulp-platform/ne16).
+
+## Implemented features
+
+- [x] Convolution w/ kernel shape 1x1
+- [x] Convolution w/ kernel shape 3x3
+- [x] Depthwise convolution w/ kernel shape 3x3
+- [x] Stride 2x2
+- [ ] Normalization and quantization
+ - [x] With
+ - [x] Without
+ - [x] Relu (w/ and w/o)
+ - [x] Bias (w/ and w/o)
+ - [ ] Per-channel shift
+ - [x] Per-layer shift
+ - [ ] Rounding
+- [ ] Input type
+ - [x] uint8
+ - [ ] uint16
+- [ ] Output type
+ - [x] int8
+ - [x] uint8 (only w/ Relu)
+ - [x] int32
+- [ ] Scale type
+ - [x] uint8
+ - [ ] uint16
+ - [ ] uint32
+- [x] Bias type
+ - [x] int32
+- [ ] Weight type
+ - [x] int8
+ - [ ] int2-7
diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c
index 97859b4..d92a7d5 100644
--- a/ne16/hal/ne16.c
+++ b/ne16/hal/ne16.c
@@ -23,8 +23,6 @@
#define NE16_STATUS_EMPTY (0x000)
#define NE16_STATUS_FULL (0x101)
-inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; }
-
inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) {
uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
return (status & 0x1) + ((status >> 8) & 0x1);
diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h
index c4c3a19..88ebee7 100644
--- a/ne16/hal/ne16.h
+++ b/ne16/hal/ne16.h
@@ -24,11 +24,12 @@
#include "hwpe.h"
#include <stdint.h>
+#define NE16_TASK_QUEUE_SIZE (2)
+
typedef struct ne16_dev_t {
hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
} ne16_dev_t;
-int ne16_task_queue_size(ne16_dev_t *dev);
int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev);
int ne16_task_queue_empty(ne16_dev_t *dev);
int ne16_task_queue_full(ne16_dev_t *dev);
diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c
index 0ba54d5..f8408da 100644
--- a/ne16/hal/ne16_task.c
+++ b/ne16/hal/ne16_task.c
@@ -22,9 +22,9 @@
#include "ne16_task_defs.h"
#include "pulp_nnx_util.h"
-inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
- uint32_t i_width, uint32_t n_height,
- uint32_t n_width) {
+uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width) {
uint32_t tile_padding = padding;
if (i_height > 0) {
tile_padding &= ~(0xf << 28);
@@ -41,41 +41,65 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
return tile_padding;
}
-void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
- const uint8_t depthwise, const uint8_t input_bits,
- const uint8_t output_bits, const uint8_t weights_bits,
- const ne16_weight_offset_mode_e weights_offset_mode,
- const uint32_t weights_offset_factor, ne16_quant_t quant,
- ne16_norm_t norm, const uint8_t stride) {
- const uint32_t flag_mode16 =
- input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC;
-
- *task = (ne16_task_t){
- .outbytes = output_bits / 8,
- .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16
- : NE16_WEIGHT_D0_STRIDE_MODE8,
- .qw = weights_bits,
- .stride_shift = stride == 2 ? 1 : 0,
- .output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT
- : NE16_OUTPUT_CHANNEL_THROUGHPUT,
- .kernel_shape = kernel_shape,
- .depthwise = depthwise,
- .data = {0}};
-
- const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0;
+void ne16_task_init(ne16_task_t *task) { *task = (ne16_task_t){.data = {0}}; }
+void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride) {
+ task->depthwise = depthwise;
+ task->kernel_shape = kernel_shape;
+ task->subtile_output_channel =
+ depthwise ? NE16_SUBTILE_INPUT_CHANNEL : NE16_SUBTILE_OUTPUT_CHANNEL;
const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1
: depthwise == 1 ? NE16_FLAG_MODE_3x3_DW
: NE16_FLAG_MODE_3x3;
+ const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0;
+
+ task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE | NE16_MASK_FLAG_STRIDE_2x2);
+ task->data.cfg.conf0 |= flag_mode | flag_stride2x2;
+}
+
+void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits, const uint8_t weight_bits) {
+ const uint32_t flag_mode16 =
+ input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC;
+
+ ne16_quant_mode_e quantMode;
+ if (output_bits == 16) {
+ quantMode = quantMode16Bit;
+ } else if (output_bits == 8) {
+ quantMode = quantMode8Bit;
+ } else {
+ quantMode = quantMode32Bit;
+ }
+
+ task->weight_d0_stride =
+ flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8;
+ task->qw = weight_bits;
+ task->data.cfg.conf0 &= ~(NE16_MASK_QUANT_MODE | NE16_MASK_FLAG_MODE16 |
+ NE16_MASK_FLAG_WEIGHT_BITS);
+ task->data.cfg.conf0 |= quantMode | flag_mode16 | (weight_bits - 1);
+}
+
+void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant,
+ ne16_norm_t norm) {
+ task->data.cfg.conf0 &=
+ ~(NE16_MASK_QUANT_FUNCTION | NE16_MASK_SHIFT_AMOUNT |
+ NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE |
+ NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT);
task->data.cfg.conf0 |=
- NE16_FLAG_NORM_QUANT | quant.function | quant.mode |
- (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING |
- norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS |
- norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode |
- flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2;
+ NE16_FLAG_NORM_QUANT | quant.function | (quant.shift_amount << 16) |
+ quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | norm.mode |
+ norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS |
+ norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT;
+}
- task->data.cfg.weight_offset_factor = weights_offset_factor;
+void ne16_task_set_weight_offset(ne16_task_t *task,
+ ne16_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset) {
+ task->data.cfg.conf0 &= ~NE16_MASK_WEIGHT_OFFSET_MODE;
+ task->data.cfg.conf0 |= weight_offset_mode;
+ task->data.cfg.weight_offset_factor = weight_offset;
}
/** ne16_pad_ptr
@@ -84,21 +108,18 @@ void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
* it was the start to the padded data.
* Necessary for input pointer when it's padded.
*/
-inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
- const uint32_t channel, const uint8_t bits,
- const uint8_t padding_top,
- const uint8_t padding_left) {
- return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
+ const uint8_t padding_top, const uint8_t padding_left) {
+ return ptr - (padding_top * width + padding_left) * width_stride;
}
-inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr,
- uint32_t w_in, uint32_t k_in, uint8_t bits_in,
- uint8_t padding_top, uint8_t padding_left,
- uint32_t output_ptr, uint32_t weights_ptr,
- uint32_t scale_ptr, uint32_t shift_ptr,
- uint32_t bias_ptr) {
+void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
+ uint32_t w_in_stride, uint8_t padding_top,
+ uint8_t padding_left, uint32_t output_ptr,
+ uint32_t weights_ptr, uint32_t scale_ptr,
+ uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.infeat_ptr =
- ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+ ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
task->data.scale_ptr = scale_ptr;
@@ -107,100 +128,101 @@ inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr,
}
void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
const uint32_t w_in_stride,
- const uint32_t k_in_stride,
- const uint32_t w_out_stride,
- const uint32_t k_out_stride) {
- const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride) {
+ const uint32_t num_k_in =
+ nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
const ne16_stride_t input_stride = {
- .d0 = k_in_stride,
- .d1 = k_in_stride * w_in_stride,
- .d2 = task->depthwise ? 0
- : k_in_stride * NE16_FILTER_BUFFER_SIZE *
- NE16_FILTER_BUFFER_SIZE};
+ .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
task->data.cfg.input_stride = input_stride;
- // WARNING: Stride works only for even output channel sizes (divisible by 2)
- const ne16_stride_t output_stride = {
- .d0 = 32,
- .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
- .d2 =
- (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift};
+ const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES,
+ .d1 = w_out_stride,
+ .d2 = h_out_stride};
task->data.cfg.output_stride = output_stride;
if (task->kernel_shape == 1) {
task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw;
task->data.cfg.weights_stride.d1 =
task->weight_d0_stride * task->qw * num_k_in;
- task->data.cfg.weights_stride.d2 = 0;
} else if (!task->depthwise) {
task->data.cfg.weights_stride.d0 =
NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride;
task->data.cfg.weights_stride.d1 = NE16_FILTER_SIZE * NE16_FILTER_SIZE *
task->weight_d0_stride * task->qw *
num_k_in;
- task->data.cfg.weights_stride.d2 = 0;
} else {
task->data.cfg.weights_stride.d0 =
NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride;
task->data.cfg.weights_stride.d1 = 0;
- task->data.cfg.weights_stride.d2 = 0;
}
+ task->data.cfg.weights_stride.d2 = 0;
}
void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
const uint32_t h_out, const uint32_t w_out,
const uint32_t k_out, const uint8_t padding_bottom,
const uint8_t padding_right) {
- const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput);
- const uint16_t num_Ki = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
- const uint16_t num_Ho = divnceil(h_out, NE16_FILTER_SIZE);
- const uint16_t num_Wo = divnceil(w_out, NE16_FILTER_SIZE);
-
- const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
- const uint16_t rem_Ki = remainder(k_in, NE16_INPUT_CHANNEL_THROUGHPUT);
- const uint16_t rem_Ho = remainder(h_out, NE16_FILTER_SIZE);
- const uint16_t rem_Wo = remainder(w_out, NE16_FILTER_SIZE);
+ const uint16_t num_Ko =
+ nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+ const uint16_t num_Ki =
+ nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+ const uint16_t num_Ho =
+ nnx_calculate_number_of_tiles(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t num_Wo =
+ nnx_calculate_number_of_tiles(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
+
+ const uint16_t rem_Ko =
+ nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+ const uint16_t rem_Ki =
+ nnx_calculate_last_tile_size(k_in, NE16_SUBTILE_INPUT_CHANNEL);
+ const uint16_t rem_Ho =
+ nnx_calculate_last_tile_size(h_out, NE16_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t rem_Wo =
+ nnx_calculate_last_tile_size(w_out, NE16_SUBTILE_OUTPUT_WIDTH);
const uint16_t rem_Hi =
(task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
const uint16_t rem_Wi =
(task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
const ne16_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
+ .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+ .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+ .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+ .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+ .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
task->data.cfg.subtile = subtile;
}
-inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
- const uint8_t bottom, const uint8_t left,
- const uint8_t right, const uint8_t value) {
+void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value) {
task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
((bottom & 0xf) << 20) | ((left & 0xf) << 16) |
(value & 0xff);
}
-inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
- const uint8_t right, const uint8_t bottom,
- const uint8_t left) {
+void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride,
- const uint32_t k_in_stride, const uint32_t h_out,
+ const uint32_t k_in, const uint32_t h_in_stride,
+ const uint32_t w_in_stride, const uint32_t h_out,
const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride, const uint8_t padding_top,
+ const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left) {
- ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
- k_out_stride);
+ ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
+ w_out_stride);
ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
padding_right);
ne16_task_set_padding(task, padding_top, padding_bottom, padding_left,
@@ -209,18 +231,20 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
void ne16_task_set_dims_stride2x2(
ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+ const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride,
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t stride = 2;
- ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
- k_out_stride);
+ // WARNING: works only for even output channel stride (divisible by 2)
+ ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride >> 1,
+ w_out_stride >> 1);
ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
- k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0);
+ k_out, h_in + padding_top >= 5 ? 0 : padding_bottom,
+ 0);
const uint8_t padding_bottom_new =
(h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h
index df16b6c..69bc78c 100644
--- a/ne16/hal/ne16_task.h
+++ b/ne16/hal/ne16_task.h
@@ -60,7 +60,6 @@ typedef enum ne16_quant_function_e {
typedef struct ne16_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
- ne16_quant_mode_e mode;
ne16_quant_function_e function;
int flag_rounding;
} ne16_quant_t;
@@ -110,38 +109,46 @@ typedef struct ne16_task_data_t {
typedef struct ne16_task_t {
ne16_task_data_t data;
- uint8_t outbytes;
uint8_t weight_d0_stride;
uint8_t qw;
- uint8_t stride_shift;
- uint8_t output_channel_throughput;
+ uint8_t subtile_output_channel;
uint8_t kernel_shape;
uint8_t depthwise;
uint8_t id;
} ne16_task_t;
-void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape,
- const uint8_t depthwise, const uint8_t input_bits,
- const uint8_t output_bits, const uint8_t weights_bits,
- const ne16_weight_offset_mode_e weights_offset_mode,
- const uint32_t weights_offset_factor, ne16_quant_t quant,
- ne16_norm_t norm, const uint8_t stride);
+void ne16_task_init(ne16_task_t *task);
+void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride);
+void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits, const uint8_t weight_bits);
+void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant,
+ ne16_norm_t norm);
+void ne16_task_set_weight_offset(ne16_task_t *task,
+ ne16_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset);
uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t i_width, uint32_t n_height,
uint32_t n_width);
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
- const uint32_t channel, const uint8_t bits,
- const uint8_t padding_top, const uint8_t padding_left);
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left);
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
- uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+ uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
+/** ne16_task_set_strides
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
const uint32_t w_in_stride,
- const uint32_t k_in_stride,
- const uint32_t w_out_stride,
- const uint32_t k_out_stride);
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride);
void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in,
const uint32_t h_out, const uint32_t w_out,
const uint32_t k_out, const uint8_t padding_bottom,
@@ -152,19 +159,32 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
+/** ne16_task_set_dims
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride,
- const uint32_t k_in_stride, const uint32_t h_out,
+ const uint32_t k_in, const uint32_t h_in_stride,
+ const uint32_t w_in_stride, const uint32_t h_out,
const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride, const uint8_t padding_top,
+ const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left);
+/** ne16_task_set_dims_stride2x2
+ *
+ * All the strides variables are strides between elements alongside that
+ * dimension and expressed in bytes. There is no stride variable for the channel
+ * dimension because the NE16 requires the channels to be contiguous.
+ */
void ne16_task_set_dims_stride2x2(
ne16_task_t *task, const uint32_t h_in, const uint32_t w_in,
- const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+ const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride,
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left);
diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h
index 803e30e..d3d7297 100644
--- a/ne16/hal/ne16_task_defs.h
+++ b/ne16/hal/ne16_task_defs.h
@@ -25,8 +25,13 @@
#define NE16_FILTER_SIZE (3)
#define NE16_FILTER_BUFFER_SIZE (5)
-#define NE16_INPUT_CHANNEL_THROUGHPUT (16)
-#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32)
+#define NE16_SUBTILE_INPUT_HEIGHT (5)
+#define NE16_SUBTILE_INPUT_WIDTH (5)
+#define NE16_SUBTILE_INPUT_CHANNEL (16)
+#define NE16_SUBTILE_OUTPUT_HEIGHT (3)
+#define NE16_SUBTILE_OUTPUT_WIDTH (3)
+#define NE16_SUBTILE_OUTPUT_CHANNEL (32)
+#define NE16_OUTPUT_BANDWIDTH_BYTES (32)
#define NE16_WEIGHT_D0_STRIDE_MODE8 (2)
#define NE16_WEIGHT_D0_STRIDE_MODE16 (1)
@@ -59,12 +64,6 @@
#define NE16_REG_FILTER_MASKING 22
#define NE16_REG_CONF0 23
-/* SHIFT */
-
-#define NE16_SHIFT_FLAG_NORM_BIAS (25)
-#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
-#define NE16_SHIFT_ROUNDING (11)
-
/* CONF0 FLAGS */
#define NE16_FLAG_NORM_BIAS (1 << 25)
@@ -81,7 +80,7 @@
#define NE16_NORM_MODE_8BIT (0 << 12)
#define NE16_NORM_MODE_16BIT (1 << 12)
#define NE16_NORM_MODE_32BIT (2 << 12)
-#define NE16_FLAG_ROUND (1 << 11)
+#define NE16_FLAG_ROUNDING (1 << 11)
#define NE16_FLAG_STRIDE_2x2 (1 << 8)
#define NE16_FLAG_LINEAR_MODE (1 << 7)
#define NE16_FLAG_MODE_3x3 (0 << 5)
@@ -91,10 +90,26 @@
#define NE16_FLAG_MODE_BASIC (0 << 3)
#define NE16_FLAG_MODE16 (1 << 3)
+/* SHIFT */
+
+#define NE16_SHIFT_FLAG_NORM_BIAS (25)
+#define NE16_SHIFT_FLAG_NORM_SHIFT (24)
+#define NE16_SHIFT_FLAG_ROUNDING (11)
+
/* Masks */
-#define NE16_MASK_QUANT_FUNCTION (1 << 23)
-#define NE16_MASK_QUANT_MODE (3 << 21)
+#define NE16_MASK_FLAG_NORM_BIAS (0x1 << 25)
+#define NE16_MASK_FLAG_NORM_SHIFT (0x1 << 24)
+#define NE16_MASK_QUANT_FUNCTION (0x1 << 23)
+#define NE16_MASK_QUANT_MODE (0x3 << 21)
+#define NE16_MASK_SHIFT_AMOUNT (0x1f << 16)
+#define NE16_MASK_WEIGHT_OFFSET_MODE (0x1 << 15)
+#define NE16_MASK_NORM_MODE (0x3 << 12)
+#define NE16_MASK_FLAG_ROUNDING (0x1 << 11)
+#define NE16_MASK_FLAG_STRIDE_2x2 (0x1 << 8)
+#define NE16_MASK_FLAG_MODE (0x3 << 5)
+#define NE16_MASK_FLAG_MODE16 (0x1 << 3)
+#define NE16_MASK_FLAG_WEIGHT_BITS (0x7 << 0)
/* PADDING */
diff --git a/neureka/README.md b/neureka/README.md
new file mode 100644
index 0000000..9c83f4e
--- /dev/null
+++ b/neureka/README.md
@@ -0,0 +1,34 @@
+# Neureka
+
+## Docs
+
+Github repo [link](https://github.com/siracusa-soc/ne).
+
+## Implemented features
+
+- [x] Convolution w/ kernel shape 1x1
+- [x] Convolution w/ kernel shape 3x3
+- [x] Depthwise convolution w/ kernel shape 3x3
+- [ ] Normalization and quantization
+ - [x] With
+ - [x] Without
+ - [x] Relu (w/ and w/o)
+ - [x] Bias (w/ and w/o)
+ - [ ] Per-channel shift
+ - [x] Per-layer shift
+ - [ ] Rounding
+- [x] Input type
+ - [x] uint8
+ - [x] int8
+- [x] Output type
+ - [x] int8
+ - [x] uint8 (only w/ Relu)
+ - [x] int32
+- [ ] Scale type
+ - [x] uint8
+ - [ ] uint32
+- [x] Bias type
+ - [x] int32
+- [ ] Weight type
+ - [x] int8
+ - [ ] int2-7
diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
new file mode 100644
index 0000000..57136fd
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -0,0 +1,78 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_siracusa_bsp.h"
+#include <pmsis.h>
+
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \
+ (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \
+ NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NEUREKA_SIRACUSA_MAX_STALL (8)
+#define NEUREKA_SIRACUSA_EVENT (1 << 12)
+#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
+
+void neureka_siracusa_hci_setpriority_neureka() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+ NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+void neureka_siracusa_hci_setpriority_core() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+ ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+void neureka_siracusa_hci_reset_max_stall() {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+ ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
+ *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+ max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf) {
+ neureka_siracusa_hci_setpriority_neureka();
+ neureka_siracusa_hci_set_max_stall(conf->max_stall);
+}
+
+void neureka_siracusa_close() {
+ neureka_siracusa_hci_reset_max_stall();
+ neureka_siracusa_hci_setpriority_core();
+}
+
+void neureka_siracusa_event_wait_and_clear() {
+ eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT);
+}
+
+static const neureka_dev_t neureka_siracusa_dev = {
+ .hwpe_dev = (struct hwpe_dev_t){
+ .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}};
+
+const neureka_dev_t *neureka_siracusa_get_dev() {
+ return &neureka_siracusa_dev;
+}
diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h
new file mode 100644
index 0000000..be75a20
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.h
@@ -0,0 +1,67 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_siracusa_BSP_H__
+#define __NEUREKA_siracusa_BSP_H__
+
+#include "neureka.h"
+#include <stdint.h>
+
+/**
+ * neureka_siracusa_hci_setpriority_neureka
+ *
+ * Set HCI interconnect bus priority to prioritize neureka.
+ */
+void neureka_siracusa_hci_setpriority_neureka();
+
+/**
+ * neureka_siracusa_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void neureka_siracusa_hci_setpriority_core();
+
+/**
+ * neureka_siracusa_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: check whether this also disables maxstall or only resets it.
+ */
+void neureka_siracusa_hci_reset_max_stall();
+
+/**
+ * neureka_siracusa_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines how many cycles
+ * the HCI bus will stall the lower-priority master, i.e. neureka or core,
+ * before letting it do a transaction.
+ */
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall);
+
+typedef struct neureka_siracusa_conf_t {
+ int max_stall;
+} neureka_siracusa_conf_t;
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf);
+void neureka_siracusa_close();
+void neureka_siracusa_event_wait_and_clear();
+const neureka_dev_t *neureka_siracusa_get_dev();
+
+#endif // !__NEUREKA_siracusa_BSP_H__
diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h
new file mode 100644
index 0000000..37eeab0
--- /dev/null
+++ b/neureka/gvsoc/neureka_gvsoc.h
@@ -0,0 +1,54 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_GVSOC_H__
+#define __NEUREKA_GVSOC_H__
+
+#include "neureka.h"
+#include "neureka_task.h"
+
+#define NEUREKA_REG_GVSOC_LOG_LEVEL 24
+#define NEUREKA_REG_GVSOC_LOG_FORMAT 25
+
+typedef enum neureka_gvsoc_log_format_e {
+ NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0,
+ NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3
+} neureka_gvsoc_log_format_e;
+
+typedef enum neureka_gvsoc_log_level_e {
+ NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0,
+ NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1,
+ NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2,
+ NEUREKA_GVSOC_LOG_LEVEL_ALL = 3
+} neureka_gvsoc_log_level_e;
+
+static void neureka_gvsoc_log_activate(neureka_dev_t *dev,
+ neureka_gvsoc_log_level_e log_level,
+ neureka_gvsoc_log_format_e format) {
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level);
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format);
+}
+
+static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) {
+ hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL,
+ NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END);
+}
+
+#endif // __NEUREKA_GVSOC_H__
diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c
similarity index 56%
rename from neureka/inc/pulp_nnx_error_codes.h
rename to neureka/hal/neureka.c
index dc71575..dc829d9 100644
--- a/neureka/inc/pulp_nnx_error_codes.h
+++ b/neureka/hal/neureka.c
@@ -18,15 +18,20 @@
* SPDX-License-Identifier: Apache-2.0
*/
-#ifndef __NE16_ERROR_CODES_H__
-#define __NE16_ERROR_CODES_H__
+#include "neureka.h"
-typedef enum {
- success = 0,
- weightBitwidthOutOfBounds,
- unsupportedWeightOffsetMode,
- unsupportedFeatureBitwidth,
- dimensionMismatch
-} nnx_error_code;
+#define NEUREKA_STATUS_EMPTY (0x000)
+#define NEUREKA_STATUS_FULL (0x101)
-#endif // __NE16_ERROR_CODES_H__
\ No newline at end of file
+inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) {
+ uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
+ return (status & 0x1) + ((status >> 8) & 0x1);
+}
+
+inline int neureka_task_queue_empty(neureka_dev_t *dev) {
+ return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY;
+}
+
+inline int neureka_task_queue_full(neureka_dev_t *dev) {
+ return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL;
+}
diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h
similarity index 62%
rename from neureka/src/pulp_nnx_util.c
rename to neureka/hal/neureka.h
index daaaf2b..eae77a1 100644
--- a/neureka/src/pulp_nnx_util.c
+++ b/neureka/hal/neureka.h
@@ -18,13 +18,20 @@
* SPDX-License-Identifier: Apache-2.0
*/
-#include "pulp_nnx_util.h"
-#include "pulp_nnx_hal.h"
+#ifndef __NEUREKA_H__
+#define __NEUREKA_H__
-void nnx_activate_gvsoc_logging(int log_level) {
- NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level);
-}
+#include "hwpe.h"
+#include <stdint.h>
-void nnx_deactivate_gvsoc_logging() {
- NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0);
-}
+#define NEUREKA_TASK_QUEUE_SIZE (2)
+
+typedef struct neureka_dev_t {
+ hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} neureka_dev_t;
+
+int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
+int neureka_task_queue_empty(neureka_dev_t *dev);
+int neureka_task_queue_full(neureka_dev_t *dev);
+
+#endif // __NEUREKA_H__
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
new file mode 100644
index 0000000..501b2b9
--- /dev/null
+++ b/neureka/hal/neureka_task.c
@@ -0,0 +1,239 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_task.h"
+#include "neureka_task_defs.h"
+#include "pulp_nnx_util.h"
+
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width) {
+ uint32_t tile_padding = padding;
+ if (i_height > 0) {
+ tile_padding &= ~(0xf << 28);
+ }
+ if (i_width < n_width - 1) {
+ tile_padding &= ~(0xf << 24);
+ }
+ if (i_height < n_height - 1) {
+ tile_padding &= ~(0xf << 20);
+ }
+ if (i_width > 0) {
+ tile_padding &= ~(0xf << 16);
+ }
+ return tile_padding;
+}
+
+void neureka_task_init(neureka_task_t *task) {
+ *task = (neureka_task_t){.data = {0}};
+}
+
+void neureka_task_set_op_to_conv(neureka_task_t *task,
+ const uint8_t kernel_shape,
+ const uint8_t depthwise,
+ const uint8_t stride) {
+ task->depthwise = depthwise;
+ task->kernel_shape = kernel_shape;
+ task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
+ : NEUREKA_SUBTILE_OUTPUT_CHANNEL;
+ task->subtile_input_channel = kernel_shape == 3
+ ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
+ : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1;
+
+ const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1
+ : depthwise == 1 ? NEUREKA_FLAG_MODE_3x3_DW
+ : NEUREKA_FLAG_MODE_3x3;
+
+ task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE);
+ task->data.cfg.conf0 |= flag_mode;
+}
+
+void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits,
+ const uint8_t weight_bits) {
+ neureka_quant_mode_e quantMode;
+ if (output_bits == 8) {
+ quantMode = quantMode8Bit;
+ } else {
+ quantMode = quantMode32Bit;
+ }
+
+ task->qw = weight_bits;
+ task->data.cfg.conf0 &=
+ ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS);
+ task->data.cfg.conf0 |= quantMode | (weight_bits - 1);
+}
+
+void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant,
+ neureka_norm_t norm) {
+ task->data.cfg.conf0 &=
+ ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT |
+ NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS |
+ NEUREKA_MASK_FLAG_NORM_SHIFT);
+ task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function |
+ (quant.shift_amount << 16) | norm.mode |
+ norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
+ norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT;
+}
+
+void neureka_task_set_weight_offset(
+ neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset) {
+ task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE;
+ task->data.cfg.conf0 |= weight_offset_mode;
+ task->data.cfg.weight_offset_factor = weight_offset;
+}
+
+void neureka_task_set_input_signed(neureka_task_t *task) {
+ task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED;
+}
+
+void neureka_task_set_input_unsigned(neureka_task_t *task) {
+ task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED;
+}
+
+void neureka_task_set_weight_source(neureka_task_t *task,
+ neureka_weight_source_e weight_source) {
+ task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE;
+ task->data.cfg.conf0 |= weight_source;
+}
+
+/** neureka_pad_ptr
+ *
+ * Calculate the pointer as if it pointed to the start
+ * of the padded data rather than the unpadded data.
+ * Necessary for the input pointer when the input is padded.
+ */
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left) {
+ return ptr - (padding_top * width + padding_left) * width_stride;
+}
+
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+ uint32_t w_in, uint32_t w_in_stride,
+ uint8_t padding_top, uint8_t padding_left,
+ uint32_t output_ptr, uint32_t weights_ptr,
+ uint32_t scale_ptr, uint32_t shift_ptr,
+ uint32_t bias_ptr) {
+ task->data.infeat_ptr =
+ neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
+ task->data.outfeat_ptr = output_ptr;
+ task->data.weights_ptr = weights_ptr;
+ task->data.scale_ptr = scale_ptr;
+ task->data.scale_shift_ptr = shift_ptr;
+ task->data.scale_bias_ptr = bias_ptr;
+}
+
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
+ const uint32_t w_in_stride,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride) {
+ const uint32_t num_k_in =
+ nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
+
+ const neureka_stride_t input_stride = {
+ .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0};
+ task->data.cfg.input_stride = input_stride;
+
+ const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES,
+ .d1 = w_out_stride,
+ .d2 = h_out_stride};
+ task->data.cfg.output_stride = output_stride;
+
+ task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES;
+ if (task->kernel_shape == 1) { // 1x1
+ task->data.cfg.weights_stride.d1 =
+ NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in;
+ } else if (!task->depthwise) { // 3x3
+ task->data.cfg.weights_stride.d1 =
+ NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in;
+ } else { // 3x3 depthwise
+ task->data.cfg.weights_stride.d1 = 0;
+ }
+ task->data.cfg.weights_stride.d2 = 0;
+}
+
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out,
+ const uint8_t padding_bottom,
+ const uint8_t padding_right) {
+ const uint16_t num_Ko =
+ nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel);
+ const uint16_t num_Ki =
+ nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel);
+ const uint16_t num_Ho =
+ nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t num_Wo =
+ nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+
+ const uint16_t rem_Ko =
+ nnx_calculate_last_tile_size(k_out, task->subtile_output_channel);
+ const uint16_t rem_Ki =
+ nnx_calculate_last_tile_size(k_in, task->subtile_input_channel);
+ const uint16_t rem_Ho =
+ nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT);
+ const uint16_t rem_Wo =
+ nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH);
+ const uint16_t rem_Hi =
+ (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom;
+ const uint16_t rem_Wi =
+ (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right;
+
+ const neureka_subtile_t subtile = {
+ .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki),
+ .HoWo = nnx_concat_half(num_Ho, num_Wo)},
+ .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki),
+ .HoWo = nnx_concat_half(rem_Ho, rem_Wo),
+ .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}};
+ task->data.cfg.subtile = subtile;
+}
+
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value) {
+ task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) |
+ ((bottom & 0xf) << 20) | ((left & 0xf) << 16) |
+ (value & 0xff);
+}
+
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left) {
+ task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
+ ((bottom & 0xff) << 8) | ((left & 0xff) << 0);
+}
+
+void neureka_task_set_dims(
+ neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_in_stride, const uint32_t w_in_stride,
+ const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
+ const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint8_t padding_right, const uint8_t padding_left) {
+ neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
+ w_out_stride);
+ neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+ padding_right);
+ neureka_task_set_padding(task, padding_top, padding_bottom, padding_left,
+ padding_right, 0);
+}
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
new file mode 100644
index 0000000..2d06468
--- /dev/null
+++ b/neureka/hal/neureka_task.h
@@ -0,0 +1,187 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_TASK_H__
+#define __NEUREKA_TASK_H__
+
+#include "neureka_task_defs.h"
+#include <stdint.h>
+
+typedef enum neureka_task_flag_e {
+ neurekaTaskFlagFalse = 0,
+ neurekaTaskFlagTrue = 1
+} neureka_task_flag_e;
+
+typedef enum neureka_weight_source_e {
+ neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM,
+ neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM
+} neureka_weight_source_e;
+
+typedef enum neureka_weight_offset_mode_e {
+ weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
+ weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
+} neureka_weight_offset_mode_e;
+
+typedef enum {
+ normMode8Bit = NEUREKA_NORM_MODE_8BIT,
+ normMode32Bit = NEUREKA_NORM_MODE_32BIT
+} neureka_norm_mode_e;
+
+typedef struct neureka_norm_t {
+ neureka_norm_mode_e mode;
+ int flag_bias;
+ int flag_shift;
+} neureka_norm_t;
+
+typedef enum neureka_quant_mode_e {
+ quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
+ quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
+} neureka_quant_mode_e;
+
+typedef enum neureka_quant_function_e {
+ quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
+ quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
+} neureka_quant_function_e;
+
+typedef struct neureka_quant_t {
+ // Shift amount must be in range 0x00-0x1F
+ unsigned shift_amount;
+ neureka_quant_function_e function;
+ int flag_rounding;
+} neureka_quant_t;
+
+typedef struct neureka_stride_t {
+ uint32_t d0;
+ uint32_t d1;
+ uint32_t d2;
+} neureka_stride_t;
+
+typedef struct neureka_subtile_remainder_t {
+ uint32_t KoKi;
+ uint32_t HoWo;
+ uint32_t HiWi;
+} neureka_subtile_remainder_t;
+
+typedef struct neureka_subtile_number_t {
+ uint32_t KoKi;
+ uint32_t HoWo;
+} neureka_subtile_number_t;
+
+typedef struct neureka_subtile_t {
+ neureka_subtile_remainder_t remainder;
+ neureka_subtile_number_t number;
+} neureka_subtile_t;
+
+typedef struct neureka_cfg_t {
+ neureka_stride_t input_stride;
+ neureka_stride_t output_stride;
+ neureka_stride_t weights_stride;
+ neureka_subtile_t subtile;
+ uint32_t padding;
+ uint32_t weight_offset_factor;
+ uint32_t filter_mask;
+ uint32_t conf0;
+} neureka_cfg_t;
+
+typedef struct neureka_task_data_t {
+ uint32_t weights_ptr;
+ uint32_t infeat_ptr;
+ uint32_t outfeat_ptr;
+ uint32_t scale_ptr;
+ uint32_t scale_shift_ptr;
+ uint32_t scale_bias_ptr;
+ neureka_cfg_t cfg;
+} neureka_task_data_t;
+
+typedef struct neureka_task_t {
+ neureka_task_data_t data;
+ uint8_t qw;
+ uint8_t subtile_output_channel;
+ uint8_t subtile_input_channel;
+ uint8_t kernel_shape;
+ uint8_t depthwise;
+ uint8_t id;
+} neureka_task_t;
+
+void neureka_task_init(neureka_task_t *task);
+void neureka_task_set_op_to_conv(neureka_task_t *task,
+ const uint8_t kernel_shape,
+ const uint8_t depthwise, const uint8_t stride);
+void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
+ const uint8_t output_bits,
+ const uint8_t weight_bits);
+void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant,
+ neureka_norm_t norm);
+void neureka_task_set_weight_offset(
+ neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode,
+ const int32_t weight_offset);
+void neureka_task_set_input_signed(neureka_task_t *task);
+void neureka_task_set_input_unsigned(neureka_task_t *task);
+void neureka_task_set_weight_source(neureka_task_t *task,
+ neureka_weight_source_e weight_source);
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+ uint32_t i_width, uint32_t n_height,
+ uint32_t n_width);
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+ const uint32_t width_stride, const uint8_t padding_top,
+ const uint8_t padding_left);
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+ uint32_t w_in, uint32_t w_in_stride,
+ uint8_t padding_top, uint8_t padding_left,
+ uint32_t output_ptr, uint32_t weights_ptr,
+ uint32_t scale_ptr, uint32_t shift_ptr,
+ uint32_t bias_ptr);
+/** neureka_task_set_strides
+ *
+ * All the stride variables are strides between consecutive elements along
+ * that dimension, expressed in bytes. There is no stride variable for the channel
+ * dimension because the N-EUREKA requires the channels to be contiguous.
+ */
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_in_stride,
+ const uint32_t w_in_stride,
+ const uint32_t h_out_stride,
+ const uint32_t w_out_stride);
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out,
+ const uint8_t padding_bottom,
+ const uint8_t padding_right);
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+ const uint8_t bottom, const uint8_t left,
+ const uint8_t right, const uint8_t value);
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+ const uint8_t right, const uint8_t bottom,
+ const uint8_t left);
+/** neureka_task_set_dims
+ *
+ * All the stride variables are strides between consecutive elements along
+ * that dimension, expressed in bytes. There is no stride variable for the channel
+ * dimension because the N-EUREKA requires the channels to be contiguous.
+ */
+void neureka_task_set_dims(
+ neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_in_stride, const uint32_t w_in_stride,
+ const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+ const uint32_t h_out_stride, const uint32_t w_out_stride,
+ const uint8_t padding_top, const uint8_t padding_bottom,
+ const uint8_t padding_right, const uint8_t padding_left);
+
+#endif // !__NEUREKA_TASK_H__
diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h
new file mode 100644
index 0000000..fa08289
--- /dev/null
+++ b/neureka/hal/neureka_task_defs.h
@@ -0,0 +1,124 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_DEFS_H__
+#define __NEUREKA_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6)
+#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6)
+#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32)
+
+#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8)
+#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8)
+#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28)
+
+#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6)
+#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6)
+#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32)
+
+#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32)
+#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32)
+
+/* TASK REGISTERS */
+
+// job configuration
+#define NEUREKA_REG_WEIGHTS_PTR 0
+#define NEUREKA_REG_INFEAT_PTR 1
+#define NEUREKA_REG_OUTFEAT_PTR 2
+#define NEUREKA_REG_SCALE_PTR 3
+#define NEUREKA_REG_SCALE_SHIFT_PTR 4
+#define NEUREKA_REG_SCALE_BIAS_PTR 5
+#define NEUREKA_REG_INFEAT_D0_STRIDE 6
+#define NEUREKA_REG_INFEAT_D1_STRIDE 7
+#define NEUREKA_REG_INFEAT_D2_STRIDE 8
+#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9
+#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10
+#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11
+#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12
+#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13
+#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14
+#define NEUREKA_REG_SUBTILE_REMAINDER_0 15
+#define NEUREKA_REG_SUBTILE_REMAINDER_1 16
+#define NEUREKA_REG_SUBTILE_REMAINDER_2 17
+#define NEUREKA_REG_SUBTILE_NUMBER_0 18
+#define NEUREKA_REG_SUBTILE_NUMBER_1 19
+#define NEUREKA_REG_PADDING 20
+#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21
+#define NEUREKA_REG_FILTER_MASKING 22
+#define NEUREKA_REG_CONF0 23
+
+/* SHIFT */
+
+#define NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26)
+#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
+#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
+#define NEUREKA_SHIFT_QUANT_SHIFT (16)
+
+/* CONF0 FLAGS */
+
+#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26)
+#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
+#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
+#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
+#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \
+ (1 << 15) // Unimplemented in gvsoc
+#define NEUREKA_FLAG_STREAMIN (1 << 14)
+#define NEUREKA_NORM_MODE_8BIT (0 << 12)
+#define NEUREKA_NORM_MODE_32BIT (2 << 12)
+#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
+#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9)
+#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9)
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested
+#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
+#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
+#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
+#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
+
+/* Masks */
+
+#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26)
+#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25)
+#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24)
+#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23)
+#define NEUREKA_MASK_QUANT_MODE (0x3 << 21)
+#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16)
+#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15)
+#define NEUREKA_MASK_NORM_MODE (0x3 << 12)
+#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10)
+#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9)
+#define NEUREKA_MASK_FLAG_MODE (0x3 << 5)
+#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0)
+
+/* PADDING */
+
+#define NEUREKA_DONT_PAD (0)
+#define NEUREKA_MAX_PAD (2)
+
+/* NORM */
+#define NEUREKA_NORM_MAX_LEN (32)
+
+#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h
deleted file mode 100644
index e8ecba5..0000000
--- a/neureka/inc/pulp_nnx_defs.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_DEFS_H__
-#define __NEUREKA_DEFS_H__
-
-/* ARHITECTURE */
-
-#define NEUREKA_FILTER_SIZE (6)
-#define NEUREKA_FILTER_BUFFER_SIZE (8)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
-#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
-#define NEUREKA_CONTEXT_SIZE (2)
-#define NEUREKA_WEIGHT_BANDWIDTH (256)
-
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
-#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
-
-/* REGISTER MAP */
-
-#define NEUREKA_EVT0 12
-#define NEUREKA_EVT1 13
-#define NEUREKA_BASE_ADDR 0x00201000
-#define WEIGHT_MEM_BASE 0x10400000
-#define SRAM_OFFSET 0x00400000
-#define MRAM_OFFSET 0x00000000
-
-// Cluster
-#define CLUSTER_CTRL_BASE_ADDR 0x00200000
-#define CLUSTER_CTRL_HWPE_OFFS 0x18
-#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800
-
-/* REGISTER OFFSETS */
-
-// commands
-#define NEUREKA_TRIGGER 0x00
-#define NEUREKA_ACQUIRE 0x04
-#define NEUREKA_FINISHED 0x08
-#define NEUREKA_STATUS 0x0C
-#define NEUREKA_RUNNING_JOB 0x10
-#define NEUREKA_SOFT_CLEAR 0x14
-#define NEUREKA_SWSYNC 0x18
-#define NEUREKA_URISCY_IMEM 0x1C
-
-// job configuration
-#define NEUREKA_REGISTER_OFFSET 0x20
-
-#define NEUREKA_REG_WEIGHTS_PTR 0x00
-#define NEUREKA_REG_INFEAT_PTR 0x04
-#define NEUREKA_REG_OUTFEAT_PTR 0x08
-#define NEUREKA_REG_SCALE_PTR 0x0C
-#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10
-#define NEUREKA_REG_SCALE_BIAS_PTR 0x14
-#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18
-#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C
-#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20
-#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24
-#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28
-#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C
-#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30
-#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34
-#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38
-#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C
-#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40
-#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44
-#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48
-#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C
-#define NEUREKA_REG_PADDING 0x50
-#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54
-#define NEUREKA_REG_FILTER_MASKING 0x58
-#define NEUREKA_REG_CONF0 0x5C
-
-// Simulation only
-#define NEUREKA_REG_GVSOC_TRACE 0x60
-
-/* SHIFT */
-
-#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
-#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
-#define NEUREKA_SHIFT_QUANT_SHIFT (16)
-#define NEUREKA_SHIFT_ROUNDING (11)
-
-/* CONF0 FLAGS */
-
-#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
-#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
-#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
-#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
-#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
-#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
-#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
-// conf0[20:16] - quantization shift amount
-#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
-#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
-#define NEUREKA_FLAG_STREAMIN (1 << 14)
-#define NEUREKA_NORM_MODE_8BIT (0 << 12)
-#define NEUREKA_NORM_MODE_16BIT (1 << 12)
-#define NEUREKA_NORM_MODE_32BIT (2 << 12)
-#define NEUREKA_FLAG_ROUND (1 << 11)
-#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
-#define NEUREKA_FLAG_USE_WMEM (1 << 9)
-#define NEUREKA_FLAG_USE_TCDM (0 << 9)
-#define NEUREKA_FLAG_STRIDED_MODE (1 << 8)
-#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
-#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
-#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
-#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
-#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
-#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
-#define NEUREKA_FLAG_MODE16 (1 << 3)
-
-/* Masks */
-
-#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
-#define NEUREKA_MASK_QUANT_MODE (3 << 21)
-
-/* Miscellaneous */
-
-// Padding
-#define MAX_PAD (0xf)
-
-// Normalization
-#define NEUREKA_NORM_MAX_LEN (32)
-#define NO_NORM(length) \
- { \
- .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \
- .length = length, .mode = normMode32Bit \
- }
-
-// Quantization
-#define NO_QUANT \
- { \
- .shift_amount = 0, .mode = quantMode32Bit, \
- .function = quantFunctionIdentity \
- }
-
-// GVSOC trace levels
-#define NEUREKA_TRACE_LEVEL_JOB_START_END 0
-#define NEUREKA_TRACE_LEVEL_CONFIG 1
-#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2
-#define NEUREKA_TRACE_LEVEL_ALL 3
-
-// null
-#define NEUREKA_NULL ((void *)0)
-#define NEUREKA_STATUS_FULL (0x101)
-
-#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h
deleted file mode 100644
index 40bcec0..0000000
--- a/neureka/inc/pulp_nnx_hal.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Luka Macan
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __NEUREKA_H__
-#define __NEUREKA_H__
-
-#include <stdint.h>
-
-#include "pulp_nnx_defs.h"
-#include "pulp_nnx_error_codes.h"
-
-#define NEUREKA_CG_ENABLE() \
- *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \
- CLUSTER_CTRL_HWPE_CG_EN_MASK
-#define NEUREKA_CG_DISABLE() \
- *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \
- ~CLUSTER_CTRL_HWPE_CG_EN_MASK
-
-#define NEUREKA_WRITE(offset, value) \
- *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value)
-#define NEUREKA_WRITE_BE(offset, value, be) \
- *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value)
-#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset))
-
-#define NEUREKA_WRITE_IO_REG(offset, value) \
- NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value))
-#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \
- NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be))
-#define NEUREKA_READ_IO_REG(offset) \
- NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset))
-
-#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0)
-#define NEUREKA_BARRIER() \
- do { \
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \
- } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BUSYWAIT() \
- do { \
- } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0)
-#define NEUREKA_BARRIER_ACQUIRE(job_id) \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- while (job_id < 0) { \
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- };
-#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- while (job_id < 0) { \
- job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \
- };
-
-#define DIVNCEIL(A, B) (((A - 1) / B) + 1)
-#define REMAINDER(A, B) (((A - 1) % B) + 1)
-#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff))
-
-#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE
-
-#define FLAG_USED (1)
-#define FLAG_UNUSED (0)
-
-typedef enum {
- weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
- weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
-} nnx_weight_offset_mode_e;
-
-typedef struct {
- void *data;
- uint16_t height;
- uint16_t width;
- uint16_t depth;
- uint16_t n_weights;
- uint32_t bitwidth;
- int32_t offset_factor;
- nnx_weight_offset_mode_e offset_mode;
-} nnx_weights_t;
-
-typedef enum {
- featureBitwidth8Bit = 8,
- featureBitwidth16Bit = 16,
- featureBitwidth32Bit = 32
-} nnx_feature_bitwidth_e;
-
-typedef struct {
- void *data;
- uint16_t height;
- uint16_t width;
- uint16_t depth;
- nnx_feature_bitwidth_e bitwidth;
-} nnx_feature_t;
-
-typedef enum {
- normMode8Bit = NEUREKA_NORM_MODE_8BIT,
- normMode16Bit = NEUREKA_NORM_MODE_16BIT,
- normMode32Bit = NEUREKA_NORM_MODE_32BIT
-} nnx_norm_mode_e;
-
-typedef struct {
- nnx_norm_mode_e mode;
- int flag_bias;
- int flag_shift;
-} nnx_norm_t;
-
-typedef enum {
- quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
- quantMode16Bit = NEUREKA_QUANT_MODE_16BIT,
- quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
-} nnx_quant_mode_e;
-
-typedef enum {
- quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
- quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
-} nnx_quant_function_e;
-
-// TODO: add rounding to quant. Should also be an enum? Best boolean...
-typedef struct {
- // Shift amount must be in range 0x00-0x1F
- unsigned shift_amount;
- nnx_quant_mode_e mode;
- nnx_quant_function_e function;
- int flag_rounding;
-} nnx_quant_t;
-
-typedef struct {
- uint32_t d0;
- uint32_t d1;
- uint32_t d2;
-} nnx_stride_t;
-
-typedef struct {
- uint32_t KoKi;
- uint32_t HoWo;
- uint32_t HiWi;
-} nnx_subtile_remainder_t;
-
-typedef struct {
- uint32_t KoKi;
- uint32_t HoWo;
-} nnx_subtile_number_t;
-
-typedef struct {
- nnx_subtile_remainder_t remainder;
- nnx_subtile_number_t number;
-} nnx_subtile_t;
-
-typedef struct {
- nnx_stride_t input_stride;
- nnx_stride_t output_stride;
- nnx_stride_t weights_stride;
- nnx_subtile_t subtile;
- uint32_t padding;
- uint32_t weight_offset_factor;
- uint32_t filter_mask;
- uint32_t conf0;
-} nnx_cfg_t;
-
-typedef struct {
- uint32_t weights_ptr;
- uint32_t infeat_ptr;
- uint32_t outfeat_ptr;
- uint32_t scale_ptr;
- uint32_t scale_shift_ptr;
- uint32_t scale_bias_ptr;
- nnx_cfg_t cfg;
-} nnx_task_t;
-
-int nnx_job_id();
-int nnx_empty();
-int nnx_full();
-void nnx_soft_clear();
-int nnx_acquire();
-void nnx_offload(nnx_task_t *task);
-void nnx_offload_ptr(nnx_task_t *task);
-void nnx_run_async();
-void nnx_run_blocking();
-void nnx_commit();
-void nnx_wait_empty();
-void nnx_wait_not_full();
-void nnx_wait_on_id(int id);
-void nnx_busywait();
-
-void nnx_task_init(nnx_task_t *task);
-int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom,
- uint32_t left, uint16_t value);
-int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant);
-void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom,
- uint8_t left);
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights,
- nnx_feature_t input, nnx_feature_t output);
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out,
- int k_out, int k_in);
-
-#endif /* __NEUREKA_H__ */
diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h
deleted file mode 100644
index f29ff3e..0000000
--- a/neureka/inc/pulp_nnx_util.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Luka Macan
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#ifndef __PULP_NNX_UTIL__
-#define __PULP_NNX_UTIL__
-
-void nnx_activate_gvsoc_logging(int use_dec);
-void nnx_deactivate_gvsoc_logging();
-
-#endif /* __PULP_NNX_UTIL__ */
diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c
deleted file mode 100644
index 1d99691..0000000
--- a/neureka/src/pulp_nnx_hal.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "pulp_nnx_hal.h"
-#include "pmsis.h"
-
-static int qw, weight_d0_stride, outbytes;
-
-// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and
-// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise
-// the compiler is not able to correctly factorize the NEUREKA base in case
-// several accesses are done, ending up with twice more code
-
-// __builtin_pulp_OffsetedX not defined - needs further investigation... (too
-// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK...
-
-int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); }
-
-int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; }
-
-int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); }
-
-void nnx_soft_clear() {
- NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0);
- for (volatile int i = 0; i < 10; i++)
- ;
-}
-
-int nnx_acquire() {
- int job_id = -1;
- NEUREKA_BARRIER_ACQUIRE(job_id);
- return job_id;
-}
-
-void nnx_offload(nnx_task_t *task) {
- int *task_data = (int *)task;
- for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) {
- NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
- }
-}
-
-void nnx_offload_ptr(nnx_task_t *task) {
- int *task_data = (int *)task;
- for (int i = 0; i < 6; ++i) {
- NEUREKA_WRITE_IO_REG(i * 4, task_data[i]);
- }
-}
-
-void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); }
-
-void nnx_run_blocking() {
- nnx_run_async();
- nnx_wait_empty();
-}
-
-void nnx_commit() {
- NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger
-}
-
-void nnx_busywait() { NEUREKA_BUSYWAIT(); }
-
-void nnx_wait_empty() {
- while (!nnx_empty())
- NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_not_full() {
- while (nnx_full())
- NEUREKA_BARRIER_NOSTATUS();
-}
-
-void nnx_wait_on_id(const int id) {
- while (nnx_job_id() <= id) {
- eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0);
- };
-}
-
-void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); }
-
-int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right,
- const uint32_t bottom, const uint32_t left,
- const uint16_t value) {
- uint32_t padding = 0;
- uint32_t flags = 0;
-
- if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) {
- return 1;
- }
-
- cfg->padding =
- (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value;
-
- return 0;
-}
-
-int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm,
- const nnx_quant_t quant) {
- if (quant.shift_amount > 31) {
- printf("ERROR! quant.shift_amount > 31\n");
- return 1;
- }
-
- if (quant.mode == quantMode16Bit) {
- printf("ERROR! quant.mode == quantMode16Bit\n");
- return 1;
- }
-
- BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
- (quant.shift_amount << 16) |
- quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
- norm.mode |
- norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
- norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT);
-
- return 0;
-}
-
-void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right,
- const uint8_t bottom, const uint8_t left) {
- cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) |
- ((uint32_t)bottom << 8) | ((uint32_t)left << 0);
-}
-
-nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho;
- const int rem_Wi = rem_Wo;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {
- .d0 = k_in,
- .d1 = k_in * w_out,
- .d2 = k_in * 3 * 3 // copying arpan
- };
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = weight_d0_stride * qw,
- .d1 = weight_d0_stride * qw * num_Ki,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height != output.height || input.width != output.width ||
- input.depth != weights.depth || output.depth != weights.n_weights) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT);
- const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho + 2;
- const int rem_Wi = rem_Wo + 2;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {.d0 = k_in,
- .d1 = k_in * (w_out + 2),
- .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE *
- NEUREKA_FILTER_BUFFER_SIZE};
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3,
- .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height - 2 != output.height || input.width - 2 != output.width ||
- input.depth != weights.depth || output.depth != weights.n_weights) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out,
- const int w_out, const int k_out,
- const int k_in) {
-
- const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int num_Ki = num_Ko;
- const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
- const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
-
- const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3);
- const int rem_Ki = rem_Ko;
- const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
- const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
- const int rem_Hi = rem_Ho + 2;
- const int rem_Wi = rem_Wo + 2;
-
- const nnx_subtile_t subtile = {
- .number = {.KoKi = concat_half(num_Ko, num_Ki),
- .HoWo = concat_half(num_Ho, num_Wo)},
- .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
- .HoWo = concat_half(rem_Ho, rem_Wo),
- .HiWi = concat_half(rem_Hi, rem_Wi)}};
- cfg->subtile = subtile;
-
- // Strides
- const nnx_stride_t input_stride = {
- .d0 = k_out,
- .d1 = k_out * (w_out + 2),
- .d2 = 0 // Unused
- };
- cfg->input_stride = input_stride;
-
- const nnx_stride_t output_stride = {
- .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out};
- cfg->output_stride = output_stride;
-
- const nnx_stride_t weights_stride = {
- .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride,
- .d1 = 0,
- .d2 = 0 // Unused
- };
- cfg->weights_stride = weights_stride;
-
- return 0;
-}
-
-nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights,
- const nnx_feature_t input,
- const nnx_feature_t output) {
- if (weights.bitwidth < 2 || weights.bitwidth > 8) {
- return weightBitwidthOutOfBounds;
- }
-
- if (weights.offset_mode != weightOffsetModeLayerWise) {
- // Currently only layer-wise mode is used.
- return unsupportedWeightOffsetMode;
- }
-
- if ((input.bitwidth != featureBitwidth8Bit &&
- input.bitwidth != featureBitwidth16Bit) ||
- (output.bitwidth != featureBitwidth8Bit &&
- output.bitwidth != featureBitwidth32Bit)) {
- return unsupportedFeatureBitwidth;
- }
-
- if (input.height - 2 != output.height || input.width - 2 != output.width ||
- input.depth != output.depth) {
- return dimensionMismatch;
- }
-
- const int mode16 =
- input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
-
- BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 |
- (weights.bitwidth - 1));
-
- // Global static variables needed by update_dims
- outbytes = output.bitwidth / 8;
- weight_d0_stride =
- mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8;
- qw = weights.bitwidth;
-
- nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth,
- input.depth);
-
- // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth);
- cfg->weight_offset_factor = weights.offset_factor;
-
- return 0;
-}
diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c
index 7ab0e99..f9799fc 100644
--- a/src/pulp_nnx_ne16.c
+++ b/src/pulp_nnx_ne16.c
@@ -79,25 +79,20 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i,
uint32_t size_j, uint32_t size_k,
uint32_t stride_j, uint32_t stride_k,
uint32_t overlap_i, uint32_t overlap_j,
- uint32_t offset_i, uint32_t offset_j,
- uint8_t data_size) {
- return ptr +
- (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k *
- data_size / 8 +
- (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8;
+ uint32_t offset_i, uint32_t offset_j) {
+ return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j +
+ (j * (size_j - overlap_j) - offset_j) * stride_k;
}
-void ne16_nnx_dispatch_stride2x2(
- ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
- const uint32_t w_in_stride, const uint32_t k_in_stride,
- const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
- const uint32_t w_out_stride, const uint32_t k_out_stride,
- const uint8_t h_ker, const uint8_t w_ker) {
+void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
+ const uint32_t w_in, const uint32_t k_in,
+ const uint32_t h_out, const uint32_t w_out,
+ const uint32_t k_out, const uint8_t h_ker,
+ const uint8_t w_ker) {
const uint8_t stride = 2;
- const uint8_t bits = 8;
- const uint32_t n_h = divnceil(h_out, stride);
- const uint32_t n_w = divnceil(w_out, stride);
+ const uint32_t n_h = nnx_calculate_number_of_tiles(h_out, stride);
+ const uint32_t n_w = nnx_calculate_number_of_tiles(w_out, stride);
const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0;
const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0;
const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0;
@@ -109,15 +104,15 @@ void ne16_nnx_dispatch_stride2x2(
for (int i = 0; i < n_h; i++) {
for (int j = 0; j < n_w; j++) {
- task->data.infeat_ptr =
- _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
- w_in_stride, k_in_stride, h_ker - stride,
- w_ker - stride, i == 0 ? 0 : input_height_offset,
- j == 0 ? 0 : input_width_offset, bits);
- task->data.outfeat_ptr =
- _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride,
- k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset,
- j == 0 ? 0 : output_width_offset, bits);
+ task->data.infeat_ptr = _get_tile_ptr(
+ input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in,
+ task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0,
+ h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset,
+ j == 0 ? 0 : input_width_offset);
+ task->data.outfeat_ptr = _get_tile_ptr(
+ output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1,
+ task->data.cfg.output_stride.d1 << 1, 0, 0,
+ i == 0 ? 0 : output_height_offset, j == 0 ? 0 : output_width_offset);
task->data.cfg.padding =
ne16_get_tile_padding(tile_padding, i, j, n_h, n_w);
diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c
new file mode 100644
index 0000000..0abb845
--- /dev/null
+++ b/src/pulp_nnx_neureka.c
@@ -0,0 +1,76 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "pulp_nnx_neureka.h"
+#include "hwpe.h"
+#include "neureka.h"
+#include "pulp_nnx_util.h"
+#include <pmsis.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf) {
+ neureka_siracusa_open(conf);
+ hwpe_soft_clear(&dev->hwpe_dev);
+}
+
+void neureka_nnx_term(neureka_dev_t *dev) {
+ hwpe_soft_clear(&dev->hwpe_dev);
+ neureka_siracusa_close();
+}
+
+int neureka_nnx_dispatch_check(neureka_dev_t *dev) {
+ return !neureka_task_queue_full(dev);
+}
+
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev) {
+ while (!neureka_nnx_dispatch_check(dev)) {
+ neureka_siracusa_event_wait_and_clear();
+ }
+}
+
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) {
+ if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) {
+ return 1;
+ }
+ hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data,
+ (int)(sizeof(neureka_task_data_t) / 4));
+ hwpe_task_queue_release_and_run(&dev->hwpe_dev);
+ return 0;
+}
+
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) {
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ // GVSOC model has a broken running_id so resolve_check
+ // conservatively looks if the task queue is empty.
+ return neureka_task_queue_empty(dev);
+#else
+ uint8_t prev_task_id = task->id - 1;
+ return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id ||
+ (hwpe_last_task_id(&dev->hwpe_dev) == task->id &&
+ !neureka_task_queue_empty(dev)));
+#endif
+}
+
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) {
+ while (!neureka_nnx_resolve_check(dev, task)) {
+ neureka_siracusa_event_wait_and_clear();
+ }
+}
diff --git a/test/.isort.cfg b/test/.isort.cfg
new file mode 100644
index 0000000..127bf37
--- /dev/null
+++ b/test/.isort.cfg
@@ -0,0 +1,4 @@
+[settings]
+profile=black
+line_length=88
+skip_gitignore=true
diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py
index 5abb204..07dc597 100644
--- a/test/HeaderWriter.py
+++ b/test/HeaderWriter.py
@@ -48,8 +48,9 @@ def define(self, name, expr):
if isinstance(expr, str):
expr = f'"{expr}"'
elif isinstance(expr, bool):
- expr = int(expr)
- expr = f"({expr})"
+ expr = f"({int(expr)})"
+ else:
+ expr = f"({expr})"
return f"#define {name.upper()} {expr}\n"
def vector_size(self, data):
@@ -158,7 +159,7 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None):
if golden is not None:
render += self.render_vector(
- "golden_" + name, "PI_L1 " + _type, size, init=golden
+ "golden_" + name, "PI_L2 " + _type, size, init=golden
)
render += self.check(name)
diff --git a/test/Ne16.py b/test/Ne16.py
deleted file mode 100644
index 6de5ab5..0000000
--- a/test/Ne16.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Luka Macan
-#
-# Copyright 2023 ETH Zurich and University of Bologna
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import numpy.typing as npt
-from TestClasses import IntegerType
-
-
-class Ne16:
- ACCUMULATOR_TYPE = IntegerType(name="int32")
-
- _CIN_SUBTILE = 16
-
- @staticmethod
- def weight_unroll(
- weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
- ) -> npt.NDArray[np.uint8]:
- """Unroll weight into expected memory format
-
- Expected weight shape is (Cout, Cin, H, W).
- The output shape is: (Cout, Cin_major, Bits, H x W, Cin_minor_bytes),
- where Cin_major is the ceil(Cin / CIN_SUBTILE) and Cin_minor has to be padded with 0 to CIN_SUBTILE.
- """
- if depthwise:
- weight = weight.transpose(1, 0, 2, 3) # Swap Cout and Cin
-
- Cout, Cin, H, W = weight.shape
-
- # Pad Cin to be divisible with CIN_SUBTILE
- if Cin % Ne16._CIN_SUBTILE != 0:
- Cin_pad = Ne16._CIN_SUBTILE - Cin % Ne16._CIN_SUBTILE
- weight = np.pad(
- weight,
- ((0, 0), (0, Cin_pad), (0, 0), (0, 0)),
- "constant",
- constant_values=0,
- )
-
- # Reshape into (Cout, Cin_major, Cin_minor, Flattened spatial, 1)
- # The 1 at the end is required by the unpacking
- Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE))
- Cin_minor = Ne16._CIN_SUBTILE
- weight = weight.reshape(Cout, Cin_major, Cin_minor, H * W, 1)
-
- # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
- # (Cout, Cin_major, Cin_minor, Flattened spatial, Bits)
- weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
-
- # Shuffle bits so that the final shape is:
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor)
- weight = weight.transpose(0, 1, 4, 3, 2)
-
- # Prepare for packing
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes, 8)
- Cin_minor_bytes = int(np.ceil(Cin_minor / 8))
- weight = np.stack(np.split(weight, Cin_minor_bytes, axis=-1), axis=-2)
-
- # Pack
- # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes)
- weight = np.packbits(weight, axis=-1, bitorder="little")
-
- return weight.flatten()
-
- @staticmethod
- def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: int):
- """Reverse of weight_roll"""
- Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE))
- Cin_minor = Ne16._CIN_SUBTILE
- Cin_minor_bytes = int(np.ceil(Cin_minor / 8))
-
- weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1)
- weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
- weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor)
- weight = weight.transpose(0, 1, 4, 3, 2)
- weight = np.packbits(weight, axis=-1, bitorder="little")
- weight = weight.reshape(Cout, Cin_major * Cin_minor, H, W)
- weight = weight[:, :Cin, :, :]
-
- return weight
diff --git a/test/Ne16MemoryLayout.py b/test/Ne16MemoryLayout.py
new file mode 100644
index 0000000..30729ab
--- /dev/null
+++ b/test/Ne16MemoryLayout.py
@@ -0,0 +1,99 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+
+
+class Ne16MemoryLayout:
+ _CIN_SUBTILE = 16
+
+ @staticmethod
+ def weightEncode(
+ weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+ ) -> npt.NDArray[np.uint8]:
+ """Unroll weight into expected memory format
+
+ Expected weight shape is (cout, cin, height, width).
+ The output shape is: (cout, cinMajor, Bits, height x width, cinMinorBytes),
+ where cinMajor is the ceil(cin / CIN_SUBTILE) and cinMinor has to be padded with 0 to CIN_SUBTILE.
+ """
+ if depthwise:
+ weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin
+
+ cout, cin, height, width = weight.shape
+
+ # Pad cin to be divisible with CIN_SUBTILE
+ if cin % Ne16MemoryLayout._CIN_SUBTILE != 0:
+ cinPad = Ne16MemoryLayout._CIN_SUBTILE - cin % Ne16MemoryLayout._CIN_SUBTILE
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, cinPad), (0, 0), (0, 0)),
+ "constant",
+ constant_values=0,
+ )
+ cin = cin + cinPad
+
+ # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1)
+ # The 1 at the end is required by the unpacking
+ cinMajor = cin // Ne16MemoryLayout._CIN_SUBTILE
+ cinMinor = Ne16MemoryLayout._CIN_SUBTILE
+ weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1)
+
+ # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
+ # (cout, cinMajor, cinMinor, flattened spatial, Bits)
+ weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
+
+ # Shuffle bits so that the final shape is:
+ # (cout, cinMajor, Bits, flattened spatial, cinMinor)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+
+ # Prepare for packing
+ # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes, 8)
+ cinMinorBytes = int(np.ceil(cinMinor / 8))
+ weight = np.stack(np.split(weight, cinMinorBytes, axis=-1), axis=-2)
+
+ # Pack
+ # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+
+ return weight.flatten()
+
+ @staticmethod
+ def weightDecode(
+ weight: npt.NDArray[np.uint8],
+ bits: int,
+ cout: int,
+ cin: int,
+ height: int,
+ width: int,
+ ) -> npt.NDArray[np.uint8]:
+ """Reverse of weightEncode"""
+ cinMajor = int(np.ceil(cin / Ne16MemoryLayout._CIN_SUBTILE))
+ cinMinor = Ne16MemoryLayout._CIN_SUBTILE
+ cinMinorBytes = int(np.ceil(cinMinor / 8))
+
+ weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1)
+ weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
+ weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+ weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
+ weight = weight[:, :cin, :, :]
+
+ return weight
diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py
new file mode 100644
index 0000000..f2e66ad
--- /dev/null
+++ b/test/Ne16TestConf.py
@@ -0,0 +1,111 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import List, Optional, Union
+
+from pydantic import field_validator, model_validator
+
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from NnxTestClasses import NnxTestConf
+from TestClasses import IntegerType, KernelShape, Stride, implies
+
+
+class Ne16TestConf(NnxTestConf):
+ @field_validator("kernel_shape")
+ @classmethod
+ def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
+ assert v == KernelShape(height=1, width=1) or v == KernelShape(
+ height=3, width=3
+ ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
+ return v
+
+ @field_validator("stride")
+ @classmethod
+ def check_valid_stride(cls, v: Stride) -> Stride:
+ assert v == Stride(height=1, width=1) or v == Stride(
+ height=2, width=2
+ ), f"Unsupported stride {v}. Supported 1x1 and 2x2."
+ return v
+
+ @staticmethod
+ def _check_type(
+ name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
+ ) -> None:
+ assert (
+ _type in allowed_types
+ ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
+
+ @field_validator("in_type")
+ @classmethod
+ def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("in_type", v, ["uint8"])
+ return v
+
+ @field_validator("out_type")
+ @classmethod
+ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("out_type", v, ["uint8", "int8", "int32"])
+ return v
+
+ @field_validator("weight_type")
+ @classmethod
+ def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
+ Ne16TestConf._check_type("weight_type", v, ["int8"])
+ return v
+
+ @field_validator("scale_type")
+ @classmethod
+ def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"])
+ return v
+
+ @field_validator("bias_type")
+ @classmethod
+ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ Ne16TestConf._check_type("bias_type", v, ["int32"])
+ return v
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_channel_stride_with_stride_2x2(self) -> Ne16TestConf:
+ assert implies(
+ self.stride == Stride(height=2, width=2),
+ self.out_channel * (self.out_type._bits // 8) % 2 == 0,
+ ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf:
+ assert implies(
+ self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
+ ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf:
+ assert implies(
+ not self.has_norm_quant,
+ self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
+ ), (
+ f"Without quantization, the output type has to be equal to the "
+ f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ )
+ return self
diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py
new file mode 100644
index 0000000..08b3601
--- /dev/null
+++ b/test/NeuralEngineFunctionalModel.py
@@ -0,0 +1,123 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from TestClasses import IntegerType, Padding, Stride
+
+
+class NeuralEngineFunctionalModel:
+ ACCUMULATOR_TYPE = IntegerType(name="int32")
+
+ @staticmethod
+ def _cast(
+ tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
+ ) -> torch.Tensor:
+ if saturate:
+ return tensor.clamp(_type.min, _type.max)
+ else:
+ return tensor & ((1 << _type._bits) - 1)
+
+ def _norm_quant(
+ self,
+ tensor: torch.Tensor,
+ scale: torch.Tensor,
+ bias: Optional[torch.Tensor],
+ global_shift: torch.Tensor,
+ out_type: IntegerType,
+ bias_type: Optional[IntegerType],
+ has_bias: bool,
+ has_relu: bool,
+ ) -> torch.Tensor:
+ # Scale accumulators are in 48bit, so keeping the data in 64bit
+ tensor = tensor * scale
+ assert tensor.dtype == torch.int64
+
+ if has_bias:
+ assert bias is not None
+ assert bias_type is not None
+ # Saturating cast to int32
+ tensor = NeuralEngineFunctionalModel._cast(
+ tensor, bias_type, saturate=True
+ ).type(torch.int32)
+
+ tensor = tensor + bias
+ tensor = NeuralEngineFunctionalModel._cast(
+ tensor, bias_type, saturate=False
+ ).type(torch.int32)
+
+ if has_relu:
+ tensor = F.relu(tensor)
+
+ tensor = tensor >> global_shift
+
+ # Saturate into out_type
+ tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
+
+ return tensor
+
+ def convolution(
+ self,
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ scale: Optional[torch.Tensor],
+ bias: Optional[torch.Tensor],
+ global_shift: Optional[torch.Tensor],
+ padding: Padding,
+ stride: Stride,
+ depthwise: bool,
+ out_type: IntegerType,
+ bias_type: Optional[IntegerType],
+ has_norm_quant: bool,
+ has_bias: bool,
+ has_relu: bool,
+ verbose: bool = False,
+ **kwargs,
+ ) -> torch.Tensor:
+ _ = kwargs
+
+ input_padded = F.pad(
+ input,
+ (
+ padding.left,
+ padding.right,
+ padding.top,
+ padding.bottom,
+ ),
+ "constant",
+ 0,
+ )
+
+ # Accumulators are 32bit non-saturating.
+ # Calculate in higher precision (int64)
+ output = F.conv2d(
+ input=input_padded,
+ weight=weight,
+ stride=(stride.height, stride.width),
+ groups=weight.shape[0] if depthwise else 1,
+ ).type(torch.int64)
+
+ # Cast to accumulator type
+ output = NeuralEngineFunctionalModel._cast(
+ output, NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, saturate=False
+ ).type(torch.int32)
+
+ if verbose:
+ print("INTERMEDIATE RESULTS (pre-normalization/requant):")
+ print(output)
+
+ if has_norm_quant:
+ assert scale is not None
+ assert global_shift is not None
+ output = self._norm_quant(
+ output,
+ scale,
+ bias,
+ global_shift,
+ out_type,
+ bias_type,
+ has_bias,
+ has_relu,
+ )
+
+ return output
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py
new file mode 100644
index 0000000..80a2786
--- /dev/null
+++ b/test/NeurekaMemoryLayout.py
@@ -0,0 +1,158 @@
+# Luka Macan
+# Arpan Suravi Prasad
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+
+from TestClasses import IntegerType
+
+
+class NeurekaMemoryLayout:
+ _WEIGHT_BANDWIDTH = 256
+ _CIN_SUBTILE_1x1 = 32
+ _CIN_SUBTILE_3x3 = 28
+
+ @staticmethod
+ def weightEncode(
+ weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+ ) -> npt.NDArray[np.uint8]:
+ """Unroll weight into expected memory format
+
+ Expected weight shape is (cout, cin, H, W).
+ The produced memory layout depends on the weight kernel shape:
+ - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
+ - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
+ where cinMajor is ceil(cin / cinSubtile) and cinMinor has to be zero-padded up to cinSubtile.
+ """
+ if depthwise:
+ weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin
+
+ cout, cin, height, width = weight.shape
+ cinSubtile = (
+ NeurekaMemoryLayout._CIN_SUBTILE_3x3
+ if height == 3
+ else NeurekaMemoryLayout._CIN_SUBTILE_1x1
+ )
+
+ # Pad cin to be divisible with CIN_SUBTILE
+ if cin % cinSubtile != 0:
+ cinPad = cinSubtile - cin % cinSubtile
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, cinPad), (0, 0), (0, 0)),
+ "constant",
+ constant_values=0,
+ )
+
+ # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
+ # The 1 at the end is required by the unpacking
+ cinMajor = int(np.ceil(cin / cinSubtile))
+ weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)
+
+ # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
+ # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
+ weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
+
+ # Shuffle bits so that the final shape is:
+ # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
+ weight = weight.transpose(0, 1, 4, 3, 2)
+
+ # Pack dimensions to fit into weight bandwidth
+ if height == 3 and width == 3:
+ # (cout * cinMajor * Bits, H * W * cinSubtile)
+ weight = weight.reshape(-1, height * width * cinSubtile)
+ # Pad only the last dimension to weight bandwidth size
+ # (-1, Weight Bandwidth)
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])),
+ "constant",
+ constant_values=0,
+ )
+ elif height == 1 and width == 1:
+ # Tile cinSubtile into tiles of size 4
+ # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+ weight = weight.reshape(
+ cout, cinMajor, bits, height * width, cinSubtile // 4, 4
+ ) # cout, cinMajor, bits, 1, 8, 4
+ # Pad bits to 8
+ if bits < 8:
+ # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+ weight = np.pad(
+ weight,
+ ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
+ mode="constant",
+ constant_values=0,
+ )
+ # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
+ weight = weight.transpose(0, 1, 3, 4, 2, 5)
+ # (-1, Weight Bandwidth)
+ weight = weight.reshape(
+ cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH
+ ) # cout*cinMajor, 256b
+
+ # Prepare for packing
+ # (-1, Weight Bandwidth Bytes, 8)
+ weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
+ weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)
+
+ # Pack bits
+ # (-1, Weight Bandwidth Bytes)
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+
+ return weight.flatten()
+
+ @staticmethod
+ def weightDecode(
+ weight: npt.NDArray[np.uint8],
+ bits: int,
+ cout: int,
+ cin: int,
+ height: int,
+ width: int,
+ ) -> npt.NDArray[np.uint8]:
+ """Reverse of weightEncode"""
+ cinSubtile = (
+ NeurekaMemoryLayout._CIN_SUBTILE_3x3
+ if height == 3
+ else NeurekaMemoryLayout._CIN_SUBTILE_1x1
+ )
+ cinMajor = int(np.ceil(cin / cinSubtile))
+ cinMinor = cinSubtile
+ weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
+
+ weight = weight.reshape(-1, weightBandwidthBytes, 1)
+ weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
+ weight = weight.reshape(-1, NeurekaMemoryLayout._WEIGHT_BANDWIDTH)
+
+ if height == 3 and width == 3:
+ weight = weight[:, : height * width * cinMinor]
+ weight = weight.reshape(
+ cout, cinMajor, bits, height * width, cinMinor
+ ).transpose(0, 1, 4, 3, 2)
+ elif height == 1 and width == 1:
+ weight = weight[:, : height * width * cinMinor * 8]
+ weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
+ 0, 1, 2, 4, 3
+ )
+ weight = np.packbits(weight, axis=-1, bitorder="little")
+ weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
+ weight = weight[:, :cin, :, :]
+
+ return weight
diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py
new file mode 100644
index 0000000..f878e68
--- /dev/null
+++ b/test/NeurekaTestConf.py
@@ -0,0 +1,101 @@
+# Luka Macan
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import List, Optional, Union
+
+from pydantic import field_validator, model_validator
+
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from NnxTestClasses import NnxTestConf
+from TestClasses import IntegerType, KernelShape, Stride, implies
+
+
+class NeurekaTestConf(NnxTestConf):
+ @field_validator("kernel_shape")
+ @classmethod
+ def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
+ assert v == KernelShape(height=1, width=1) or v == KernelShape(
+ height=3, width=3
+ ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
+ return v
+
+ @field_validator("stride")
+ @classmethod
+ def check_valid_stride(cls, v: Stride) -> Stride:
+ assert v == Stride(height=1, width=1), f"Unsupported stride {v}. Supported 1x1."
+ return v
+
+ @staticmethod
+ def _check_type(
+ name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
+ ) -> None:
+ assert (
+ _type in allowed_types
+ ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
+
+ @field_validator("in_type")
+ @classmethod
+ def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("in_type", v, ["uint8", "int8"])
+ return v
+
+ @field_validator("out_type")
+ @classmethod
+ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"])
+ return v
+
+ @field_validator("weight_type")
+ @classmethod
+ def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
+ NeurekaTestConf._check_type("weight_type", v, ["int8"])
+ return v
+
+ @field_validator("scale_type")
+ @classmethod
+ def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ NeurekaTestConf._check_type("scale_type", v, ["uint8", "uint32"])
+ return v
+
+ @field_validator("bias_type")
+ @classmethod
+ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
+ if v is not None:
+ NeurekaTestConf._check_type("bias_type", v, ["int32"])
+ return v
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf:
+ assert implies(
+ self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
+ ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf:
+ assert implies(
+ not self.has_norm_quant,
+ self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
+ ), (
+ f"Without quantization, the output type has to be equal to the "
+ f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ )
+ return self
diff --git a/test/Ne16TestClasses.py b/test/NnxTestClasses.py
similarity index 53%
rename from test/Ne16TestClasses.py
rename to test/NnxTestClasses.py
index d99e829..a7aaa00 100644
--- a/test/Ne16TestClasses.py
+++ b/test/NnxTestClasses.py
@@ -17,18 +17,21 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
-from typing import List, Union, Optional, Set, Tuple
-import torch
-import numpy as np
-import torch.nn.functional as F
+
import os
-from Ne16 import Ne16
+from typing import Callable, Optional, Set, Tuple, Type, Union
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from pydantic import BaseModel, PositiveInt, field_validator, model_validator
+
from HeaderWriter import HeaderWriter
-from TestClasses import implies, KernelShape, Padding, Stride, IntegerType
-from pydantic import BaseModel, field_validator, model_validator, PositiveInt
+from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel
+from TestClasses import IntegerType, KernelShape, Padding, Stride, implies
-class Ne16TestConf(BaseModel):
+class NnxTestConf(BaseModel):
in_height: PositiveInt
in_width: PositiveInt
in_channel: PositiveInt
@@ -46,74 +49,8 @@ class Ne16TestConf(BaseModel):
has_bias: bool
has_relu: bool
- @field_validator("kernel_shape")
- @classmethod
- def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape:
- assert v == KernelShape(height=1, width=1) or v == KernelShape(
- height=3, width=3
- ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3."
- return v
-
- @field_validator("stride")
- @classmethod
- def check_valid_stride(cls, v: Stride) -> Stride:
- assert v == Stride(height=1, width=1) or v == Stride(
- height=2, width=2
- ), f"Unsupported stride {v}. Supported 1x1 and 2x2."
- return v
-
- @staticmethod
- def _check_type(
- name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]]
- ) -> None:
- assert (
- _type in allowed_types
- ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}"
-
- @field_validator("in_type")
- @classmethod
- def check_valid_in_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("in_type", v, ["uint8"])
- return v
-
- @field_validator("out_type")
- @classmethod
- def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("out_type", v, ["uint8", "int8"])
- return v
-
- @field_validator("weight_type")
- @classmethod
- def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
- Ne16TestConf._check_type("weight_type", v, ["int8"])
- return v
-
- @field_validator("scale_type")
- @classmethod
- def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
- if v is not None:
- Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"])
- return v
-
- @field_validator("bias_type")
- @classmethod
- def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]:
- if v is not None:
- Ne16TestConf._check_type("bias_type", v, ["int32"])
- return v
-
@model_validator(mode="after") # type: ignore
- def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf:
- assert implies(
- self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0
- ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}"
- return self
-
- @model_validator(mode="after") # type: ignore
- def check_valid_depthwise(self) -> Ne16TestConf:
- assert implies(
- self.depthwise, self.kernel_shape == KernelShape(height=3, width=3)
- ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}."
+ def check_valid_depthwise_channels(self) -> NnxTestConf:
assert implies(self.depthwise, self.in_channel == self.out_channel), (
f"Input and output channel should be the same in a depthwise layer. "
f"input channel: {self.in_channel}, output channel: {self.out_channel}"
@@ -121,21 +58,15 @@ def check_valid_depthwise(self) -> Ne16TestConf:
return self
@model_validator(mode="after") # type: ignore
- def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf:
+ def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf:
assert implies(
self.kernel_shape == KernelShape(height=1, width=1),
self.padding == Padding(top=0, bottom=0, left=0, right=0),
), f"No padding on 1x1 kernel. Given padding {self.padding}"
return self
- @field_validator("has_norm_quant")
- @classmethod
- def check_valid_has_norm_quant(cls, v: bool) -> bool:
- assert v == True, f"Untested without has_norm_quant."
- return v
-
@model_validator(mode="after") # type: ignore
- def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf:
+ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf:
if self.has_norm_quant:
assert self.scale_type is not None, "Scale type was not provided."
if self.has_bias:
@@ -143,25 +74,31 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf:
return self
@model_validator(mode="after") # type: ignore
- def check_valid_out_type_with_flags(self) -> Ne16TestConf:
- assert implies(
- not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE
- ), (
- f"Without quantization, the output type has to be equal to the "
- f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}"
+ def check_has_relu_with_norm_quant(self) -> NnxTestConf:
+ assert implies(self.has_relu, self.has_norm_quant), (
+ f"Relu flag can only be enabled when norm_quant is enabled. "
+ f"Given has_relu {self.has_relu} and has_norm_quant {self.has_norm_quant}"
)
- assert implies(
- self.has_norm_quant,
- (self.has_relu and not self.out_type._signed)
- or (not self.has_relu and self.out_type._signed),
- ), (
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_has_bias_with_norm_quant(self) -> NnxTestConf:
+ assert implies(self.has_bias, self.has_norm_quant), (
+ f"Bias flag can only be enabled when norm_quant is enabled. "
+ f"Given has_bias {self.has_bias} and has_norm_quant {self.has_norm_quant}"
+ )
+ return self
+
+ @model_validator(mode="after") # type: ignore
+ def check_valid_out_type_with_relu(self) -> NnxTestConf:
+ assert self.has_relu ^ self.out_type._signed, (
f"Output type has to be unsigned when there is relu, otherwise signed. "
f"Given output type {self.out_type} and has_relu {self.has_relu}"
)
return self
-class Ne16Test:
+class NnxTest:
_CONF_NAME = "conf.json"
_INPUT_NAME = "input.pt"
_OUTPUT_NAME = "output.pt"
@@ -172,7 +109,7 @@ class Ne16Test:
def __init__(
self,
- conf: Ne16TestConf,
+ conf: NnxTestConf,
input: Optional[torch.Tensor],
output: Optional[torch.Tensor],
weight: Optional[torch.Tensor],
@@ -188,7 +125,7 @@ def __init__(
self.bias = bias
self.global_shift = global_shift
- def is_valid(self):
+ def is_valid(self) -> bool:
return all(
[
self.input is not None,
@@ -203,22 +140,22 @@ def is_valid(self):
def save_conf(self, path: Union[str, os.PathLike]) -> None:
os.makedirs(path, exist_ok=True)
- with open(os.path.join(path, Ne16Test._CONF_NAME), "w") as fp:
+ with open(os.path.join(path, NnxTest._CONF_NAME), "w") as fp:
fp.write(self.conf.model_dump_json(indent=4))
def save_data(self, path: Union[str, os.PathLike]) -> None:
os.makedirs(path, exist_ok=True)
- torch.save(self.input, os.path.join(path, Ne16Test._INPUT_NAME))
- torch.save(self.output, os.path.join(path, Ne16Test._OUTPUT_NAME))
- torch.save(self.weight, os.path.join(path, Ne16Test._WEIGHT_NAME))
+ torch.save(self.input, os.path.join(path, NnxTest._INPUT_NAME))
+ torch.save(self.output, os.path.join(path, NnxTest._OUTPUT_NAME))
+ torch.save(self.weight, os.path.join(path, NnxTest._WEIGHT_NAME))
if self.scale is not None:
- torch.save(self.scale, os.path.join(path, Ne16Test._SCALE_NAME))
+ torch.save(self.scale, os.path.join(path, NnxTest._SCALE_NAME))
if self.bias is not None:
- torch.save(self.bias, os.path.join(path, Ne16Test._BIAS_NAME))
+ torch.save(self.bias, os.path.join(path, NnxTest._BIAS_NAME))
if self.global_shift is not None:
torch.save(
- self.global_shift, os.path.join(path, Ne16Test._GLOBAL_SHIFT_NAME)
+ self.global_shift, os.path.join(path, NnxTest._GLOBAL_SHIFT_NAME)
)
def save(self, path: Union[str, os.PathLike]) -> None:
@@ -228,154 +165,111 @@ def save(self, path: Union[str, os.PathLike]) -> None:
@staticmethod
def is_test_dir(path: Union[str, os.PathLike]) -> bool:
fileset = set(os.listdir(path))
- required_fileset = set([Ne16Test._CONF_NAME])
+ required_fileset = set([NnxTest._CONF_NAME])
return required_fileset.issubset(fileset)
@classmethod
- def load(cls, path: Union[str, os.PathLike]) -> "Ne16Test":
- assert Ne16Test.is_test_dir(
+ def load(cls, confCls: Type[NnxTestConf], path: Union[str, os.PathLike]) -> NnxTest:
+ assert NnxTest.is_test_dir(
path
), f"ERROR: Test {path} does not contain the necessary files."
- with open(os.path.join(path, Ne16Test._CONF_NAME), "r") as fp:
- conf = Ne16TestConf.model_validate_json(fp.read())
+ with open(os.path.join(path, NnxTest._CONF_NAME), "r") as fp:
+ conf = confCls.model_validate_json(fp.read())
def load_if_exist(filename: str) -> Optional[torch.Tensor]:
filepath = os.path.join(path, filename)
return torch.load(filepath) if os.path.isfile(filepath) else None
- input = load_if_exist(Ne16Test._INPUT_NAME)
- output = load_if_exist(Ne16Test._OUTPUT_NAME)
- weight = load_if_exist(Ne16Test._WEIGHT_NAME)
- scale = load_if_exist(Ne16Test._SCALE_NAME)
- bias = load_if_exist(Ne16Test._BIAS_NAME)
- global_shift = load_if_exist(Ne16Test._GLOBAL_SHIFT_NAME)
+ input = load_if_exist(NnxTest._INPUT_NAME)
+ output = load_if_exist(NnxTest._OUTPUT_NAME)
+ weight = load_if_exist(NnxTest._WEIGHT_NAME)
+ scale = load_if_exist(NnxTest._SCALE_NAME)
+ bias = load_if_exist(NnxTest._BIAS_NAME)
+ global_shift = load_if_exist(NnxTest._GLOBAL_SHIFT_NAME)
return cls(conf, input, output, weight, scale, bias, global_shift)
-class Ne16TestGenerator:
+class NnxTestGenerator:
_DEFAULT_SEED = 0
@staticmethod
- def _global_shift(
- tensor: torch.Tensor, out_type: IntegerType, has_relu: bool
+ def _calculate_global_shift(
+ tensor: torch.Tensor, out_type: IntegerType
) -> torch.Tensor:
- if has_relu:
- # only adjust positive values
- tensor = tensor[tensor > 0]
-
+ """Calculate global shift so that the output values are in the range of out_type"""
s = tensor.type(torch.float64).std()
target_s = 2 ** (out_type._bits - 1)
- global_shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32)
-
- return global_shift
+ return torch.ceil(torch.log2(s / target_s)).type(torch.int32)
@staticmethod
- def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]):
+ def _random_data(_type: IntegerType, shape: Tuple):
return torch.randint(_type.min, _type.max, size=shape)
- @staticmethod
- def _cast(
- tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
- ) -> torch.Tensor:
- if saturate:
- return tensor.clamp(_type.min, _type.max)
- else:
- return tensor & ((1 << _type._bits) - 1)
-
@staticmethod
def from_conf(
- conf: Ne16TestConf,
+ conf: NnxTestConf,
input: Optional[torch.Tensor] = None,
weight: Optional[torch.Tensor] = None,
scale: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
global_shift: Optional[torch.Tensor] = None,
- ) -> Ne16Test:
- torch.manual_seed(Ne16TestGenerator._DEFAULT_SEED)
+ verbose: bool = False,
+ ) -> NnxTest:
+ torch.manual_seed(NnxTestGenerator._DEFAULT_SEED)
+
+ input_shape = (1, conf.in_channel, conf.in_height, conf.in_width)
+ weight_shape = (
+ conf.out_channel,
+ 1 if conf.depthwise else conf.in_channel,
+ conf.kernel_shape.height,
+ conf.kernel_shape.width,
+ )
+ scale_shape = (1, conf.out_channel, 1, 1)
+ bias_shape = (1, conf.out_channel, 1, 1)
if input is None:
- input = Ne16TestGenerator._random_data(
+ input = NnxTestGenerator._random_data(
_type=conf.in_type,
- shape=(1, conf.in_channel, conf.in_height, conf.in_width),
+ shape=input_shape,
)
- input_padded = F.pad(
- input,
- (
- conf.padding.left,
- conf.padding.right,
- conf.padding.top,
- conf.padding.bottom,
- ),
- "constant",
- 0,
- )
-
if weight is None:
- weight = Ne16TestGenerator._random_data(
+ weight = NnxTestGenerator._random_data(
_type=conf.weight_type,
- shape=(
- conf.out_channel,
- 1 if conf.depthwise else conf.in_channel,
- conf.kernel_shape.height,
- conf.kernel_shape.width,
- ),
+ shape=weight_shape,
)
- # Accumulators are 32bit non-saturating.
- # Calculate in higher precision (int64)
- output = F.conv2d(
- input=input_padded,
- weight=weight,
- stride=(conf.stride.height, conf.stride.width),
- groups=conf.in_channel if conf.depthwise else 1,
- ).type(torch.int64)
- # Use only the lower 32bits
- output = Ne16TestGenerator._cast(
- output, Ne16.ACCUMULATOR_TYPE, saturate=False
- ).type(torch.int32)
-
if conf.has_norm_quant:
if scale is None:
assert conf.scale_type is not None
- scale = Ne16TestGenerator._random_data(
- conf.scale_type, shape=(1, conf.out_channel, 1, 1)
+ scale = NnxTestGenerator._random_data(
+ conf.scale_type, shape=scale_shape
)
- # Scale accumulators are in 48bit, so keeping the data in 64bit
- output = scale * output
- assert output.dtype == torch.int64
-
- if conf.has_bias:
- # Saturating cast to int32
+ if conf.has_bias and bias is None:
assert conf.bias_type is not None
- output = Ne16TestGenerator._cast(
- output, conf.bias_type, saturate=True
- ).type(torch.int32)
-
- if bias is None:
- bias = Ne16TestGenerator._random_data(
- conf.bias_type, shape=(1, conf.out_channel, 1, 1)
- ).type(torch.int32)
- output = output + bias
- output = Ne16TestGenerator._cast(
- output, conf.bias_type, saturate=False
+ bias = NnxTestGenerator._random_data(
+ conf.bias_type, shape=bias_shape
).type(torch.int32)
-
- if conf.has_relu:
- output = F.relu(output)
-
if global_shift is None:
- global_shift = Ne16TestGenerator._global_shift(
- output, conf.out_type, conf.has_relu
+ global_shift = torch.Tensor([0]).type(torch.int32)
+ output = NeuralEngineFunctionalModel().convolution(
+ input,
+ weight,
+ scale,
+ bias,
+ global_shift,
+ verbose=verbose,
+ **conf.__dict__,
)
- output = output >> global_shift
+ NnxTestGenerator._calculate_global_shift(output, conf.out_type)
- # Saturate into out_type
- output = Ne16TestGenerator._cast(output, conf.out_type, saturate=True)
+ output = NeuralEngineFunctionalModel().convolution(
+ input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__
+ )
- return Ne16Test(
+ return NnxTest(
conf=conf,
input=input,
output=output,
@@ -386,28 +280,38 @@ def from_conf(
)
@staticmethod
- def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test:
+ def regenerate(test: NnxTest, regen_tensors: Set[str]) -> NnxTest:
test_tensors = set(["input", "output", "weight", "scale", "bias"])
load_tensors = test_tensors - regen_tensors
kwargs = {tensor: getattr(test, tensor) for tensor in load_tensors}
- return Ne16TestGenerator.from_conf(test.conf, **kwargs)
+ return NnxTestGenerator.from_conf(test.conf, **kwargs)
-class Ne16TestHeaderGenerator:
+class NnxTestHeaderGenerator:
DEFAULT_HEADERS_DIR = "app/gen"
- def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None):
+ def __init__(
+ self,
+ weightEncode: Callable[
+ [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8]
+ ],
+ headers_dir: Optional[Union[str, os.PathLike]] = None,
+ ):
if headers_dir is None:
- headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR
+ headers_dir = NnxTestHeaderGenerator.DEFAULT_HEADERS_DIR
self.header_writer = HeaderWriter(headers_dir)
+ # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag,
+ # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator
+ self.weightEncode = weightEncode
- def generate(self, test_name: str, test: Ne16Test):
+ def generate(self, test_name: str, test: NnxTest):
assert test.input is not None and test.output is not None
_, in_channel, in_height, in_width = test.input.shape
_, out_channel, out_height, out_width = test.output.shape
# Render input
in_ctype = test.conf.in_type.ctype()
+ in_signed = test.conf.in_type._signed
in_data = test.input.permute(0, 2, 3, 1).ravel()
self.header_writer.generate_vector_files(
"input", _type=in_ctype, size=in_data.numel(), init=in_data
@@ -431,10 +335,10 @@ def generate(self, test_name: str, test: Ne16Test):
weight_offset = -(2 ** (weight_bits - 1))
weight_out_ch, weight_in_ch, weight_ks_h, weight_ks_w = test.weight.shape
weight_data: np.ndarray = test.weight.numpy() - weight_offset
- weight_init = Ne16.weight_unroll(
+ weight_init = self.weightEncode(
weight_data.astype(np.uint8),
weight_type._bits,
- depthwise=test.conf.depthwise,
+ test.conf.depthwise,
)
self.header_writer.generate_vector_files(
"weight", _type="uint8_t", size=weight_init.size, init=weight_init
@@ -470,13 +374,14 @@ def generate(self, test_name: str, test: Ne16Test):
"height": in_height,
"width": in_width,
"channel": in_channel,
- "bits": 8,
+ "signed": in_signed,
+ "bits": test.conf.in_type._bits,
},
"output": {
"height": out_height,
"width": out_width,
"channel": out_channel,
- "bits": 8,
+ "bits": test.conf.out_type._bits,
},
"weight": {
"height": weight_ks_h,
@@ -486,8 +391,16 @@ def generate(self, test_name: str, test: Ne16Test):
"bits": weight_bits,
"offset": weight_offset,
},
- "scale": {"bits": 8},
- "bias": {"bits": 32},
+ "scale": {
+ "bits": test.conf.scale_type._bits
+ if test.conf.scale_type is not None
+ else 0
+ },
+ "bias": {
+ "bits": test.conf.bias_type._bits
+ if test.conf.bias_type is not None
+ else 0
+ },
"padding": {
"top": test.conf.padding.top,
"bottom": test.conf.padding.bottom,
diff --git a/test/README.md b/test/README.md
index c3d29c5..8442493 100644
--- a/test/README.md
+++ b/test/README.md
@@ -35,3 +35,9 @@ $ pytest test.py --help
- [testgen.py](testgen.py): collection of helper tools for individual tests
For more information you can run the script with the `-h` flag.
+
+## Application
+
+The Makefile in the `app/` directory uses a flag `ACCELERATOR` to decide which accelerator to use.
+The choices are _ne16_ or _neureka_.
+You can either export it or set it on the command line, e.g. `ACCELERATOR=ne16 make clean all run`.
diff --git a/test/TestClasses.py b/test/TestClasses.py
index c10641c..c6267d6 100644
--- a/test/TestClasses.py
+++ b/test/TestClasses.py
@@ -16,15 +16,16 @@
#
# SPDX-License-Identifier: Apache-2.0
-from functools import cached_property
import re
-from typing import Any, Dict, Literal, Optional, TYPE_CHECKING
+from functools import cached_property
+from typing import TYPE_CHECKING, Any, Dict, Literal, Optional
+
from pydantic import (
BaseModel,
- model_serializer,
- model_validator,
NonNegativeInt,
PositiveInt,
+ model_serializer,
+ model_validator,
)
diff --git a/test/app/Makefile b/test/app/Makefile
index 14f30fd..ca65892 100644
--- a/test/app/Makefile
+++ b/test/app/Makefile
@@ -40,6 +40,8 @@ INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp
INC_DIRS += gen/inc
INC_FLAGS += $(addprefix -I,$(INC_DIRS))
+APP_CFLAGS += $(INC_FLAGS)
+
# Source files
@@ -58,7 +60,9 @@ APP_SRCS += $(wildcard gen/src/*.c)
# Flags
-APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto
-APP_LDFLAGS += -flto
+ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:])
+APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE)
+
+APP_CFLAGS += -O2 -w -Wall -Werror
include $(RULES_DIR)/pmsis_rules.mk
diff --git a/test/app/src/main.c b/test/app/src/main.c
index cc67050..7cce4bf 100644
--- a/test/app/src/main.c
+++ b/test/app/src/main.c
@@ -29,8 +29,9 @@ int main() {
struct pi_cluster_conf cl_conf;
struct pi_cluster_task cl_task;
- printf("\n");
- printf("Test %s starting\n", TEST_NAME);
+ printf("\nTest " TEST_NAME " starting\n");
+
+ printf("\nAccelerator: " NNX_ACCELERATOR "\n");
printf("\n");
layer_info();
@@ -43,13 +44,13 @@ int main() {
}
pi_cluster_send_task_to_cl(
&cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL));
- pi_cluster_close(&cl_dev);
-
- printf("\n");
- printf("Test %s finished\n", TEST_NAME);
printf("\n");
check_output();
+ pi_cluster_close(&cl_dev);
+
+ printf("\nTest " TEST_NAME " finished\n");
+
return 0;
}
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c
index ffd93a1..0d98ff6 100644
--- a/test/app/src/nnx_layer.c
+++ b/test/app/src/nnx_layer.c
@@ -19,12 +19,89 @@
*/
#include "nnx_layer.h"
+#include
+
+#ifdef NNX_NE16
+
#include "ne16.h"
#include "ne16_gvsoc.h"
#include "ne16_pulp_bsp.h"
#include "ne16_task.h"
#include "pulp_nnx_ne16.h"
-#include
+
+typedef ne16_norm_mode_e nnx_norm_mode_e;
+typedef ne16_quant_t nnx_quant_t;
+typedef ne16_norm_t nnx_norm_t;
+typedef ne16_task_t nnx_task_t;
+typedef ne16_dev_t nnx_dev_t;
+typedef ne16_pulp_conf_t nnx_bsp_conf_t;
+
+#define nnxTaskFlagTrue ne16TaskFlagTrue
+#define nnxTaskFlagFalse ne16TaskFlagFalse
+
+#define nnx_task_init ne16_task_init
+#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv
+#define nnx_task_set_bits ne16_task_set_bits
+#define nnx_task_set_norm_quant ne16_task_set_norm_quant
+#define nnx_task_set_weight_offset ne16_task_set_weight_offset
+#define nnx_task_set_dims ne16_task_set_dims
+#define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2
+#define nnx_task_set_ptrs ne16_task_set_ptrs
+
+#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_ALL
+#define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL
+#define nnx_gvsoc_log_activate ne16_gvsoc_log_activate
+#define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate
+
+#define nnx_bsp_get_dev ne16_pulp_get_dev
+
+#define nnx_init ne16_nnx_init
+#define nnx_dispatch_wait ne16_nnx_dispatch_wait
+#define nnx_dispatch_stride2x2 ne16_nnx_dispatch_stride2x2
+#define nnx_dispatch ne16_nnx_dispatch
+#define nnx_resolve_wait ne16_nnx_resolve_wait
+#define nnx_term ne16_nnx_term
+
+#elif defined NNX_NEUREKA
+
+#include "neureka.h"
+#include "neureka_gvsoc.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include "pulp_nnx_neureka.h"
+
+typedef neureka_norm_mode_e nnx_norm_mode_e;
+typedef neureka_quant_t nnx_quant_t;
+typedef neureka_norm_t nnx_norm_t;
+typedef neureka_task_t nnx_task_t;
+typedef neureka_dev_t nnx_dev_t;
+typedef neureka_siracusa_conf_t nnx_bsp_conf_t;
+
+#define nnxTaskFlagTrue neurekaTaskFlagTrue
+#define nnxTaskFlagFalse neurekaTaskFlagFalse
+
+#define nnx_task_init neureka_task_init
+#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv
+#define nnx_task_set_bits neureka_task_set_bits
+#define nnx_task_set_norm_quant neureka_task_set_norm_quant
+#define nnx_task_set_weight_offset neureka_task_set_weight_offset
+#define nnx_task_set_dims neureka_task_set_dims
+#define nnx_task_set_ptrs neureka_task_set_ptrs
+
+#define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL
+#define NNX_GVSOC_LOG_FORMAT NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL
+#define nnx_gvsoc_log_activate neureka_gvsoc_log_activate
+#define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate
+
+#define nnx_bsp_get_dev neureka_siracusa_get_dev
+
+#define nnx_init neureka_nnx_init
+#define nnx_dispatch_wait neureka_nnx_dispatch_wait
+#define nnx_dispatch neureka_nnx_dispatch
+#define nnx_resolve_wait neureka_nnx_resolve_wait
+#define nnx_term neureka_nnx_term
+
+#endif // NNX_NE16 || NNX_NEUREKA
// Generated headers
#include "bias.h"
@@ -34,73 +111,109 @@
#include "scale.h"
#include "weight.h"
-static void task_prepare(ne16_task_t *task) {
- ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS,
- WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET,
- (ne16_quant_t){.shift_amount = OUTSHIFT,
- .mode = quantMode8Bit,
- .function = HAS_RELU ? quantFunctionRelu
- : quantFunctionIdentity,
- .flag_rounding = ne16TaskFlagFalse},
- (ne16_norm_t){.mode = normMode8Bit,
- .flag_bias = HAS_BIAS ? ne16TaskFlagTrue
- : ne16TaskFlagFalse,
- .flag_shift = ne16TaskFlagFalse},
- STRIDE_HEIGHT);
-
- if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) {
- ne16_task_set_dims_stride2x2(
- task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL,
- OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP,
- PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
- } else {
- ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
- OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP,
- PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT);
- }
-
- ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL,
- INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output,
- (uint32_t)weight, (uint32_t)scale, NULL,
+static void task_prepare(nnx_task_t *task) {
+ nnx_task_init(task);
+ nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT);
+ nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS);
+
+#if HAS_NORM_QUANT == 1
+#if SCALE_BITS == 8
+ const nnx_norm_mode_e normMode = normMode8Bit;
+#elif SCALE_BITS == 32
+ const nnx_norm_mode_e normMode = normMode32Bit;
+#endif
+
+ nnx_task_set_norm_quant(
+ task,
+ (nnx_quant_t){.shift_amount = OUTSHIFT,
+ .function =
+ HAS_RELU ? quantFunctionRelu : quantFunctionIdentity,
+ .flag_rounding = nnxTaskFlagFalse},
+ (nnx_norm_t){.mode = normMode,
+ .flag_bias = HAS_BIAS ? nnxTaskFlagTrue : nnxTaskFlagFalse,
+ .flag_shift = nnxTaskFlagFalse});
+#endif // HAS_NORM_QUANT
+
+ nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET);
+
+#ifdef NNX_NEUREKA
+#ifdef NEUREKA_WEIGHT_SOURCE_WMEM
+ neureka_task_set_weight_source(task, neurekaWeightSourceWmem);
+#else
+ neureka_task_set_weight_source(task, neurekaWeightSourceTcdm);
+#endif
+#if INPUT_SIGNED == 1
+ neureka_task_set_input_signed(task);
+#else
+ neureka_task_set_input_unsigned(task);
+#endif
+#endif
+
+ const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8;
+ const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride;
+ const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8;
+ const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride;
+
+#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2
+ nnx_task_set_dims_stride2x2(
+ task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride,
+ OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride,
+ WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT,
+ PADDING_LEFT);
+#else
+ nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride,
+ OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride,
+ w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT,
+ PADDING_LEFT);
+#endif
+
+ nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, w_in_stride,
+ PADDING_TOP, PADDING_LEFT, (uint32_t)output,
+ (uint32_t)weight,
+#if HAS_NORM_QUANT == 1
+ (uint32_t)scale, NULL,
#if HAS_BIAS == 1
- (uint32_t)bias
+ (uint32_t)bias
+#else
+ NULL
+#endif
#else
- NULL
+ NULL, NULL, NULL
#endif
);
}
-static void task_execute(ne16_task_t *task) {
- ne16_dev_t *dev = ne16_pulp_get_dev();
+static void task_execute(nnx_task_t *task) {
+ nnx_dev_t *dev = nnx_bsp_get_dev();
- ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG,
- NE16_GVSOC_LOG_FORMAT_HEXADECIMAL);
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, NNX_GVSOC_LOG_FORMAT);
+#endif
- ne16_pulp_conf_t conf = {.max_stall = 8};
- ne16_nnx_init(dev, &conf);
+ nnx_bsp_conf_t conf = {.max_stall = 8};
+ nnx_init(dev, &conf);
- ne16_nnx_dispatch_wait(dev);
+ nnx_dispatch_wait(dev);
- if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) {
- ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH,
- INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH,
- OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL,
- WEIGHT_HEIGHT, WEIGHT_WIDTH);
- } else {
- ne16_nnx_dispatch(dev, task);
- }
+#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2
+ nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT,
+ OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT,
+ WEIGHT_WIDTH);
+#else
+ nnx_dispatch(dev, task);
+#endif
- ne16_nnx_resolve_wait(dev, task);
+ nnx_resolve_wait(dev, task);
- ne16_nnx_term(dev);
+ nnx_term(dev);
- ne16_gvsoc_log_deactivate(dev);
+#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC
+ nnx_gvsoc_log_deactivate(dev);
+#endif
}
void execute_nnx_layer(void *args) {
- ne16_task_t task;
+ nnx_task_t task;
task_prepare(&task);
task_execute(&task);
}
diff --git a/test/conf.toml b/test/conf.toml
index 1222f1d..c24055a 100644
--- a/test/conf.toml
+++ b/test/conf.toml
@@ -22,7 +22,7 @@
# Ne16TestClasses.py:Ne16TestConf().check_valid()
# Input dimensions
-in_height = 3
+in_height = 4
in_width = 3
in_channel = 8
diff --git a/test/conftest.py b/test/conftest.py
index 6c2c15b..3c0a316 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -18,7 +18,17 @@
import os
from typing import Union
-from Ne16TestClasses import Ne16Test, Ne16TestGenerator
+
+import pydantic
+import pytest
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from Ne16TestConf import Ne16TestConf
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NeurekaTestConf import NeurekaTestConf
+from NnxTestClasses import NnxTest, NnxTestGenerator
+
+_SUPPORTED_ACCELERATORS = ["ne16", "neureka"]
def pytest_addoption(parser):
@@ -39,6 +49,13 @@ def pytest_addoption(parser):
default=False,
help="Recursively search for tests in given test directories.",
)
+ parser.addoption(
+ "-A",
+ "--accelerator",
+ choices=_SUPPORTED_ACCELERATORS,
+ default="ne16",
+ help="Choose an accelerator to test. Default: ne16",
+ )
parser.addoption(
"--regenerate",
action="store_true",
@@ -54,7 +71,7 @@ def pytest_addoption(parser):
def _find_test_dirs(path: Union[str, os.PathLike]):
- return [dirpath for dirpath, _, _ in os.walk(path) if Ne16Test.is_test_dir(dirpath)]
+ return [dirpath for dirpath, _, _ in os.walk(path) if NnxTest.is_test_dir(dirpath)]
def pytest_generate_tests(metafunc):
@@ -62,6 +79,18 @@ def pytest_generate_tests(metafunc):
recursive = metafunc.config.getoption("recursive")
regenerate = metafunc.config.getoption("regenerate")
timeout = metafunc.config.getoption("timeout")
+ nnxName = metafunc.config.getoption("accelerator")
+
+ if nnxName == "ne16":
+ nnxMemoryLayoutCls = Ne16MemoryLayout
+ nnxTestConfCls = Ne16TestConf
+ elif nnxName == "neureka":
+ nnxMemoryLayoutCls = NeurekaMemoryLayout
+ nnxTestConfCls = NeurekaTestConf
+ else:
+ assert (
+ False
+ ), f"Given accelerator {nnxName} not supported. Supported accelerators: {_SUPPORTED_ACCELERATORS}"
if recursive:
tests_dirs = test_dirs
@@ -69,12 +98,28 @@ def pytest_generate_tests(metafunc):
for tests_dir in tests_dirs:
test_dirs.extend(_find_test_dirs(tests_dir))
- # (Re)Generate test data
+ # Load valid tests
+ nnxTestAndNames = []
for test_dir in test_dirs:
- test = Ne16Test.load(test_dir)
- if not test.is_valid() or regenerate:
- test = Ne16TestGenerator.from_conf(test.conf)
- test.save_data(test_dir)
+ try:
+ test = NnxTest.load(nnxTestConfCls, test_dir)
+ # (Re)generate data
+ if not test.is_valid() or regenerate:
+ test = NnxTestGenerator.from_conf(test.conf)
+ test.save_data(test_dir)
+ nnxTestAndNames.append((test, test_dir))
+ except pydantic.ValidationError as e:
+ _ = e
+ nnxTestAndNames.append(
+ pytest.param(
+ (None, test_dir),
+ marks=pytest.mark.skipif(
+ True, reason=f"Invalid test {test_dir}: {e.errors}"
+ ),
+ )
+ )
- metafunc.parametrize("path", test_dirs)
+ metafunc.parametrize("nnxTestAndName", nnxTestAndNames)
metafunc.parametrize("timeout", [timeout])
+ metafunc.parametrize("nnxName", [nnxName])
+ metafunc.parametrize("nnxMemoryLayoutCls", [nnxMemoryLayoutCls])
diff --git a/test/requirements-dev.txt b/test/requirements-dev.txt
index fa0a75a..0956e5e 100644
--- a/test/requirements-dev.txt
+++ b/test/requirements-dev.txt
@@ -1,2 +1,3 @@
pyright
black
+isort
diff --git a/test/test.py b/test/test.py
index 39709b6..1893cdf 100644
--- a/test/test.py
+++ b/test/test.py
@@ -16,13 +16,16 @@
#
# SPDX-License-Identifier: Apache-2.0
+import locale
import os
import re
-from typing import Union, Optional, Tuple
-import locale
import subprocess
-from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator
from pathlib import Path
+from typing import Dict, Optional, Tuple, Type, Union
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator
HORIZONTAL_LINE = "\n" + "-" * 100 + "\n"
@@ -49,17 +52,29 @@ def captured_output(
def execute_command(
- cmd: str, timeout: int = 30, cflags: Optional[str] = None
+ cmd: str,
+ timeout: int = 30,
+ cflags: Optional[str] = None,
+ envflags: Optional[Dict[str, str]] = None,
) -> Tuple[bool, str, str, Optional[str]]:
- app_cflags = 'APP_CFLAGS="' + " ".join(cflags) + '" ' if cflags else ""
- cmd = cmd + app_cflags
+ env = os.environ
+ if cflags:
+ env["APP_CFLAGS"] = '"' + " ".join(cflags) + '"'
+ if envflags:
+ for key, value in envflags.items():
+ env[key] = value
status = None
stdout = None
try:
proc = subprocess.run(
- cmd.split(), check=True, capture_output=True, text=True, timeout=timeout
+ cmd.split(),
+ check=True,
+ capture_output=True,
+ text=True,
+ timeout=timeout,
+ env=env,
)
status = True
msg = "OK"
@@ -94,28 +109,35 @@ def assert_message(
return retval
-def test(path: str, timeout: int):
- test_name = path
- test = Ne16Test.load(path)
-
- Ne16TestHeaderGenerator().generate(test_name, test)
+def test(
+ nnxTestAndName: Tuple[NnxTest, str],
+ timeout: int,
+ nnxName: str,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+):
+ nnxTest, nnxTestName = nnxTestAndName
+ NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate(
+ nnxTestName, nnxTest
+ )
Path("app/src/nnx_layer.c").touch()
cmd = f"make -C app all run platform=gvsoc"
- passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout)
+ passed, msg, stdout, stderr = execute_command(
+ cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName}
+ )
- assert passed, assert_message(msg, test_name, cmd, stdout, stderr)
+ assert passed, assert_message(msg, nnxTestName, cmd, stdout, stderr)
match_success = re.search(r"> Success! No errors found.", stdout)
match_fail = re.search(r"> Failure! Found (\d*)/(\d*) errors.", stdout)
assert match_success or match_fail, assert_message(
- "No regexes matched.", test_name, cmd, stdout
+ "No regexes matched.", nnxTestName, cmd, stdout
)
assert not match_fail, assert_message(
f"Errors found: {match_fail.group(1)}/{match_fail.group(2)}",
- test_name,
+ nnxTestName,
cmd,
stdout,
)
diff --git a/test/testgen.py b/test/testgen.py
index e748f2e..521aecc 100644
--- a/test/testgen.py
+++ b/test/testgen.py
@@ -16,28 +16,61 @@
#
# SPDX-License-Identifier: Apache-2.0
-import os
import argparse
import json
+import os
+from typing import Optional, Set, Type, Union
+
import toml
-from typing import Optional, Union, Set
-from Ne16TestClasses import (
- Ne16TestConf,
- Ne16TestGenerator,
- Ne16Test,
- Ne16TestHeaderGenerator,
+
+from Ne16MemoryLayout import Ne16MemoryLayout
+from Ne16TestConf import Ne16TestConf
+from NeurekaMemoryLayout import NeurekaMemoryLayout
+from NeurekaTestConf import NeurekaTestConf
+from NnxTestClasses import (
+ NnxTest,
+ NnxTestConf,
+ NnxTestGenerator,
+ NnxTestHeaderGenerator,
)
-def headers_gen(args, test: Optional[Ne16Test] = None):
+def headers_gen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+ test: Optional[NnxTest] = None,
+):
if test is None:
- test = Ne16Test.load(args.test_dir)
+ test = NnxTest.load(nnxTestConfCls, args.test_dir)
+ assert test is not None
if not test.is_valid():
- test = Ne16TestGenerator.from_conf(test.conf)
- Ne16TestHeaderGenerator().generate(args.test_dir, test)
-
-
-def test_gen(args):
+ test = NnxTestGenerator.from_conf(test.conf)
+ NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate(
+ args.test_dir, test
+ )
+
+
+def print_tensors(test: NnxTest):
+ print("INPUT TENSOR:")
+ print(test.input)
+ print("WEIGHT TENSOR:")
+ print(test.weight)
+ print("SCALE TENSOR:")
+ print(test.scale)
+ print("BIAS TENSOR:")
+ print(test.bias)
+ print("GLOBAL SHIFT TENSOR:")
+ print(test.global_shift)
+ print("EXPECTED OUTPUT TENSOR:")
+ print(test.output)
+
+
+def test_gen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+):
if args.conf.endswith(".toml"):
test_conf_dict = toml.load(args.conf)
elif args.conf.endswith(".json"):
@@ -49,37 +82,71 @@ def test_gen(args):
)
exit(-1)
- test_conf = Ne16TestConf.model_validate(test_conf_dict)
- test = Ne16TestGenerator.from_conf(test_conf)
+ test_conf = nnxTestConfCls.model_validate(test_conf_dict)
+ test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors)
if not args.skip_save:
test.save(args.test_dir)
if args.headers:
- headers_gen(args, test)
-
-
-def _regen(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None:
- test = Ne16Test.load(path)
- test = Ne16TestGenerator.regenerate(test, regen_tensors)
+ headers_gen(args, nnxMemoryLayoutCls, nnxTestConfCls, test)
+ if args.print_tensors:
+ print_tensors(test)
+
+
+def _regen(
+ path: Union[str, os.PathLike],
+ regen_tensors: Set[str],
+ nnxTestConfCls: Type[NnxTestConf],
+) -> None:
+ test = NnxTest.load(nnxTestConfCls, path)
+ test = NnxTestGenerator.regenerate(test, regen_tensors)
test.save(path)
-def _regen_recursive(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None:
- if Ne16Test.is_test_dir(path):
- _regen(path, regen_tensors)
+def _regen_recursive(
+ path: Union[str, os.PathLike],
+ regen_tensors: Set[str],
+ nnxTestConfCls: Type[NnxTestConf],
+) -> None:
+ if NnxTest.is_test_dir(path):
+ _regen(path, regen_tensors, nnxTestConfCls)
return
for dirpath, _, _ in os.walk(path):
- _regen_recursive(dirpath, regen_tensors)
+ _regen_recursive(dirpath, regen_tensors, nnxTestConfCls)
-def test_regen(args):
+def test_regen(
+ args,
+ nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]],
+ nnxTestConfCls: Type[NnxTestConf],
+):
+ _ = nnxMemoryLayoutCls
regen_tensors = set(args.tensors + ["output"])
for test_dir in args.test_dirs:
if args.recurse:
- _regen_recursive(test_dir, regen_tensors)
+ _regen_recursive(test_dir, regen_tensors, nnxTestConfCls)
else:
- _regen(test_dir, regen_tensors)
+ _regen(test_dir, regen_tensors, nnxTestConfCls)
+
+
+def add_common_arguments(parser: argparse.ArgumentParser):
+ parser.add_argument(
+ "-t",
+ "--test-dir",
+ type=str,
+ dest="test_dir",
+ required=True,
+ help="Path to the test.",
+ )
+
+ parser.add_argument(
+ "-a",
+ "--accelerator",
+ choices=["ne16", "neureka"],
+ default="ne16",
+ help="Choose an accelerator. Default: ne16",
+ )
parser = argparse.ArgumentParser(
@@ -91,14 +158,7 @@ def test_regen(args):
parser_header = subparsers.add_parser(
"headers", description="Generate headers for a single test."
)
-parser_header.add_argument(
- "-t",
- "--test-dir",
- type=str,
- dest="test_dir",
- required=True,
- help="Path to the test." "basename.",
-)
+add_common_arguments(parser_header)
parser_header.set_defaults(func=headers_gen)
parser_test = subparsers.add_parser(
@@ -112,14 +172,6 @@ def test_regen(args):
required=True,
help="Path to the configuration file.",
)
-parser_test.add_argument(
- "-t",
- "--test-dir",
- type=str,
- dest="test_dir",
- required=True,
- help="Path to the test. " "basename.",
-)
parser_test.add_argument(
"--headers", action="store_true", default=False, help="Generate headers."
)
@@ -130,6 +182,14 @@ def test_regen(args):
dest="skip_save",
help="Skip saving the test.",
)
+parser_test.add_argument(
+ "--print-tensors",
+ action="store_true",
+ default=False,
+ dest="print_tensors",
+ help="Print tensor values to stdout.",
+)
+add_common_arguments(parser_test)
parser_test.set_defaults(func=test_gen)
parser_regen = subparsers.add_parser("regen", description="Regenerate test tensors.")
@@ -138,25 +198,27 @@ def test_regen(args):
type=str,
nargs="?",
default=[],
- help="Tensors that should be regenerated. Output " "included by default.",
-)
-parser_regen.add_argument(
- "-t",
- "--test-dir",
- action="append",
- dest="test_dirs",
- required=True,
- help="Path to the test.",
+ help="Tensors that should be regenerated. Output included by default.",
)
parser_regen.add_argument(
"-r",
"--recursive",
action="store_true",
default=False,
- help="Recursively search for test directiories " "inside given test directories.",
+    help="Recursively search for test directories inside given test directories.",
)
+add_common_arguments(parser_regen)
parser_regen.set_defaults(func=test_regen)
args = parser.parse_args()
-args.func(args)
+if args.accelerator == "ne16":
+ nnxMemoryLayoutCls = Ne16MemoryLayout
+ nnxTestConfCls = Ne16TestConf
+elif args.accelerator == "neureka":
+ nnxMemoryLayoutCls = NeurekaMemoryLayout
+ nnxTestConfCls = NeurekaTestConf
+else:
+ assert False, f"Unsupported accelerator {args.accelerator}."
+
+args.func(args, nnxMemoryLayoutCls, nnxTestConfCls)
diff --git a/test/tests/test_102/conf.json b/test/tests/test_102/conf.json
new file mode 100644
index 0000000..d6d0c17
--- /dev/null
+++ b/test/tests/test_102/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 4,
+ "in_width": 3,
+ "in_channel": 8,
+ "out_channel": 8,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_103/conf.json b/test/tests/test_103/conf.json
new file mode 100644
index 0000000..3eff547
--- /dev/null
+++ b/test/tests/test_103/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 25,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_104/conf.json b/test/tests/test_104/conf.json
new file mode 100644
index 0000000..d6d00e4
--- /dev/null
+++ b/test/tests/test_104/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 25,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_105/conf.json b/test/tests/test_105/conf.json
new file mode 100644
index 0000000..0f34422
--- /dev/null
+++ b/test/tests/test_105/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 20,
+ "in_width": 15,
+ "in_channel": 40,
+ "out_channel": 40,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "int8",
+ "out_type": "uint8",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": true,
+ "has_bias": true,
+ "has_relu": true
+}
\ No newline at end of file
diff --git a/test/tests/test_106/conf.json b/test/tests/test_106/conf.json
new file mode 100644
index 0000000..0b98f3a
--- /dev/null
+++ b/test/tests/test_106/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 17,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_107/conf.json b/test/tests/test_107/conf.json
new file mode 100644
index 0000000..2f8951c
--- /dev/null
+++ b/test/tests/test_107/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 17,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_108/conf.json b/test/tests/test_108/conf.json
new file mode 100644
index 0000000..7842aaa
--- /dev/null
+++ b/test/tests/test_108/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_109/conf.json b/test/tests/test_109/conf.json
new file mode 100644
index 0000000..a6b71c9
--- /dev/null
+++ b/test/tests/test_109/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": true,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_110/conf.json b/test/tests/test_110/conf.json
new file mode 100644
index 0000000..622efc4
--- /dev/null
+++ b/test/tests/test_110/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_111/conf.json b/test/tests/test_111/conf.json
new file mode 100644
index 0000000..d6714c4
--- /dev/null
+++ b/test/tests/test_111/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 1,
+ "width": 1
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 2,
+ "width": 2
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_112/conf.json b/test/tests/test_112/conf.json
new file mode 100644
index 0000000..1991c59
--- /dev/null
+++ b/test/tests/test_112/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 1,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_113/conf.json b/test/tests/test_113/conf.json
new file mode 100644
index 0000000..1dce097
--- /dev/null
+++ b/test/tests/test_113/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 0,
+ "left": 0,
+ "right": 1
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_114/conf.json b/test/tests/test_114/conf.json
new file mode 100644
index 0000000..c1ce5c3
--- /dev/null
+++ b/test/tests/test_114/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 0,
+ "bottom": 1,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/test/tests/test_115/conf.json b/test/tests/test_115/conf.json
new file mode 100644
index 0000000..19153ba
--- /dev/null
+++ b/test/tests/test_115/conf.json
@@ -0,0 +1,29 @@
+{
+ "in_height": 15,
+ "in_width": 34,
+ "in_channel": 33,
+ "out_channel": 33,
+ "padding": {
+ "top": 1,
+ "bottom": 0,
+ "left": 0,
+ "right": 0
+ },
+ "kernel_shape": {
+ "height": 3,
+ "width": 3
+ },
+ "depthwise": false,
+ "stride": {
+ "height": 1,
+ "width": 1
+ },
+ "in_type": "uint8",
+ "out_type": "int32",
+ "weight_type": "int8",
+ "scale_type": "uint8",
+ "bias_type": "int32",
+ "has_norm_quant": false,
+ "has_bias": false,
+ "has_relu": false
+}
\ No newline at end of file
diff --git a/util/hwpe.c b/util/hwpe.c
index 53c1ace..0430081 100644
--- a/util/hwpe.c
+++ b/util/hwpe.c
@@ -31,11 +31,11 @@
#define HWPE_TASK_REG_OFFSET 8
inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
- *(dev->base_addr + reg) = value;
+ dev->base_addr[reg] = value;
}
inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) {
- return *(dev->base_addr + reg);
+ return dev->base_addr[reg];
}
inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
diff --git a/util/pulp_nnx_util.c b/util/pulp_nnx_util.c
index 34db512..0107fc1 100644
--- a/util/pulp_nnx_util.c
+++ b/util/pulp_nnx_util.c
@@ -20,14 +20,16 @@
#include "pulp_nnx_util.h"
-inline int divnceil(const int dividend, const int divisor) {
- return ((dividend - 1) / divisor) + 1;
+inline int nnx_calculate_number_of_tiles(const int dim_size,
+ const int tile_size) {
+ return ((dim_size - 1) / tile_size) + 1;
}
-inline int remainder(const int dividend, const int divisor) {
- return ((dividend - 1) % divisor) + 1;
+inline int nnx_calculate_last_tile_size(const int dim_size,
+ const int tile_size) {
+ return ((dim_size - 1) % tile_size) + 1;
}
-inline uint32_t concat_half(const uint16_t high, const uint16_t low) {
+inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) {
return ((uint32_t)high << 16) | low;
}
diff --git a/util/pulp_nnx_util.h b/util/pulp_nnx_util.h
index 638e5d9..d167f6d 100644
--- a/util/pulp_nnx_util.h
+++ b/util/pulp_nnx_util.h
@@ -24,26 +24,28 @@
#include <stdint.h>
/**
- * divnceil
+ * nnx_calculate_number_of_tiles
*
- * Does integer division and ceiling of it.
+ * Calculates the number of iterations to go through a dimension.
+ * It does it by dividing the dimension with the tile size and doing a ceiling
+ * the result.
*/
-int divnceil(const int dividend, const int divisor);
+int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size);
/**
- * remainder
+ * nnx_calculate_last_tile_size
*
- * Calculates the remainder but if the remainder should be 0,
- * returns divisor. Used for calculation of the last `remainding`
- * iteration of the tile.
+ * Calculates the size of the last executed tile by calculating the remainder of
+ * the dim_size and the tile_size. In case the remainder is 0, it returns the
+ * full tile_size.
*/
-int remainder(const int dividend, const int divisor);
+int nnx_calculate_last_tile_size(const int dim_size, const int tile_size);
/**
- * concat_half
+ * nnx_concat_half
*
* Concatenate 2 16-bit numbers into a 32-bit number.
*/
-uint32_t concat_half(const uint16_t high, const uint16_t low);
+uint32_t nnx_concat_half(const uint16_t high, const uint16_t low);
#endif // __NNX_UTIL_H__