From 9afe07704f099cc9a052712797425577df3f73d5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 1 Feb 2024 08:54:56 +0100 Subject: [PATCH] Add neureka support (#2) * Add neureka support * Fix Ne16 weight rolling was unpacking to bits instead of 8 * Fix Arpan's name in the contributors * Add multi-accelerator support and neureka as a target * Skip invalid tests * Add readme per accelerator * Remove ne16 input dim 2 stride calculation * Move common validators to NnxTest * Rename test/.py to test/MemoryLayout.py * Extract functional model from test gen * Add isort * Add without norm_quant * Remove -flto * Move stride shift to ne16_task_set_dims_stride2x2 function * Set quantMode in *_set_bits function * Fix output d0 stride * Fixes to strides and stride2x2 Fixed stride2x2 mode for 32bit output. Changed stride meaning from number of elements in that dimension, to number of bytes between elements in that dimension. This also required a change of dimension names. * Rename divnceil and remainder, and add nnx_ prefix * Add citation * Add sdk and compiler commit hashes * Change task size to a define * Remove __PLATFORM__ check from the library since it's pulp-sdk specific * Change channel and bits with w_in_stride for task_set_ptrs --- .gitlab-ci.yml | 24 +- CHANGELOG.md | 20 + README.md | 80 ++-- inc/pulp_nnx_ne16.h | 15 +- inc/pulp_nnx_neureka.h | 61 +++ ne16/README.md | 36 ++ ne16/hal/ne16.c | 2 - ne16/hal/ne16.h | 3 +- ne16/hal/ne16_task.c | 208 +++++---- ne16/hal/ne16_task.h | 64 ++- ne16/hal/ne16_task_defs.h | 37 +- neureka/README.md | 34 ++ neureka/bsp/neureka_siracusa_bsp.c | 78 ++++ neureka/bsp/neureka_siracusa_bsp.h | 67 +++ neureka/gvsoc/neureka_gvsoc.h | 54 +++ .../pulp_nnx_error_codes.h => hal/neureka.c} | 25 +- .../{src/pulp_nnx_util.c => hal/neureka.h} | 23 +- neureka/hal/neureka_task.c | 239 ++++++++++ neureka/hal/neureka_task.h | 187 ++++++++ neureka/hal/neureka_task_defs.h | 124 ++++++ neureka/inc/pulp_nnx_defs.h | 167 ------- neureka/inc/pulp_nnx_hal.h 
| 217 --------- neureka/inc/pulp_nnx_util.h | 27 -- neureka/src/pulp_nnx_hal.c | 412 ------------------ src/pulp_nnx_ne16.c | 43 +- src/pulp_nnx_neureka.c | 76 ++++ test/.isort.cfg | 4 + test/HeaderWriter.py | 7 +- test/Ne16.py | 94 ---- test/Ne16MemoryLayout.py | 99 +++++ test/Ne16TestConf.py | 111 +++++ test/NeuralEngineFunctionalModel.py | 123 ++++++ test/NeurekaMemoryLayout.py | 158 +++++++ test/NeurekaTestConf.py | 101 +++++ .../{Ne16TestClasses.py => NnxTestClasses.py} | 339 ++++++-------- test/README.md | 6 + test/TestClasses.py | 9 +- test/app/Makefile | 8 +- test/app/src/main.c | 13 +- test/app/src/nnx_layer.c | 217 ++++++--- test/conf.toml | 2 +- test/conftest.py | 61 ++- test/requirements-dev.txt | 1 + test/test.py | 54 ++- test/testgen.py | 174 +++++--- test/tests/test_102/conf.json | 29 ++ test/tests/test_103/conf.json | 29 ++ test/tests/test_104/conf.json | 29 ++ test/tests/test_105/conf.json | 29 ++ test/tests/test_106/conf.json | 29 ++ test/tests/test_107/conf.json | 29 ++ test/tests/test_108/conf.json | 29 ++ test/tests/test_109/conf.json | 29 ++ test/tests/test_110/conf.json | 29 ++ test/tests/test_111/conf.json | 29 ++ test/tests/test_112/conf.json | 29 ++ test/tests/test_113/conf.json | 29 ++ test/tests/test_114/conf.json | 29 ++ test/tests/test_115/conf.json | 29 ++ util/hwpe.c | 4 +- util/pulp_nnx_util.c | 12 +- util/pulp_nnx_util.h | 20 +- 62 files changed, 2827 insertions(+), 1519 deletions(-) create mode 100644 inc/pulp_nnx_neureka.h create mode 100644 ne16/README.md create mode 100644 neureka/README.md create mode 100644 neureka/bsp/neureka_siracusa_bsp.c create mode 100644 neureka/bsp/neureka_siracusa_bsp.h create mode 100644 neureka/gvsoc/neureka_gvsoc.h rename neureka/{inc/pulp_nnx_error_codes.h => hal/neureka.c} (56%) rename neureka/{src/pulp_nnx_util.c => hal/neureka.h} (62%) create mode 100644 neureka/hal/neureka_task.c create mode 100644 neureka/hal/neureka_task.h create mode 100644 neureka/hal/neureka_task_defs.h delete mode 100644 
neureka/inc/pulp_nnx_defs.h delete mode 100644 neureka/inc/pulp_nnx_hal.h delete mode 100644 neureka/inc/pulp_nnx_util.h delete mode 100644 neureka/src/pulp_nnx_hal.c create mode 100644 src/pulp_nnx_neureka.c create mode 100644 test/.isort.cfg delete mode 100644 test/Ne16.py create mode 100644 test/Ne16MemoryLayout.py create mode 100644 test/Ne16TestConf.py create mode 100644 test/NeuralEngineFunctionalModel.py create mode 100644 test/NeurekaMemoryLayout.py create mode 100644 test/NeurekaTestConf.py rename test/{Ne16TestClasses.py => NnxTestClasses.py} (53%) create mode 100644 test/tests/test_102/conf.json create mode 100644 test/tests/test_103/conf.json create mode 100644 test/tests/test_104/conf.json create mode 100644 test/tests/test_105/conf.json create mode 100644 test/tests/test_106/conf.json create mode 100644 test/tests/test_107/conf.json create mode 100644 test/tests/test_108/conf.json create mode 100644 test/tests/test_109/conf.json create mode 100644 test/tests/test_110/conf.json create mode 100644 test/tests/test_111/conf.json create mode 100644 test/tests/test_112/conf.json create mode 100644 test/tests/test_113/conf.json create mode 100644 test/tests/test_114/conf.json create mode 100644 test/tests/test_115/conf.json diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b595682..4c7b267 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,25 +20,41 @@ stages: - lint - test -format_python: +python_format: stage: lint tags: - python-lint script: - black --check . -static_check_python: +python_sort_imports: + stage: lint + tags: + - python-lint + script: + - isort --check test + +python_static_check: stage: lint tags: - python-lint script: - pyright . 
-run_test0: +run_ne16_test: stage: test tags: - gap9-sdk artifacts: untracked: true script: - - cd test && pytest test.py --test-dir tests --recursive + - cd test && pytest test.py --test-dir tests --recursive -A ne16 + +run_neureka_test: + stage: test + tags: + - siracusa-sdk + artifacts: + untracked: true + script: + - cd test && pytest test.py --test-dir tests --recursive -A neureka diff --git a/CHANGELOG.md b/CHANGELOG.md index 48a4461..84b516f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## [Unreleased] + +### Added + +- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels +- Support for kernels without normalization and quantization for NE16 +- isort check +- publication citation + +### Changed + +- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant` +- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension +- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE` + +### Removed + +- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2` +- `mode` attribute from `ne16_quant_t` structure + ## [0.3.0] - 2024-01-14 ### Added diff --git a/README.md b/README.md index be8c9be..1671dc7 100644 --- a/README.md +++ b/README.md @@ -39,51 +39,22 @@ _Note: The accelerator can provide additional helper functions if needed._ ## Accelerators -### NE16 - -Github repo [link](https://github.com/pulp-platform/ne16). 
- -#### Implemented features - -- [x] Convolution w/ kernel shape 1x1 -- [x] Convolution w/ kernel shape 3x3 -- [x] Depthwise convolution w/ kernel shape 3x3 -- [x] Stride 1x1 -- [x] Stride 2x2 -- [ ] Normalization and quantization - - [x] With - - [ ] Without - - [x] Relu (w/ and w/o) - - [x] Bias (w/ and w/o) - - [ ] Per-channel shift - - [x] Per-layer shift - - [ ] Rounding -- [ ] Input type - - [x] uint8 - - [ ] uint16 -- [ ] Output type - - [x] int8 - - [x] uint8 (only w/ Relu) - - [ ] int32 - - [ ] uint32 (only w/ Relu) -- [ ] Scale type - - [x] uint8 - - [ ] uint16 - - [ ] uint32 -- [x] Bias type - - [x] int32 -- [ ] Weight type - - [x] int8 - - [ ] int2-7 - -### Neureka - -**Untested and considered broken.** +- [NE16](ne16/README.md) +- [Neureka](neureka/README.md) ## Testing You can find information about testing in the dedicated [README](test/README.md). +### Environment + +The library was tested with following pairs of SDKs and compilers: + +| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash | +| --- | --------------- | -------- | -------------------- | +| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 | +| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/pulp-platform/pulp-riscv-gnu-toolchain) | 9938bd8fcf (release v1.0.16) | + ## Contributing Bug reports and feature requests should be reported through issues. @@ -93,15 +64,38 @@ All the development should be done through forks and merged onto the `dev` branc The library will follow the [Semantic Versioning](https://semver.org/). -## Citing +## Publication + +
+If you use PULP-NNX in your work, you can cite us: + +``` +@inproceedings{10.1145/3607889.3609092, + author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco}, + title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study}, + year = {2024}, + isbn = {9798400702907}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + url = {https://doi.org/10.1145/3607889.3609092}, + doi = {10.1145/3607889.3609092}, + abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.}, + booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems}, + pages = {9–10}, + numpages = {2}, + keywords = {TinyML, MCUs, deep learning, HW accelerators}, + location = {Hamburg, Germany}, + series = {CASES '23 Companion} +} +``` -*TBA* +
## Contributors * Luka Macan <[luka.macan@unibo.it](mailto:luka.macan@unibo.it)> * Francesco Conti <[fconti@unibo.it](mailto:fconti@unibo.it)> -* Arpan Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)> +* Arpan Suravi Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)> ## License diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h index eff9a60..97e6e2e 100644 --- a/inc/pulp_nnx_ne16.h +++ b/inc/pulp_nnx_ne16.h @@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev); /** ne16_nnx_dispatch * * Dispatch a task to the accelerator. - * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0. + * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. */ int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task); @@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task); */ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); - /* Additional helper functions */ /** ne16_nnx_dispatch_stride2x2 @@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). * Works only if the k_out is divisible by 2. 
*/ -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker); +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker); diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h new file mode 100644 index 0000000..25ef4a8 --- /dev/null +++ b/inc/pulp_nnx_neureka.h @@ -0,0 +1,61 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka.h" +#include "neureka_siracusa_bsp.h" +#include "neureka_task.h" +#include + +/* PULP-NNX interface */ + +void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf); +void neureka_nnx_term(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_check + * + * Check whether you can dispatch to the accelerator. + */ +int neureka_nnx_dispatch_check(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_wait + * + * Block until you can dispatch to the accelerator. 
+ */ +void neureka_nnx_dispatch_wait(neureka_dev_t *dev); + +/** neureka_nnx_dispatch + * + * Dispatch a task to the accelerator. + * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. + */ +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_check + * + * Check whether the task has been resolved. + */ +int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_wait + * + * Block until you can resolve the task. + */ +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task); diff --git a/ne16/README.md b/ne16/README.md new file mode 100644 index 0000000..9f05956 --- /dev/null +++ b/ne16/README.md @@ -0,0 +1,36 @@ +# NE16 + +## Docs + +- Github repo [link](https://github.com/pulp-platform/ne16). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [x] Stride 2x2 +- [ ] Normalization and quantization + - [x] With + - [x] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [ ] Input type + - [x] uint8 + - [ ] uint16 +- [ ] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [x] int32 +- [ ] Scale type + - [x] uint8 + - [ ] uint16 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c index 97859b4..d92a7d5 100644 --- a/ne16/hal/ne16.c +++ b/ne16/hal/ne16.c @@ -23,8 +23,6 @@ #define NE16_STATUS_EMPTY (0x000) #define NE16_STATUS_FULL (0x101) -inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; } - inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) { uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); return (status & 0x1) + ((status >> 8) & 0x1); diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h index c4c3a19..88ebee7 100644 --- a/ne16/hal/ne16.h +++ 
b/ne16/hal/ne16.h @@ -24,11 +24,12 @@ #include "hwpe.h" #include +#define NE16_TASK_QUEUE_SIZE (2) + typedef struct ne16_dev_t { hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ } ne16_dev_t; -int ne16_task_queue_size(ne16_dev_t *dev); int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev); int ne16_task_queue_empty(ne16_dev_t *dev); int ne16_task_queue_full(ne16_dev_t *dev); diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 0ba54d5..f8408da 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -22,9 +22,9 @@ #include "ne16_task_defs.h" #include "pulp_nnx_util.h" -inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { +uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -41,41 +41,65 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride) { - const uint32_t flag_mode16 = - input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; - - *task = (ne16_task_t){ - .outbytes = output_bits / 8, - .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 - : NE16_WEIGHT_D0_STRIDE_MODE8, - .qw = weights_bits, - .stride_shift = stride == 2 ? 1 : 0, - .output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT - : NE16_OUTPUT_CHANNEL_THROUGHPUT, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; - - const int flag_stride2x2 = stride == 2 ? 
NE16_FLAG_STRIDE_2x2 : 0; +void ne16_task_init(ne16_task_t *task) { *task = (ne16_task_t){.data = {0}}; } +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = + depthwise ? NE16_SUBTILE_INPUT_CHANNEL : NE16_SUBTILE_OUTPUT_CHANNEL; const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; + const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0; + + task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE | NE16_MASK_FLAG_STRIDE_2x2); + task->data.cfg.conf0 |= flag_mode | flag_stride2x2; +} + +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weight_bits) { + const uint32_t flag_mode16 = + input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; + + ne16_quant_mode_e quantMode; + if (output_bits == 16) { + quantMode = quantMode16Bit; + } else if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->weight_d0_stride = + flag_mode16 ? 
NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8; + task->qw = weight_bits; + task->data.cfg.conf0 &= ~(NE16_MASK_QUANT_MODE | NE16_MASK_FLAG_MODE16 | + NE16_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | flag_mode16 | (weight_bits - 1); +} + +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm) { + task->data.cfg.conf0 &= + ~(NE16_MASK_QUANT_FUNCTION | NE16_MASK_SHIFT_AMOUNT | + NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE | + NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT); task->data.cfg.conf0 |= - NE16_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING | - norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode | - flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; + NE16_FLAG_NORM_QUANT | quant.function | (quant.shift_amount << 16) | + quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | norm.mode | + norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; +} - task->data.cfg.weight_offset_factor = weights_offset_factor; +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NE16_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; } /** ne16_pad_ptr @@ -84,21 +108,18 @@ void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, * it was the start to the padded data. * Necessary for input pointer when it's padded. 
*/ -inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { - return ptr - (padding_top * width + padding_left) * channel * bits / 8; +uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride, + const uint8_t padding_top, const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * width_stride; } -inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { +void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, + uint32_t w_in_stride, uint8_t padding_top, + uint8_t padding_left, uint32_t output_ptr, + uint32_t weights_ptr, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr) { task->data.infeat_ptr = - ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); + ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left); task->data.outfeat_ptr = output_ptr; task->data.weights_ptr = weights_ptr; task->data.scale_ptr = scale_ptr; @@ -107,100 +128,101 @@ inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, } void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = + nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL); const ne16_stride_t input_stride = { - .d0 = k_in_stride, - .d1 = k_in_stride * w_in_stride, - .d2 = task->depthwise ? 
0 - : k_in_stride * NE16_FILTER_BUFFER_SIZE * - NE16_FILTER_BUFFER_SIZE}; + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - // WARNING: Stride works only for even output channel sizes (divisible by 2) - const ne16_stride_t output_stride = { - .d0 = 32, - .d1 = (k_out_stride * task->outbytes) >> task->stride_shift, - .d2 = - (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift}; + const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw; task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else if (!task->depthwise) { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = 0; - task->data.cfg.weights_stride.d2 = 0; } + task->data.cfg.weights_stride.d2 = 0; } void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { - const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); - const uint16_t num_Ki = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t num_Ho = divnceil(h_out, NE16_FILTER_SIZE); - const uint16_t num_Wo = divnceil(w_out, NE16_FILTER_SIZE); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t rem_Ho = 
remainder(h_out, NE16_FILTER_SIZE); - const uint16_t rem_Wo = remainder(w_out, NE16_FILTER_SIZE); + const uint16_t num_Ko = + nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel); + const uint16_t num_Ki = + nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t num_Ho = + nnx_calculate_number_of_tiles(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = + nnx_calculate_number_of_tiles(w_out, NE16_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = + nnx_calculate_last_tile_size(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = + nnx_calculate_last_tile_size(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t rem_Ho = + nnx_calculate_last_tile_size(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = + nnx_calculate_last_tile_size(w_out, NE16_SUBTILE_OUTPUT_WIDTH); const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; const uint16_t rem_Wi = (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; const ne16_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; + .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki), + .HoWo = nnx_concat_half(num_Ho, num_Wo)}, + .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki), + .HoWo = nnx_concat_half(rem_Ho, rem_Wo), + .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}}; task->data.cfg.subtile = subtile; } -inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { +void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | 
(value & 0xff); } -inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left) { +void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, padding_right); ne16_task_set_padding(task, padding_top, padding_bottom, padding_left, @@ -209,18 +231,20 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t 
padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { const uint8_t stride = 2; - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + // WARNING: works only for even output channel stride (divisible by 2) + ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride >> 1, + w_out_stride >> 1); ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); + k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, + 0); const uint8_t padding_bottom_new = (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom; diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index df16b6c..69bc78c 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -60,7 +60,6 @@ typedef enum ne16_quant_function_e { typedef struct ne16_quant_t { // Shift amount must be in range 0x00-0x1F unsigned shift_amount; - ne16_quant_mode_e mode; ne16_quant_function_e function; int flag_rounding; } ne16_quant_t; @@ -110,38 +109,46 @@ typedef struct ne16_task_data_t { typedef struct ne16_task_t { ne16_task_data_t data; - uint8_t outbytes; uint8_t weight_d0_stride; uint8_t qw; - uint8_t stride_shift; - uint8_t output_channel_throughput; + uint8_t subtile_output_channel; uint8_t kernel_shape; uint8_t depthwise; uint8_t id; } ne16_task_t; -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride); +void ne16_task_init(ne16_task_t *task); +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t 
weight_bits); +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm); +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, const uint8_t padding_left); + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left); void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, - uint32_t k_in, uint8_t bits_in, uint8_t padding_top, + uint32_t w_in_stride, uint8_t padding_top, uint8_t padding_left, uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr, uint32_t shift_ptr, uint32_t bias_ptr); +/** ne16_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t h_out_stride, + const uint32_t w_out_stride); void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, @@ -152,19 +159,32 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left); +/** ne16_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. 
There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); +/** ne16_task_set_dims_stride2x2 + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h index 803e30e..d3d7297 100644 --- a/ne16/hal/ne16_task_defs.h +++ b/ne16/hal/ne16_task_defs.h @@ -25,8 +25,13 @@ #define NE16_FILTER_SIZE (3) #define NE16_FILTER_BUFFER_SIZE (5) -#define NE16_INPUT_CHANNEL_THROUGHPUT (16) -#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32) +#define NE16_SUBTILE_INPUT_HEIGHT (5) +#define 
NE16_SUBTILE_INPUT_WIDTH (5) +#define NE16_SUBTILE_INPUT_CHANNEL (16) +#define NE16_SUBTILE_OUTPUT_HEIGHT (3) +#define NE16_SUBTILE_OUTPUT_WIDTH (3) +#define NE16_SUBTILE_OUTPUT_CHANNEL (32) +#define NE16_OUTPUT_BANDWIDTH_BYTES (32) #define NE16_WEIGHT_D0_STRIDE_MODE8 (2) #define NE16_WEIGHT_D0_STRIDE_MODE16 (1) @@ -59,12 +64,6 @@ #define NE16_REG_FILTER_MASKING 22 #define NE16_REG_CONF0 23 -/* SHIFT */ - -#define NE16_SHIFT_FLAG_NORM_BIAS (25) -#define NE16_SHIFT_FLAG_NORM_SHIFT (24) -#define NE16_SHIFT_ROUNDING (11) - /* CONF0 FLAGS */ #define NE16_FLAG_NORM_BIAS (1 << 25) @@ -81,7 +80,7 @@ #define NE16_NORM_MODE_8BIT (0 << 12) #define NE16_NORM_MODE_16BIT (1 << 12) #define NE16_NORM_MODE_32BIT (2 << 12) -#define NE16_FLAG_ROUND (1 << 11) +#define NE16_FLAG_ROUNDING (1 << 11) #define NE16_FLAG_STRIDE_2x2 (1 << 8) #define NE16_FLAG_LINEAR_MODE (1 << 7) #define NE16_FLAG_MODE_3x3 (0 << 5) @@ -91,10 +90,26 @@ #define NE16_FLAG_MODE_BASIC (0 << 3) #define NE16_FLAG_MODE16 (1 << 3) +/* SHIFT */ + +#define NE16_SHIFT_FLAG_NORM_BIAS (25) +#define NE16_SHIFT_FLAG_NORM_SHIFT (24) +#define NE16_SHIFT_FLAG_ROUNDING (11) + /* Masks */ -#define NE16_MASK_QUANT_FUNCTION (1 << 23) -#define NE16_MASK_QUANT_MODE (3 << 21) +#define NE16_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NE16_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NE16_MASK_QUANT_FUNCTION (0x1 << 23) +#define NE16_MASK_QUANT_MODE (0x3 << 21) +#define NE16_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NE16_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NE16_MASK_NORM_MODE (0x3 << 12) +#define NE16_MASK_FLAG_ROUNDING (0x1 << 11) +#define NE16_MASK_FLAG_STRIDE_2x2 (0x1 << 8) +#define NE16_MASK_FLAG_MODE (0x3 << 5) +#define NE16_MASK_FLAG_MODE16 (0x1 << 3) +#define NE16_MASK_FLAG_WEIGHT_BITS (0x7 << 0) /* PADDING */ diff --git a/neureka/README.md b/neureka/README.md new file mode 100644 index 0000000..9c83f4e --- /dev/null +++ b/neureka/README.md @@ -0,0 +1,34 @@ +# Neureka + +## Docs + +Github repo 
[link](https://github.com/siracusa-soc/ne). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [ ] Normalization and quantization + - [x] With + - [x] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [x] Input type + - [x] uint8 + - [x] int8 +- [x] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [x] int32 +- [ ] Scale type + - [x] uint8 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c new file mode 100644 index 0000000..57136fd --- /dev/null +++ b/neureka/bsp/neureka_siracusa_bsp.c @@ -0,0 +1,78 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_siracusa_bsp.h" +#include + +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ + (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \ + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff +#define NEUREKA_SIRACUSA_MAX_STALL (8) +#define NEUREKA_SIRACUSA_EVENT (1 << 12) +#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000) + +void neureka_siracusa_hci_setpriority_neureka() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_setpriority_core() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_reset_max_stall() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf) { + neureka_siracusa_hci_setpriority_neureka(); + neureka_siracusa_hci_set_max_stall(conf->max_stall); +} + +void neureka_siracusa_close() { + neureka_siracusa_hci_reset_max_stall(); + neureka_siracusa_hci_setpriority_core(); +} + +void neureka_siracusa_event_wait_and_clear() { + eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT); +} + +static const neureka_dev_t neureka_siracusa_dev = { + .hwpe_dev = (struct 
hwpe_dev_t){ + .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}}; + +const neureka_dev_t *neureka_siracusa_get_dev() { + return &neureka_siracusa_dev; +} diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h new file mode 100644 index 0000000..be75a20 --- /dev/null +++ b/neureka/bsp/neureka_siracusa_bsp.h @@ -0,0 +1,67 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_siracusa_BSP_H__ +#define __NEUREKA_siracusa_BSP_H__ + +#include "neureka.h" +#include <stdint.h> + +/** + * neureka_siracusa_hci_setpriority_neureka + * + * Set HCI interconnect bus priority to prioritize neureka. + */ +void neureka_siracusa_hci_setpriority_neureka(); + +/** + * neureka_siracusa_hci_setpriority_core + * + * Set HCI bus priority to prioritize cores. + */ +void neureka_siracusa_hci_setpriority_core(); + +/** + * neureka_siracusa_hci_reset_max_stall + * + * Reset the HCI bus maxstall parameter. + * TODO: Check if it disables it also or just resets? + */ +void neureka_siracusa_hci_reset_max_stall(); + +/** + * neureka_siracusa_hci_set_max_stall + * + * Set the HCI bus maxstall. Maxstall defines how many cycles + * will the HCI bus stall the lower priority master, i.e. neureka or core, + * before letting it do a transaction.
+ */ +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall); + +typedef struct neureka_siracusa_conf_t { + int max_stall; +} neureka_siracusa_conf_t; + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf); +void neureka_siracusa_close(); +void neureka_siracusa_event_wait_and_clear(); +const neureka_dev_t *neureka_siracusa_get_dev(); + +#endif // !__NEUREKA_siracusa_BSP_H__ diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h new file mode 100644 index 0000000..37eeab0 --- /dev/null +++ b/neureka/gvsoc/neureka_gvsoc.h @@ -0,0 +1,54 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_GVSOC_H__ +#define __NEUREKA_GVSOC_H__ + +#include "neureka.h" +#include "neureka_task.h" + +#define NEUREKA_REG_GVSOC_LOG_LEVEL 24 +#define NEUREKA_REG_GVSOC_LOG_FORMAT 25 + +typedef enum neureka_gvsoc_log_format_e { + NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0, + NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3 +} neureka_gvsoc_log_format_e; + +typedef enum neureka_gvsoc_log_level_e { + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0, + NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1, + NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2, + NEUREKA_GVSOC_LOG_LEVEL_ALL = 3 +} neureka_gvsoc_log_level_e; + +static void neureka_gvsoc_log_activate(neureka_dev_t *dev, + neureka_gvsoc_log_level_e log_level, + neureka_gvsoc_log_format_e format) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level); + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format); +} + +static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); +} + +#endif // __NEUREKA_GVSOC_H__ diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c similarity index 56% rename from neureka/inc/pulp_nnx_error_codes.h rename to neureka/hal/neureka.c index dc71575..dc829d9 100644 --- a/neureka/inc/pulp_nnx_error_codes.h +++ b/neureka/hal/neureka.c @@ -18,15 +18,20 @@ * SPDX-License-Identifier: Apache-2.0 */ -#ifndef __NE16_ERROR_CODES_H__ -#define __NE16_ERROR_CODES_H__ +#include "neureka.h" -typedef enum { - success = 0, - weightBitwidthOutOfBounds, - unsupportedWeightOffsetMode, - unsupportedFeatureBitwidth, - dimensionMismatch -} nnx_error_code; +#define NEUREKA_STATUS_EMPTY (0x000) +#define NEUREKA_STATUS_FULL (0x101) -#endif // __NE16_ERROR_CODES_H__ \ No newline at end of file +inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) { + uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); + return 
(status & 0x1) + ((status >> 8) & 0x1); +} + +inline int neureka_task_queue_empty(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY; +} + +inline int neureka_task_queue_full(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL; +} diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h similarity index 62% rename from neureka/src/pulp_nnx_util.c rename to neureka/hal/neureka.h index daaaf2b..eae77a1 100644 --- a/neureka/src/pulp_nnx_util.c +++ b/neureka/hal/neureka.h @@ -18,13 +18,20 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "pulp_nnx_util.h" -#include "pulp_nnx_hal.h" +#ifndef __NEUREKA_H__ +#define __NEUREKA_H__ -void nnx_activate_gvsoc_logging(int log_level) { - NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level); -} +#include "hwpe.h" +#include -void nnx_deactivate_gvsoc_logging() { - NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0); -} +#define NEUREKA_TASK_QUEUE_SIZE (2) + +typedef struct neureka_dev_t { + hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ +} neureka_dev_t; + +int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev); +int neureka_task_queue_empty(neureka_dev_t *dev); +int neureka_task_queue_full(neureka_dev_t *dev); + +#endif // __NEUREKA_H__ diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c new file mode 100644 index 0000000..501b2b9 --- /dev/null +++ b/neureka/hal/neureka_task.c @@ -0,0 +1,239 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_task.h" +#include "neureka_task_defs.h" +#include "pulp_nnx_util.h" + +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { + uint32_t tile_padding = padding; + if (i_height > 0) { + tile_padding &= ~(0xf << 28); + } + if (i_width < n_width - 1) { + tile_padding &= ~(0xf << 24); + } + if (i_height < n_height - 1) { + tile_padding &= ~(0xf << 20); + } + if (i_width > 0) { + tile_padding &= ~(0xf << 16); + } + return tile_padding; +} + +void neureka_task_init(neureka_task_t *task) { + *task = (neureka_task_t){.data = {0}}; +} + +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, + const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_OUTPUT_CHANNEL; + task->subtile_input_channel = kernel_shape == 3 + ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1; + + const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 + : depthwise == 1 ? 
NEUREKA_FLAG_MODE_3x3_DW + : NEUREKA_FLAG_MODE_3x3; + + task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE); + task->data.cfg.conf0 |= flag_mode; +} + +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits) { + neureka_quant_mode_e quantMode; + if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->qw = weight_bits; + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | (weight_bits - 1); +} + +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm) { + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT | + NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS | + NEUREKA_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | + (quant.shift_amount << 16) | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT; +} + +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; +} + +void neureka_task_set_input_signed(neureka_task_t *task) { + task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_input_unsigned(neureka_task_t *task) { + task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE; + task->data.cfg.conf0 |= weight_source; +} + +/** neureka_pad_ptr + * + * Calculate the pointer to the start of the ptr as if + * it was the start to the padded data. 
+ * Necessary for input pointer when it's padded. + */ +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * width_stride; +} + +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { + task->data.infeat_ptr = + neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left); + task->data.outfeat_ptr = output_ptr; + task->data.weights_ptr = weights_ptr; + task->data.scale_ptr = scale_ptr; + task->data.scale_shift_ptr = shift_ptr; + task->data.scale_bias_ptr = bias_ptr; +} + +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + + const neureka_stride_t input_stride = { + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; + task->data.cfg.input_stride = input_stride; + + const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; + task->data.cfg.output_stride = output_stride; + + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES; + if (task->kernel_shape == 1) { // 1x1 + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in; + } else if (!task->depthwise) { // 3x3 + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in; + } else { // 3x3 depthwise + task->data.cfg.weights_stride.d1 = 0; + } + task->data.cfg.weights_stride.d2 = 0; +} + +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, 
const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right) { + const uint16_t num_Ko = + nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel); + const uint16_t num_Ki = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + const uint16_t num_Ho = + nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = + nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = + nnx_calculate_last_tile_size(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = + nnx_calculate_last_tile_size(k_in, task->subtile_input_channel); + const uint16_t rem_Ho = + nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = + nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + const uint16_t rem_Hi = + (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; + const uint16_t rem_Wi = + (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - padding_right; + + const neureka_subtile_t subtile = { + .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki), + .HoWo = nnx_concat_half(num_Ho, num_Wo)}, + .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki), + .HoWo = nnx_concat_half(rem_Ho, rem_Wo), + .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}}; + task->data.cfg.subtile = subtile; +} + +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { + task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | + ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | + (value & 0xff); +} + +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { + task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | + ((bottom & 0xff) << 8) | ((left & 0xff) << 0); +} + +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left) { + neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); + neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, + padding_right); + neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, + padding_right, 0); +} diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h new file mode 100644 index 0000000..2d06468 --- /dev/null +++ b/neureka/hal/neureka_task.h @@ -0,0 +1,187 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_TASK_H__ +#define __NEUREKA_TASK_H__ + +#include "neureka_task_defs.h" +#include + +typedef enum neureka_task_flag_e { + neurekaTaskFlagFalse = 0, + neurekaTaskFlagTrue = 1 +} neureka_task_flag_e; + +typedef enum neureka_weight_source_e { + neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM, + neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM +} neureka_weight_source_e; + +typedef enum neureka_weight_offset_mode_e { + weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, + weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE +} neureka_weight_offset_mode_e; + +typedef enum { + normMode8Bit = NEUREKA_NORM_MODE_8BIT, + normMode32Bit = NEUREKA_NORM_MODE_32BIT +} neureka_norm_mode_e; + +typedef struct neureka_norm_t { + neureka_norm_mode_e mode; + int flag_bias; + int flag_shift; +} neureka_norm_t; + +typedef enum neureka_quant_mode_e { + quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, + quantMode32Bit = NEUREKA_QUANT_MODE_32BIT +} neureka_quant_mode_e; + +typedef enum neureka_quant_function_e { + quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, + quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU +} neureka_quant_function_e; + +typedef struct neureka_quant_t { + // Shift amount must be in range 0x00-0x1F + unsigned shift_amount; + neureka_quant_function_e function; + int flag_rounding; +} neureka_quant_t; + +typedef struct neureka_stride_t { + uint32_t d0; + uint32_t d1; + uint32_t d2; +} 
neureka_stride_t; + +typedef struct neureka_subtile_remainder_t { + uint32_t KoKi; + uint32_t HoWo; + uint32_t HiWi; +} neureka_subtile_remainder_t; + +typedef struct neureka_subtile_number_t { + uint32_t KoKi; + uint32_t HoWo; +} neureka_subtile_number_t; + +typedef struct neureka_subtile_t { + neureka_subtile_remainder_t remainder; + neureka_subtile_number_t number; +} neureka_subtile_t; + +typedef struct neureka_cfg_t { + neureka_stride_t input_stride; + neureka_stride_t output_stride; + neureka_stride_t weights_stride; + neureka_subtile_t subtile; + uint32_t padding; + uint32_t weight_offset_factor; + uint32_t filter_mask; + uint32_t conf0; +} neureka_cfg_t; + +typedef struct neureka_task_data_t { + uint32_t weights_ptr; + uint32_t infeat_ptr; + uint32_t outfeat_ptr; + uint32_t scale_ptr; + uint32_t scale_shift_ptr; + uint32_t scale_bias_ptr; + neureka_cfg_t cfg; +} neureka_task_data_t; + +typedef struct neureka_task_t { + neureka_task_data_t data; + uint8_t qw; + uint8_t subtile_output_channel; + uint8_t subtile_input_channel; + uint8_t kernel_shape; + uint8_t depthwise; + uint8_t id; +} neureka_task_t; + +void neureka_task_init(neureka_task_t *task); +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits); +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm); +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); +void neureka_task_set_input_signed(neureka_task_t *task); +void neureka_task_set_input_unsigned(neureka_task_t *task); +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source); +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t 
i_width, uint32_t n_height, + uint32_t n_width); +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left); +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr); +/** neureka_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride); +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right); +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left); +/** neureka_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. 
+ */ +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left); + +#endif // !__NEUREKA_TASK_H__ diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h new file mode 100644 index 0000000..fa08289 --- /dev/null +++ b/neureka/hal/neureka_task_defs.h @@ -0,0 +1,124 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_DEFS_H__ +#define __NEUREKA_DEFS_H__ + +/* ARCHITECTURE */ + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28) + +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6) +#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) + +#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32) + +/* TASK REGISTERS */ + +// job configuration +#define NEUREKA_REG_WEIGHTS_PTR 0 +#define NEUREKA_REG_INFEAT_PTR 1 +#define NEUREKA_REG_OUTFEAT_PTR 2 +#define NEUREKA_REG_SCALE_PTR 3 +#define NEUREKA_REG_SCALE_SHIFT_PTR 4 +#define NEUREKA_REG_SCALE_BIAS_PTR 5 +#define NEUREKA_REG_INFEAT_D0_STRIDE 6 +#define NEUREKA_REG_INFEAT_D1_STRIDE 7 +#define NEUREKA_REG_INFEAT_D2_STRIDE 8 +#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9 +#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10 +#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11 +#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12 +#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13 +#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14 +#define NEUREKA_REG_SUBTILE_REMAINDER_0 15 +#define NEUREKA_REG_SUBTILE_REMAINDER_1 16 +#define NEUREKA_REG_SUBTILE_REMAINDER_2 17 +#define NEUREKA_REG_SUBTILE_NUMBER_0 18 +#define NEUREKA_REG_SUBTILE_NUMBER_1 19 +#define NEUREKA_REG_PADDING 20 +#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21 +#define NEUREKA_REG_FILTER_MASKING 22 +#define NEUREKA_REG_CONF0 23 + +/* SHIFT */ + +#define NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26) +#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) +#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) +#define NEUREKA_SHIFT_QUANT_SHIFT (16) + +/* CONF0 FLAGS */ + +#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26) +#define NEUREKA_FLAG_NORM_BIAS (1 << 25) +#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) +#define 
NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) +#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) +#define NEUREKA_QUANT_MODE_8BIT (0 << 21) +#define NEUREKA_QUANT_MODE_32BIT (2 << 21) +// conf0[20:16] - quantization shift amount +#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \ + (1 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_STREAMIN (1 << 14) +#define NEUREKA_NORM_MODE_8BIT (0 << 12) +#define NEUREKA_NORM_MODE_32BIT (2 << 12) +#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) +#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9) +#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9) +#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested +#define NEUREKA_FLAG_MODE_3x3 (0 << 5) +#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) +#define NEUREKA_FLAG_MODE_1x1 (2 << 5) +#define NEUREKA_FLAG_NORM_QUANT (1 << 4) + +/* Masks */ + +#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26) +#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23) +#define NEUREKA_MASK_QUANT_MODE (0x3 << 21) +#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NEUREKA_MASK_NORM_MODE (0x3 << 12) +#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10) +#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9) +#define NEUREKA_MASK_FLAG_MODE (0x3 << 5) +#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0) + +/* PADDING */ + +#define NEUREKA_DONT_PAD (0) +#define NEUREKA_MAX_PAD (2) + +/* NORM */ +#define NEUREKA_NORM_MAX_LEN (32) + +#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h deleted file mode 100644 index e8ecba5..0000000 --- a/neureka/inc/pulp_nnx_defs.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_DEFS_H__ -#define __NEUREKA_DEFS_H__ - -/* ARHITECTURE */ - -#define NEUREKA_FILTER_SIZE (6) -#define NEUREKA_FILTER_BUFFER_SIZE (8) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_CONTEXT_SIZE (2) -#define NEUREKA_WEIGHT_BANDWIDTH (256) - -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16) - -/* REGISTER MAP */ - -#define NEUREKA_EVT0 12 -#define NEUREKA_EVT1 13 -#define NEUREKA_BASE_ADDR 0x00201000 -#define WEIGHT_MEM_BASE 0x10400000 -#define SRAM_OFFSET 0x00400000 -#define MRAM_OFFSET 0x00000000 - -// Cluster -#define CLUSTER_CTRL_BASE_ADDR 0x00200000 -#define CLUSTER_CTRL_HWPE_OFFS 0x18 -#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800 - -/* REGISTER OFFSETS */ - -// commands -#define NEUREKA_TRIGGER 0x00 -#define NEUREKA_ACQUIRE 0x04 -#define NEUREKA_FINISHED 0x08 -#define NEUREKA_STATUS 0x0C -#define NEUREKA_RUNNING_JOB 0x10 -#define NEUREKA_SOFT_CLEAR 0x14 -#define NEUREKA_SWSYNC 0x18 -#define NEUREKA_URISCY_IMEM 0x1C - -// job configuration -#define NEUREKA_REGISTER_OFFSET 0x20 - -#define NEUREKA_REG_WEIGHTS_PTR 0x00 -#define NEUREKA_REG_INFEAT_PTR 0x04 -#define NEUREKA_REG_OUTFEAT_PTR 
0x08 -#define NEUREKA_REG_SCALE_PTR 0x0C -#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10 -#define NEUREKA_REG_SCALE_BIAS_PTR 0x14 -#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18 -#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C -#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20 -#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24 -#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28 -#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C -#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30 -#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34 -#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38 -#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C -#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40 -#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44 -#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48 -#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C -#define NEUREKA_REG_PADDING 0x50 -#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54 -#define NEUREKA_REG_FILTER_MASKING 0x58 -#define NEUREKA_REG_CONF0 0x5C - -// Simulation only -#define NEUREKA_REG_GVSOC_TRACE 0x60 - -/* SHIFT */ - -#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) -#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) -#define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) - -/* CONF0 FLAGS */ - -#define NEUREKA_FLAG_NORM_BIAS (1 << 25) -#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) -#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) -#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) -#define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) -#define NEUREKA_QUANT_MODE_32BIT (2 << 21) -// conf0[20:16] - quantization shift amount -#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) -#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) -#define NEUREKA_FLAG_STREAMIN (1 << 14) -#define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) -#define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) -#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define 
NEUREKA_FLAG_STRIDED_MODE (1 << 8) -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) -#define NEUREKA_FLAG_MODE_3x3 (0 << 5) -#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) -#define NEUREKA_FLAG_MODE_1x1 (2 << 5) -#define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) - -/* Masks */ - -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) - -/* Miscellaneous */ - -// Padding -#define MAX_PAD (0xf) - -// Normalization -#define NEUREKA_NORM_MAX_LEN (32) -#define NO_NORM(length) \ - { \ - .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \ - .length = length, .mode = normMode32Bit \ - } - -// Quantization -#define NO_QUANT \ - { \ - .shift_amount = 0, .mode = quantMode32Bit, \ - .function = quantFunctionIdentity \ - } - -// GVSOC trace levels -#define NEUREKA_TRACE_LEVEL_JOB_START_END 0 -#define NEUREKA_TRACE_LEVEL_CONFIG 1 -#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2 -#define NEUREKA_TRACE_LEVEL_ALL 3 - -// null -#define NEUREKA_NULL ((void *)0) -#define NEUREKA_STATUS_FULL (0x101) - -#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h deleted file mode 100644 index 40bcec0..0000000 --- a/neureka/inc/pulp_nnx_hal.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_H__ -#define __NEUREKA_H__ - -#include - -#include "pulp_nnx_defs.h" -#include "pulp_nnx_error_codes.h" - -#define NEUREKA_CG_ENABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \ - CLUSTER_CTRL_HWPE_CG_EN_MASK -#define NEUREKA_CG_DISABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \ - ~CLUSTER_CTRL_HWPE_CG_EN_MASK - -#define NEUREKA_WRITE(offset, value) \ - *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value) -#define NEUREKA_WRITE_BE(offset, value, be) \ - *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value) -#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) - -#define NEUREKA_WRITE_IO_REG(offset, value) \ - NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value)) -#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \ - NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be)) -#define NEUREKA_READ_IO_REG(offset) \ - NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset)) - -#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0) -#define NEUREKA_BARRIER() \ - do { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BUSYWAIT() \ - do { \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; -#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; - -#define DIVNCEIL(A, B) (((A - 1) / B) + 1) -#define REMAINDER(A, B) (((A - 1) % B) + 1) -#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff)) - -#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE - 
-#define FLAG_USED (1) -#define FLAG_UNUSED (0) - -typedef enum { - weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, - weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE -} nnx_weight_offset_mode_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - uint16_t n_weights; - uint32_t bitwidth; - int32_t offset_factor; - nnx_weight_offset_mode_e offset_mode; -} nnx_weights_t; - -typedef enum { - featureBitwidth8Bit = 8, - featureBitwidth16Bit = 16, - featureBitwidth32Bit = 32 -} nnx_feature_bitwidth_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - nnx_feature_bitwidth_e bitwidth; -} nnx_feature_t; - -typedef enum { - normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, - normMode32Bit = NEUREKA_NORM_MODE_32BIT -} nnx_norm_mode_e; - -typedef struct { - nnx_norm_mode_e mode; - int flag_bias; - int flag_shift; -} nnx_norm_t; - -typedef enum { - quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, - quantMode32Bit = NEUREKA_QUANT_MODE_32BIT -} nnx_quant_mode_e; - -typedef enum { - quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, - quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU -} nnx_quant_function_e; - -// TODO: add rounding to quant. Should also be an enum? Best boolean... 
-typedef struct { - // Shift amount must be in range 0x00-0x1F - unsigned shift_amount; - nnx_quant_mode_e mode; - nnx_quant_function_e function; - int flag_rounding; -} nnx_quant_t; - -typedef struct { - uint32_t d0; - uint32_t d1; - uint32_t d2; -} nnx_stride_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; - uint32_t HiWi; -} nnx_subtile_remainder_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; -} nnx_subtile_number_t; - -typedef struct { - nnx_subtile_remainder_t remainder; - nnx_subtile_number_t number; -} nnx_subtile_t; - -typedef struct { - nnx_stride_t input_stride; - nnx_stride_t output_stride; - nnx_stride_t weights_stride; - nnx_subtile_t subtile; - uint32_t padding; - uint32_t weight_offset_factor; - uint32_t filter_mask; - uint32_t conf0; -} nnx_cfg_t; - -typedef struct { - uint32_t weights_ptr; - uint32_t infeat_ptr; - uint32_t outfeat_ptr; - uint32_t scale_ptr; - uint32_t scale_shift_ptr; - uint32_t scale_bias_ptr; - nnx_cfg_t cfg; -} nnx_task_t; - -int nnx_job_id(); -int nnx_empty(); -int nnx_full(); -void nnx_soft_clear(); -int nnx_acquire(); -void nnx_offload(nnx_task_t *task); -void nnx_offload_ptr(nnx_task_t *task); -void nnx_run_async(); -void nnx_run_blocking(); -void nnx_commit(); -void nnx_wait_empty(); -void nnx_wait_not_full(); -void nnx_wait_on_id(int id); -void nnx_busywait(); - -void nnx_task_init(nnx_task_t *task); -int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom, - uint32_t left, uint16_t value); -int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant); -void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom, - uint8_t left); -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t 
output); -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); - -#endif /* __NEUREKA_H__ */ diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h deleted file mode 100644 index f29ff3e..0000000 --- a/neureka/inc/pulp_nnx_util.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __PULP_NNX_UTIL__ -#define __PULP_NNX_UTIL__ - -void nnx_activate_gvsoc_logging(int use_dec); -void nnx_deactivate_gvsoc_logging(); - -#endif /* __PULP_NNX_UTIL__ */ diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c deleted file mode 100644 index 1d99691..0000000 --- a/neureka/src/pulp_nnx_hal.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "pulp_nnx_hal.h" -#include "pmsis.h" - -static int qw, weight_d0_stride, outbytes; - -// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and -// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise -// the compiler is not able to correctly factorize the NEUREKA base in case -// several accesses are done, ending up with twice more code - -// __builtin_pulp_OffsetedX not defined - needs further investigation... (too -// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK... 
- -int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); } - -int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; } - -int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); } - -void nnx_soft_clear() { - NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0); - for (volatile int i = 0; i < 10; i++) - ; -} - -int nnx_acquire() { - int job_id = -1; - NEUREKA_BARRIER_ACQUIRE(job_id); - return job_id; -} - -void nnx_offload(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_offload_ptr(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < 6; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); } - -void nnx_run_blocking() { - nnx_run_async(); - nnx_wait_empty(); -} - -void nnx_commit() { - NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger -} - -void nnx_busywait() { NEUREKA_BUSYWAIT(); } - -void nnx_wait_empty() { - while (!nnx_empty()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_not_full() { - while (nnx_full()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_on_id(const int id) { - while (nnx_job_id() <= id) { - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); - }; -} - -void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); } - -int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right, - const uint32_t bottom, const uint32_t left, - const uint16_t value) { - uint32_t padding = 0; - uint32_t flags = 0; - - if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) { - return 1; - } - - cfg->padding = - (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value; - - return 0; -} - -int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm, - const nnx_quant_t quant) { - if (quant.shift_amount > 31) { - printf("ERROR! 
quant.shift_amount > 31\n"); - return 1; - } - - if (quant.mode == quantMode16Bit) { - printf("ERROR! quant.mode == quantMode16Bit\n"); - return 1; - } - - BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | - norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT); - - return 0; -} - -void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right, - const uint8_t bottom, const uint8_t left) { - cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) | - ((uint32_t)bottom << 8) | ((uint32_t)left << 0); -} - -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho; - const int rem_Wi = rem_Wo; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_in, - .d1 = k_in * w_out, - .d2 = k_in * 3 * 3 // copying arpan - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - 
cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = weight_d0_stride * qw, - .d1 = weight_d0_stride * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height != output.height || input.width != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = {.d0 = k_in, - .d1 = k_in * (w_out + 2), - .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE * - NEUREKA_FILTER_BUFFER_SIZE}; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3, - .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights, 
- const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ki = num_Ko; - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ki = rem_Ko; - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_out, - .d1 = k_out * (w_out + 2), - .d2 = 0 // Unused - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride, - .d1 = 0, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return 
weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != output.depth) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c index 7ab0e99..f9799fc 100644 --- a/src/pulp_nnx_ne16.c +++ b/src/pulp_nnx_ne16.c @@ -79,25 +79,20 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, uint32_t size_j, uint32_t size_k, uint32_t stride_j, uint32_t stride_k, uint32_t overlap_i, uint32_t overlap_j, - uint32_t offset_i, uint32_t offset_j, - uint8_t data_size) { - return ptr + - (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * - data_size / 8 + - (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; + uint32_t offset_i, uint32_t offset_j) { + return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j + + (j * (size_j - overlap_j) - offset_j) * stride_k; } -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, 
ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker) { +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker) { const uint8_t stride = 2; - const uint8_t bits = 8; - const uint32_t n_h = divnceil(h_out, stride); - const uint32_t n_w = divnceil(w_out, stride); + const uint32_t n_h = nnx_calculate_number_of_tiles(h_out, stride); + const uint32_t n_w = nnx_calculate_number_of_tiles(w_out, stride); const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0; const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0; @@ -109,15 +104,15 @@ void ne16_nnx_dispatch_stride2x2( for (int i = 0; i < n_h; i++) { for (int j = 0; j < n_w; j++) { - task->data.infeat_ptr = - _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, - w_in_stride, k_in_stride, h_ker - stride, - w_ker - stride, i == 0 ? 0 : input_height_offset, - j == 0 ? 0 : input_width_offset, bits); - task->data.outfeat_ptr = - _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, - k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, - j == 0 ? 0 : output_width_offset, bits); + task->data.infeat_ptr = _get_tile_ptr( + input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, + task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0, + h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset, + j == 0 ? 0 : input_width_offset); + task->data.outfeat_ptr = _get_tile_ptr( + output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1, + task->data.cfg.output_stride.d1 << 1, 0, 0, + i == 0 ? 
0 : output_height_offset, j == 0 ? 0 : output_width_offset); task->data.cfg.padding = ne16_get_tile_padding(tile_padding, i, j, n_h, n_w); diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c new file mode 100644 index 0000000..0abb845 --- /dev/null +++ b/src/pulp_nnx_neureka.c @@ -0,0 +1,76 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pulp_nnx_neureka.h" +#include "hwpe.h" +#include "neureka.h" +#include "pulp_nnx_util.h" +#include +#include +#include + +void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf) { + neureka_siracusa_open(conf); + hwpe_soft_clear(&dev->hwpe_dev); +} + +void neureka_nnx_term(neureka_dev_t *dev) { + hwpe_soft_clear(&dev->hwpe_dev); + neureka_siracusa_close(); +} + +int neureka_nnx_dispatch_check(neureka_dev_t *dev) { + return !neureka_task_queue_full(dev); +} + +void neureka_nnx_dispatch_wait(neureka_dev_t *dev) { + while (!neureka_nnx_dispatch_check(dev)) { + neureka_siracusa_event_wait_and_clear(); + } +} + +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) { + if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) { + return 1; + } + hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data, + (int)(sizeof(neureka_task_data_t) / 4)); + hwpe_task_queue_release_and_run(&dev->hwpe_dev); + return 0; +} + +int 
neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + // GVSOC model has a broken running_id so resolve_check + // conservativly looks if the task queue is empty. + return neureka_task_queue_empty(dev); +#else + uint8_t prev_task_id = task->id - 1; + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !neureka_task_queue_empty(dev))); +#endif +} + +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { + while (!neureka_nnx_resolve_check(dev, task)) { + neureka_siracusa_event_wait_and_clear(); + } +} diff --git a/test/.isort.cfg b/test/.isort.cfg new file mode 100644 index 0000000..127bf37 --- /dev/null +++ b/test/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +profile=black +line_length=88 +skip_gitignore=true diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index 5abb204..07dc597 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -48,8 +48,9 @@ def define(self, name, expr): if isinstance(expr, str): expr = f'"{expr}"' elif isinstance(expr, bool): - expr = int(expr) - expr = f"({expr})" + expr = f"({int(expr)})" + else: + expr = f"({expr})" return f"#define {name.upper()} {expr}\n" def vector_size(self, data): @@ -158,7 +159,7 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None): if golden is not None: render += self.render_vector( - "golden_" + name, "PI_L1 " + _type, size, init=golden + "golden_" + name, "PI_L2 " + _type, size, init=golden ) render += self.check(name) diff --git a/test/Ne16.py b/test/Ne16.py deleted file mode 100644 index 6de5ab5..0000000 --- a/test/Ne16.py +++ /dev/null @@ -1,94 +0,0 @@ -# Luka Macan -# -# Copyright 2023 ETH Zurich and University of Bologna -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import numpy.typing as npt -from TestClasses import IntegerType - - -class Ne16: - ACCUMULATOR_TYPE = IntegerType(name="int32") - - _CIN_SUBTILE = 16 - - @staticmethod - def weight_unroll( - weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False - ) -> npt.NDArray[np.uint8]: - """Unroll weight into expected memory format - - Expected weight shape is (Cout, Cin, H, W). - The output shape is: (Cout, Cin_major, Bits, H x W, Cin_minor_bytes), - where Cin_major is the ceil(Cin / CIN_SUBTILE) and Cin_minor has to be padded with 0 to CIN_SUBTILE. - """ - if depthwise: - weight = weight.transpose(1, 0, 2, 3) # Swap Cout and Cin - - Cout, Cin, H, W = weight.shape - - # Pad Cin to be divisible with CIN_SUBTILE - if Cin % Ne16._CIN_SUBTILE != 0: - Cin_pad = Ne16._CIN_SUBTILE - Cin % Ne16._CIN_SUBTILE - weight = np.pad( - weight, - ((0, 0), (0, Cin_pad), (0, 0), (0, 0)), - "constant", - constant_values=0, - ) - - # Reshape into (Cout, Cin_major, Cin_minor, Flattened spatial, 1) - # The 1 at the end is required by the unpacking - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - weight = weight.reshape(Cout, Cin_major, Cin_minor, H * W, 1) - - # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] - # (Cout, Cin_major, Cin_minor, Flattened spatial, Bits) - weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") - - # Shuffle bits so that the final shape is: - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor) - weight = weight.transpose(0, 1, 4, 3, 2) - - # Prepare for packing - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes, 8) - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) - weight = np.stack(np.split(weight, Cin_minor_bytes, axis=-1), axis=-2) - - # Pack - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes) - weight = np.packbits(weight, axis=-1, bitorder="little") - - return weight.flatten() - - @staticmethod - def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: int): - """Reverse of weight_roll""" - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) - - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1) - weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor) - weight = weight.transpose(0, 1, 4, 3, 2) - weight = np.packbits(weight, axis=-1, bitorder="little") - weight = weight.reshape(Cout, Cin_major * Cin_minor, H, W) - weight = weight[:, :Cin, :, :] - - return weight diff --git a/test/Ne16MemoryLayout.py b/test/Ne16MemoryLayout.py new file mode 100644 index 0000000..30729ab --- /dev/null +++ b/test/Ne16MemoryLayout.py @@ -0,0 +1,99 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import numpy.typing as npt + + +class Ne16MemoryLayout: + _CIN_SUBTILE = 16 + + @staticmethod + def weightEncode( + weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False + ) -> npt.NDArray[np.uint8]: + """Unroll weight into expected memory format + + Expected weight shape is (cout, cin, height, width). + The output shape is: (cout, cinMajor, Bits, height x width, cinMinorBytes), + where cinMajor is the ceil(cin / CIN_SUBTILE) and cinMinor has to be padded with 0 to CIN_SUBTILE. + """ + if depthwise: + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin + + cout, cin, height, width = weight.shape + + # Pad cin to be divisible with CIN_SUBTILE + if cin % Ne16MemoryLayout._CIN_SUBTILE != 0: + cinPad = Ne16MemoryLayout._CIN_SUBTILE - cin % Ne16MemoryLayout._CIN_SUBTILE + weight = np.pad( + weight, + ((0, 0), (0, cinPad), (0, 0), (0, 0)), + "constant", + constant_values=0, + ) + cin = cin + cinPad + + # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1) + # The 1 at the end is required by the unpacking + cinMajor = cin // Ne16MemoryLayout._CIN_SUBTILE + cinMinor = Ne16MemoryLayout._CIN_SUBTILE + weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) + + # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] + # (cout, cinMajor, cinMinor, flattened spatial, Bits) + weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + + # Shuffle bits so that the final shape is: + # (cout, cinMajor, Bits, flattened spatial, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + + # Prepare for packing + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes, 8) + cinMinorBytes = int(np.ceil(cinMinor / 8)) + weight = np.stack(np.split(weight, cinMinorBytes, axis=-1), axis=-2) + + # Pack + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes) + weight = np.packbits(weight, axis=-1, bitorder="little") + + return weight.flatten() + + @staticmethod + def weightDecode( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, + width: int, + ) -> npt.NDArray[np.uint8]: + """Reverse of weight_roll""" + cinMajor = int(np.ceil(cin / Ne16MemoryLayout._CIN_SUBTILE)) + cinMinor = Ne16MemoryLayout._CIN_SUBTILE + cinMinorBytes = int(np.ceil(cinMinor / 8)) + + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1) + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + weight = np.packbits(weight, axis=-1, bitorder="little") + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] + + return weight diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py new file mode 100644 index 0000000..f2e66ad --- /dev/null +++ b/test/Ne16TestConf.py @@ -0,0 +1,111 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from NnxTestClasses import NnxTestConf +from TestClasses import IntegerType, KernelShape, Stride, implies + + +class Ne16TestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1) or v == Stride( + height=2, width=2 + ), f"Unsupported stride {v}. Supported 1x1 and 2x2." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. 
Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("in_type", v, ["uint8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_out_channel_stride_with_stride_2x2(self) -> Ne16TestConf: + assert implies( + self.stride == Stride(height=2, width=2), + self.out_channel * (self.out_type._bits // 8) % 2 == 0, + ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" + return self + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." 
+ return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf: + assert implies( + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + ) + return self diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py new file mode 100644 index 0000000..08b3601 --- /dev/null +++ b/test/NeuralEngineFunctionalModel.py @@ -0,0 +1,123 @@ +from typing import Optional + +import torch +import torch.nn.functional as F + +from TestClasses import IntegerType, Padding, Stride + + +class NeuralEngineFunctionalModel: + ACCUMULATOR_TYPE = IntegerType(name="int32") + + @staticmethod + def _cast( + tensor: torch.Tensor, _type: IntegerType, saturate: bool = False + ) -> torch.Tensor: + if saturate: + return tensor.clamp(_type.min, _type.max) + else: + return tensor & ((1 << _type._bits) - 1) + + def _norm_quant( + self, + tensor: torch.Tensor, + scale: torch.Tensor, + bias: Optional[torch.Tensor], + global_shift: torch.Tensor, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_bias: bool, + has_relu: bool, + ) -> torch.Tensor: + # Scale accumulators are in 48bit, so keeping the data in 64bit + tensor = tensor * scale + assert tensor.dtype == torch.int64 + + if has_bias: + assert bias is not None + assert bias_type is not None + # Saturating cast to int32 + tensor = NeuralEngineFunctionalModel._cast( + tensor, bias_type, saturate=True + ).type(torch.int32) + + tensor = tensor + bias + tensor = NeuralEngineFunctionalModel._cast( + tensor, bias_type, saturate=False + ).type(torch.int32) + + if has_relu: + tensor = F.relu(tensor) + + tensor = tensor >> global_shift + + # Saturate into out_type + tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True) 
+ + return tensor + + def convolution( + self, + input: torch.Tensor, + weight: torch.Tensor, + scale: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + global_shift: Optional[torch.Tensor], + padding: Padding, + stride: Stride, + depthwise: bool, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_norm_quant: bool, + has_bias: bool, + has_relu: bool, + verbose: bool = False, + **kwargs, + ) -> torch.Tensor: + _ = kwargs + + input_padded = F.pad( + input, + ( + padding.left, + padding.right, + padding.top, + padding.bottom, + ), + "constant", + 0, + ) + + # Accumulators are 32bit non-saturating. + # Calculate in higher precision (int64) + output = F.conv2d( + input=input_padded, + weight=weight, + stride=(stride.height, stride.width), + groups=weight.shape[0] if depthwise else 1, + ).type(torch.int64) + + # Cast to accumulator type + output = NeuralEngineFunctionalModel._cast( + output, NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, saturate=False + ).type(torch.int32) + + if verbose: + print("INTERMEDIATE RESULTS (pre-normalization/requant):") + print(output) + + if has_norm_quant: + assert scale is not None + assert global_shift is not None + output = self._norm_quant( + output, + scale, + bias, + global_shift, + out_type, + bias_type, + has_bias, + has_relu, + ) + + return output diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py new file mode 100644 index 0000000..80a2786 --- /dev/null +++ b/test/NeurekaMemoryLayout.py @@ -0,0 +1,158 @@ +# Luka Macan +# Arpan Suravi Prasad +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import numpy.typing as npt + +from TestClasses import IntegerType + + +class NeurekaMemoryLayout: + _WEIGHT_BANDWIDTH = 256 + _CIN_SUBTILE_1x1 = 32 + _CIN_SUBTILE_3x3 = 28 + + @staticmethod + def weightEncode( + weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False + ) -> npt.NDArray[np.uint8]: + """Unroll weight into expected memory format + + Expected weight shape is (cout, cin, H, W). + The produced memory layout depends on the weight kernel shape: + - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits), + - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits), + where cinMajor is the ceil(cin / cin subtile ) and cinMinor has to be padded with 0 to cin subtile . + """ + if depthwise: + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin + + cout, cin, height, width = weight.shape + cinSubtile = ( + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + ) + + # Pad cin to be divisible with CIN_SUBTILE + if cin % cinSubtile != 0: + cinPad = cinSubtile - cin % cinSubtile + weight = np.pad( + weight, + ((0, 0), (0, cinPad), (0, 0), (0, 0)), + "constant", + constant_values=0, + ) + + # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1) + # The 1 at the end is required by the unpacking + cinMajor = int(np.ceil(cin / cinSubtile)) + weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1) + + # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] + # (cout, cinMajor, cinSubtile, Flattened spatial, Bits) + weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + + # Shuffle bits so that the final shape is: + # (cout, cinMajor, Bits, Flattened spatial, cinSubtile) + weight = weight.transpose(0, 1, 4, 3, 2) + + # Pack dimensions to fit into weight bandwidth + if height == 3 and width == 3: + # (cout * cinMajor * Bits, H * W * cinSubtile) + weight = weight.reshape(-1, height * width * cinSubtile) + # Pad only the last dimension to weight bandwidth size + # (-1, Weight Bandwidth) + weight = np.pad( + weight, + ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])), + "constant", + constant_values=0, + ) + elif height == 1 and width == 1: + # Tile cinSubtile into tiles of size 4 + # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = weight.reshape( + cout, cinMajor, bits, height * width, cinSubtile // 4, 4 + ) # cout, cinMajor, bits, 1, 8, 4 + # Pad bits to 8 + if bits < 8: + # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = np.pad( + weight, + ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)), + mode="constant", + constant_values=0, + ) + # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile) + weight = weight.transpose(0, 1, 3, 4, 2, 5) + # (-1, Weight Bandwidth) + weight = weight.reshape( + cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH + ) # cout*cinMajor, 256b + + # Prepare for packing + # (-1, Weight Bandwidth Bytes, 8) + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) + weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2) + + # Pack bits + # (-1, Weight Bandwidth Bytes) + weight = np.packbits(weight, axis=-1, bitorder="little") + + return weight.flatten() + + @staticmethod + def weightDecode( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, 
+ width: int, + ) -> npt.NDArray[np.uint8]: + """Reverse of weightEncode""" + cinSubtile = ( + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + ) + cinMajor = int(np.ceil(cin / cinSubtile)) + cinMinor = cinSubtile + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) + + weight = weight.reshape(-1, weightBandwidthBytes, 1) + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") + weight = weight.reshape(-1, NeurekaMemoryLayout._WEIGHT_BANDWIDTH) + + if height == 3 and width == 3: + weight = weight[:, : height * width * cinMinor] + weight = weight.reshape( + cout, cinMajor, bits, height * width, cinMinor + ).transpose(0, 1, 4, 3, 2) + elif height == 1 and width == 1: + weight = weight[:, : height * width * cinMinor * 8] + weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose( + 0, 1, 2, 4, 3 + ) + weight = np.packbits(weight, axis=-1, bitorder="little") + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] + + return weight diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py new file mode 100644 index 0000000..f878e68 --- /dev/null +++ b/test/NeurekaTestConf.py @@ -0,0 +1,101 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from NnxTestClasses import NnxTestConf +from TestClasses import IntegerType, KernelShape, Stride, implies + + +class NeurekaTestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1), f"Unsupported stride {v}. Supported 1x1." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. 
Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("in_type", v, ["uint8", "int8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf: + assert implies( + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" + ) + return self diff --git a/test/Ne16TestClasses.py b/test/NnxTestClasses.py similarity index 53% rename from test/Ne16TestClasses.py rename to test/NnxTestClasses.py index d99e829..a7aaa00 100644 --- a/test/Ne16TestClasses.py +++ b/test/NnxTestClasses.py @@ -17,18 +17,21 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import List, Union, Optional, Set, Tuple -import torch -import numpy as np -import torch.nn.functional as F + import os -from Ne16 import Ne16 +from typing import Callable, Optional, Set, Tuple, Type, Union + +import numpy as np +import numpy.typing as npt +import torch +from pydantic import BaseModel, PositiveInt, field_validator, model_validator + from HeaderWriter import HeaderWriter -from TestClasses import implies, KernelShape, Padding, Stride, IntegerType -from pydantic import BaseModel, field_validator, model_validator, PositiveInt +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from TestClasses import IntegerType, KernelShape, Padding, Stride, implies -class Ne16TestConf(BaseModel): +class NnxTestConf(BaseModel): in_height: PositiveInt in_width: PositiveInt in_channel: PositiveInt @@ -46,74 +49,8 @@ class Ne16TestConf(BaseModel): has_bias: bool has_relu: bool - @field_validator("kernel_shape") - @classmethod - def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: - assert v == KernelShape(height=1, width=1) or v == KernelShape( - height=3, width=3 - ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." - return v - - @field_validator("stride") - @classmethod - def check_valid_stride(cls, v: Stride) -> Stride: - assert v == Stride(height=1, width=1) or v == Stride( - height=2, width=2 - ), f"Unsupported stride {v}. Supported 1x1 and 2x2." 
- return v - - @staticmethod - def _check_type( - name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] - ) -> None: - assert ( - _type in allowed_types - ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}" - - @field_validator("in_type") - @classmethod - def check_valid_in_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("in_type", v, ["uint8"]) - return v - - @field_validator("out_type") - @classmethod - def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("out_type", v, ["uint8", "int8"]) - return v - - @field_validator("weight_type") - @classmethod - def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("weight_type", v, ["int8"]) - return v - - @field_validator("scale_type") - @classmethod - def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) - return v - - @field_validator("bias_type") - @classmethod - def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("bias_type", v, ["int32"]) - return v - @model_validator(mode="after") # type: ignore - def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: - assert implies( - self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 - ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" - return self - - @model_validator(mode="after") # type: ignore - def check_valid_depthwise(self) -> Ne16TestConf: - assert implies( - self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) - ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." 
+ def check_valid_depthwise_channels(self) -> NnxTestConf: assert implies(self.depthwise, self.in_channel == self.out_channel), ( f"Input and output channel should be the same in a depthwise layer. " f"input channel: {self.in_channel}, output channel: {self.out_channel}" @@ -121,21 +58,15 @@ def check_valid_depthwise(self) -> Ne16TestConf: return self @model_validator(mode="after") # type: ignore - def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: + def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf: assert implies( self.kernel_shape == KernelShape(height=1, width=1), self.padding == Padding(top=0, bottom=0, left=0, right=0), ), f"No padding on 1x1 kernel. Given padding {self.padding}" return self - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - @model_validator(mode="after") # type: ignore - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: if self.has_norm_quant: assert self.scale_type is not None, "Scale type was not provided." if self.has_bias: @@ -143,25 +74,31 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: return self @model_validator(mode="after") # type: ignore - def check_valid_out_type_with_flags(self) -> Ne16TestConf: - assert implies( - not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE - ), ( - f"Without quantization, the output type has to be equal to the " - f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + def check_has_relu_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_relu, self.has_norm_quant), ( + f"Relu flag can only be enabled when norm_quant is enabled. 
" + f"Given has_relu {self.has_relu} and has_norm_quant {self.has_norm_quant}" ) - assert implies( - self.has_norm_quant, - (self.has_relu and not self.out_type._signed) - or (not self.has_relu and self.out_type._signed), - ), ( + return self + + @model_validator(mode="after") # type: ignore + def check_has_bias_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_bias, self.has_norm_quant), ( + f"Bias flag can only be enabled when norm_quant is enabled. " + f"Given has_bias {self.has_bias} and has_norm_quant {self.has_norm_quant}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_relu(self) -> NnxTestConf: + assert self.has_relu ^ self.out_type._signed, ( f"Output type has to be unsigned when there is relu, otherwise signed. " f"Given output type {self.out_type} and has_relu {self.has_relu}" ) return self -class Ne16Test: +class NnxTest: _CONF_NAME = "conf.json" _INPUT_NAME = "input.pt" _OUTPUT_NAME = "output.pt" @@ -172,7 +109,7 @@ class Ne16Test: def __init__( self, - conf: Ne16TestConf, + conf: NnxTestConf, input: Optional[torch.Tensor], output: Optional[torch.Tensor], weight: Optional[torch.Tensor], @@ -188,7 +125,7 @@ def __init__( self.bias = bias self.global_shift = global_shift - def is_valid(self): + def is_valid(self) -> bool: return all( [ self.input is not None, @@ -203,22 +140,22 @@ def is_valid(self): def save_conf(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - with open(os.path.join(path, Ne16Test._CONF_NAME), "w") as fp: + with open(os.path.join(path, NnxTest._CONF_NAME), "w") as fp: fp.write(self.conf.model_dump_json(indent=4)) def save_data(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - torch.save(self.input, os.path.join(path, Ne16Test._INPUT_NAME)) - torch.save(self.output, os.path.join(path, Ne16Test._OUTPUT_NAME)) - torch.save(self.weight, os.path.join(path, Ne16Test._WEIGHT_NAME)) + torch.save(self.input, 
os.path.join(path, NnxTest._INPUT_NAME)) + torch.save(self.output, os.path.join(path, NnxTest._OUTPUT_NAME)) + torch.save(self.weight, os.path.join(path, NnxTest._WEIGHT_NAME)) if self.scale is not None: - torch.save(self.scale, os.path.join(path, Ne16Test._SCALE_NAME)) + torch.save(self.scale, os.path.join(path, NnxTest._SCALE_NAME)) if self.bias is not None: - torch.save(self.bias, os.path.join(path, Ne16Test._BIAS_NAME)) + torch.save(self.bias, os.path.join(path, NnxTest._BIAS_NAME)) if self.global_shift is not None: torch.save( - self.global_shift, os.path.join(path, Ne16Test._GLOBAL_SHIFT_NAME) + self.global_shift, os.path.join(path, NnxTest._GLOBAL_SHIFT_NAME) ) def save(self, path: Union[str, os.PathLike]) -> None: @@ -228,154 +165,111 @@ def save(self, path: Union[str, os.PathLike]) -> None: @staticmethod def is_test_dir(path: Union[str, os.PathLike]) -> bool: fileset = set(os.listdir(path)) - required_fileset = set([Ne16Test._CONF_NAME]) + required_fileset = set([NnxTest._CONF_NAME]) return required_fileset.issubset(fileset) @classmethod - def load(cls, path: Union[str, os.PathLike]) -> "Ne16Test": - assert Ne16Test.is_test_dir( + def load(cls, confCls: Type[NnxTestConf], path: Union[str, os.PathLike]) -> NnxTest: + assert NnxTest.is_test_dir( path ), f"ERROR: Test {path} does not contain the necessary files." 
- with open(os.path.join(path, Ne16Test._CONF_NAME), "r") as fp: - conf = Ne16TestConf.model_validate_json(fp.read()) + with open(os.path.join(path, NnxTest._CONF_NAME), "r") as fp: + conf = confCls.model_validate_json(fp.read()) def load_if_exist(filename: str) -> Optional[torch.Tensor]: filepath = os.path.join(path, filename) return torch.load(filepath) if os.path.isfile(filepath) else None - input = load_if_exist(Ne16Test._INPUT_NAME) - output = load_if_exist(Ne16Test._OUTPUT_NAME) - weight = load_if_exist(Ne16Test._WEIGHT_NAME) - scale = load_if_exist(Ne16Test._SCALE_NAME) - bias = load_if_exist(Ne16Test._BIAS_NAME) - global_shift = load_if_exist(Ne16Test._GLOBAL_SHIFT_NAME) + input = load_if_exist(NnxTest._INPUT_NAME) + output = load_if_exist(NnxTest._OUTPUT_NAME) + weight = load_if_exist(NnxTest._WEIGHT_NAME) + scale = load_if_exist(NnxTest._SCALE_NAME) + bias = load_if_exist(NnxTest._BIAS_NAME) + global_shift = load_if_exist(NnxTest._GLOBAL_SHIFT_NAME) return cls(conf, input, output, weight, scale, bias, global_shift) -class Ne16TestGenerator: +class NnxTestGenerator: _DEFAULT_SEED = 0 @staticmethod - def _global_shift( - tensor: torch.Tensor, out_type: IntegerType, has_relu: bool + def _calculate_global_shift( + tensor: torch.Tensor, out_type: IntegerType ) -> torch.Tensor: - if has_relu: - # only adjust positive values - tensor = tensor[tensor > 0] - + """Calculate global shift so that the output values are in the range of out_type""" s = tensor.type(torch.float64).std() target_s = 2 ** (out_type._bits - 1) - global_shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32) - - return global_shift + return torch.ceil(torch.log2(s / target_s)).type(torch.int32) @staticmethod - def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]): + def _random_data(_type: IntegerType, shape: Tuple): return torch.randint(_type.min, _type.max, size=shape) - @staticmethod - def _cast( - tensor: torch.Tensor, _type: IntegerType, saturate: bool = False - ) 
-> torch.Tensor: - if saturate: - return tensor.clamp(_type.min, _type.max) - else: - return tensor & ((1 << _type._bits) - 1) - @staticmethod def from_conf( - conf: Ne16TestConf, + conf: NnxTestConf, input: Optional[torch.Tensor] = None, weight: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, global_shift: Optional[torch.Tensor] = None, - ) -> Ne16Test: - torch.manual_seed(Ne16TestGenerator._DEFAULT_SEED) + verbose: bool = False, + ) -> NnxTest: + torch.manual_seed(NnxTestGenerator._DEFAULT_SEED) + + input_shape = (1, conf.in_channel, conf.in_height, conf.in_width) + weight_shape = ( + conf.out_channel, + 1 if conf.depthwise else conf.in_channel, + conf.kernel_shape.height, + conf.kernel_shape.width, + ) + scale_shape = (1, conf.out_channel, 1, 1) + bias_shape = (1, conf.out_channel, 1, 1) if input is None: - input = Ne16TestGenerator._random_data( + input = NnxTestGenerator._random_data( _type=conf.in_type, - shape=(1, conf.in_channel, conf.in_height, conf.in_width), + shape=input_shape, ) - input_padded = F.pad( - input, - ( - conf.padding.left, - conf.padding.right, - conf.padding.top, - conf.padding.bottom, - ), - "constant", - 0, - ) - if weight is None: - weight = Ne16TestGenerator._random_data( + weight = NnxTestGenerator._random_data( _type=conf.weight_type, - shape=( - conf.out_channel, - 1 if conf.depthwise else conf.in_channel, - conf.kernel_shape.height, - conf.kernel_shape.width, - ), + shape=weight_shape, ) - # Accumulators are 32bit non-saturating. 
- # Calculate in higher precision (int64) - output = F.conv2d( - input=input_padded, - weight=weight, - stride=(conf.stride.height, conf.stride.width), - groups=conf.in_channel if conf.depthwise else 1, - ).type(torch.int64) - # Use only the lower 32bits - output = Ne16TestGenerator._cast( - output, Ne16.ACCUMULATOR_TYPE, saturate=False - ).type(torch.int32) - if conf.has_norm_quant: if scale is None: assert conf.scale_type is not None - scale = Ne16TestGenerator._random_data( - conf.scale_type, shape=(1, conf.out_channel, 1, 1) + scale = NnxTestGenerator._random_data( + conf.scale_type, shape=scale_shape ) - # Scale accumulators are in 48bit, so keeping the data in 64bit - output = scale * output - assert output.dtype == torch.int64 - - if conf.has_bias: - # Saturating cast to int32 + if conf.has_bias and bias is None: assert conf.bias_type is not None - output = Ne16TestGenerator._cast( - output, conf.bias_type, saturate=True - ).type(torch.int32) - - if bias is None: - bias = Ne16TestGenerator._random_data( - conf.bias_type, shape=(1, conf.out_channel, 1, 1) - ).type(torch.int32) - output = output + bias - output = Ne16TestGenerator._cast( - output, conf.bias_type, saturate=False + bias = NnxTestGenerator._random_data( + conf.bias_type, shape=bias_shape ).type(torch.int32) - - if conf.has_relu: - output = F.relu(output) - if global_shift is None: - global_shift = Ne16TestGenerator._global_shift( - output, conf.out_type, conf.has_relu + global_shift = torch.Tensor([0]).type(torch.int32) + output = NeuralEngineFunctionalModel().convolution( + input, + weight, + scale, + bias, + global_shift, + verbose=verbose, + **conf.__dict__, ) - output = output >> global_shift + global_shift = NnxTestGenerator._calculate_global_shift(output, conf.out_type) - # Saturate into out_type - output = Ne16TestGenerator._cast(output, conf.out_type, saturate=True) + output = NeuralEngineFunctionalModel().convolution( + input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__ + ) - 
return Ne16Test( + return NnxTest( conf=conf, input=input, output=output, @@ -386,28 +280,38 @@ def from_conf( ) @staticmethod - def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test: + def regenerate(test: NnxTest, regen_tensors: Set[str]) -> NnxTest: test_tensors = set(["input", "output", "weight", "scale", "bias"]) load_tensors = test_tensors - regen_tensors kwargs = {tensor: getattr(test, tensor) for tensor in load_tensors} - return Ne16TestGenerator.from_conf(test.conf, **kwargs) + return NnxTestGenerator.from_conf(test.conf, **kwargs) -class Ne16TestHeaderGenerator: +class NnxTestHeaderGenerator: DEFAULT_HEADERS_DIR = "app/gen" - def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None): + def __init__( + self, + weightEncode: Callable[ + [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8] + ], + headers_dir: Optional[Union[str, os.PathLike]] = None, + ): if headers_dir is None: - headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR + headers_dir = NnxTestHeaderGenerator.DEFAULT_HEADERS_DIR self.header_writer = HeaderWriter(headers_dir) + # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag, + # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator + self.weightEncode = weightEncode - def generate(self, test_name: str, test: Ne16Test): + def generate(self, test_name: str, test: NnxTest): assert test.input is not None and test.output is not None _, in_channel, in_height, in_width = test.input.shape _, out_channel, out_height, out_width = test.output.shape # Render input in_ctype = test.conf.in_type.ctype() + in_signed = test.conf.in_type._signed in_data = test.input.permute(0, 2, 3, 1).ravel() self.header_writer.generate_vector_files( "input", _type=in_ctype, size=in_data.numel(), init=in_data @@ -431,10 +335,10 @@ def generate(self, test_name: str, test: Ne16Test): weight_offset = -(2 ** (weight_bits - 1)) weight_out_ch, weight_in_ch, weight_ks_h, 
weight_ks_w = test.weight.shape weight_data: np.ndarray = test.weight.numpy() - weight_offset - weight_init = Ne16.weight_unroll( + weight_init = self.weightEncode( weight_data.astype(np.uint8), weight_type._bits, - depthwise=test.conf.depthwise, + test.conf.depthwise, ) self.header_writer.generate_vector_files( "weight", _type="uint8_t", size=weight_init.size, init=weight_init @@ -470,13 +374,14 @@ def generate(self, test_name: str, test: Ne16Test): "height": in_height, "width": in_width, "channel": in_channel, - "bits": 8, + "signed": in_signed, + "bits": test.conf.in_type._bits, }, "output": { "height": out_height, "width": out_width, "channel": out_channel, - "bits": 8, + "bits": test.conf.out_type._bits, }, "weight": { "height": weight_ks_h, @@ -486,8 +391,16 @@ def generate(self, test_name: str, test: Ne16Test): "bits": weight_bits, "offset": weight_offset, }, - "scale": {"bits": 8}, - "bias": {"bits": 32}, + "scale": { + "bits": test.conf.scale_type._bits + if test.conf.scale_type is not None + else 0 + }, + "bias": { + "bits": test.conf.bias_type._bits + if test.conf.bias_type is not None + else 0 + }, "padding": { "top": test.conf.padding.top, "bottom": test.conf.padding.bottom, diff --git a/test/README.md b/test/README.md index c3d29c5..8442493 100644 --- a/test/README.md +++ b/test/README.md @@ -35,3 +35,9 @@ $ pytest test.py --help - [testgen.py](testgen.py): collection of helper tools for individual tests For more information you can run the script with the `-h` flag. + +## Application + +The Makefile in the `app/` uses a flag `ACCELERATOR` to decide which accelerator to use. +The choices are _ne16_ or _neureka_. +You can either export it or run it like `ACCELERATOR= make clean all run`. 
diff --git a/test/TestClasses.py b/test/TestClasses.py index c10641c..c6267d6 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -16,15 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from functools import cached_property import re -from typing import Any, Dict, Literal, Optional, TYPE_CHECKING +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional + from pydantic import ( BaseModel, - model_serializer, - model_validator, NonNegativeInt, PositiveInt, + model_serializer, + model_validator, ) diff --git a/test/app/Makefile b/test/app/Makefile index 14f30fd..ca65892 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -40,6 +40,8 @@ INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp INC_DIRS += gen/inc INC_FLAGS += $(addprefix -I,$(INC_DIRS)) +APP_CFLAGS += $(INC_FLAGS) + # Source files @@ -58,7 +60,9 @@ APP_SRCS += $(wildcard gen/src/*.c) # Flags -APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto -APP_LDFLAGS += -flto +ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) +APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE) + +APP_CFLAGS += -O2 -w -Wall -Werror include $(RULES_DIR)/pmsis_rules.mk diff --git a/test/app/src/main.c b/test/app/src/main.c index cc67050..7cce4bf 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -29,8 +29,9 @@ int main() { struct pi_cluster_conf cl_conf; struct pi_cluster_task cl_task; - printf("\n"); - printf("Test %s starting\n", TEST_NAME); + printf("\nTest " TEST_NAME " starting\n"); + + printf("\nAccelerator: " NNX_ACCELERATOR "\n"); printf("\n"); layer_info(); @@ -43,13 +44,13 @@ int main() { } pi_cluster_send_task_to_cl( &cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL)); - pi_cluster_close(&cl_dev); - - printf("\n"); - printf("Test %s finished\n", TEST_NAME); printf("\n"); check_output(); + pi_cluster_close(&cl_dev); + + printf("\nTest " TEST_NAME " finished\n"); + return 0; } 
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index ffd93a1..0d98ff6 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -19,12 +19,89 @@ */ #include "nnx_layer.h" +#include + +#ifdef NNX_NE16 + #include "ne16.h" #include "ne16_gvsoc.h" #include "ne16_pulp_bsp.h" #include "ne16_task.h" #include "pulp_nnx_ne16.h" -#include + +typedef ne16_norm_mode_e nnx_norm_mode_e; +typedef ne16_quant_t nnx_quant_t; +typedef ne16_norm_t nnx_norm_t; +typedef ne16_task_t nnx_task_t; +typedef ne16_dev_t nnx_dev_t; +typedef ne16_pulp_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue ne16TaskFlagTrue +#define nnxTaskFlagFalse ne16TaskFlagFalse + +#define nnx_task_init ne16_task_init +#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv +#define nnx_task_set_bits ne16_task_set_bits +#define nnx_task_set_norm_quant ne16_task_set_norm_quant +#define nnx_task_set_weight_offset ne16_task_set_weight_offset +#define nnx_task_set_dims ne16_task_set_dims +#define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 +#define nnx_task_set_ptrs ne16_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_ALL +#define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate ne16_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate + +#define nnx_bsp_get_dev ne16_pulp_get_dev + +#define nnx_init ne16_nnx_init +#define nnx_dispatch_wait ne16_nnx_dispatch_wait +#define nnx_dispatch_stride2x2 ne16_nnx_dispatch_stride2x2 +#define nnx_dispatch ne16_nnx_dispatch +#define nnx_resolve_wait ne16_nnx_resolve_wait +#define nnx_term ne16_nnx_term + +#elif defined NNX_NEUREKA + +#include "neureka.h" +#include "neureka_gvsoc.h" +#include "neureka_siracusa_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +typedef neureka_norm_mode_e nnx_norm_mode_e; +typedef neureka_quant_t nnx_quant_t; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t 
nnx_dev_t; +typedef neureka_siracusa_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_ptrs neureka_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL +#define NNX_GVSOC_LOG_FORMAT NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate neureka_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate + +#define nnx_bsp_get_dev neureka_siracusa_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_term neureka_nnx_term + +#endif // NNX_NE16 || NNX_NEUREKA // Generated headers #include "bias.h" @@ -34,73 +111,109 @@ #include "scale.h" #include "weight.h" -static void task_prepare(ne16_task_t *task) { - ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, - WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, - (ne16_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, - .function = HAS_RELU ? quantFunctionRelu - : quantFunctionIdentity, - .flag_rounding = ne16TaskFlagFalse}, - (ne16_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? 
ne16TaskFlagTrue - : ne16TaskFlagFalse, - .flag_shift = ne16TaskFlagFalse}, - STRIDE_HEIGHT); - - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_task_set_dims_stride2x2( - task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, - OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } else { - ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } - - ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, - INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, - (uint32_t)weight, (uint32_t)scale, NULL, +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + +#if HAS_NORM_QUANT == 1 +#if SCALE_BITS == 8 + const nnx_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const nnx_norm_mode_e normMode = normMode32Bit; +#endif + + nnx_task_set_norm_quant( + task, + (nnx_quant_t){.shift_amount = OUTSHIFT, + .function = + HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, + .flag_rounding = nnxTaskFlagFalse}, + (nnx_norm_t){.mode = normMode, + .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue : nnxTaskFlagFalse, + .flag_shift = nnxTaskFlagFalse}); +#endif // HAS_NORM_QUANT + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + +#ifdef NNX_NEUREKA +#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + neureka_task_set_weight_source(task, neurekaWeightSourceWmem); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif +#endif + + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; + +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); +#else + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); +#endif + + nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, w_in_stride, + PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight, +#if HAS_NORM_QUANT == 1 + (uint32_t)scale, NULL, #if HAS_BIAS == 1 - (uint32_t)bias + (uint32_t)bias +#else + NULL +#endif #else - NULL + NULL, NULL, NULL #endif ); } -static void task_execute(ne16_task_t *task) { - ne16_dev_t *dev = ne16_pulp_get_dev(); +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); - ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG, - NE16_GVSOC_LOG_FORMAT_HEXADECIMAL); +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, 
NNX_GVSOC_LOG_FORMAT); +#endif - ne16_pulp_conf_t conf = {.max_stall = 8}; - ne16_nnx_init(dev, &conf); + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); - ne16_nnx_dispatch_wait(dev); + nnx_dispatch_wait(dev); - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, - WEIGHT_HEIGHT, WEIGHT_WIDTH); - } else { - ne16_nnx_dispatch(dev, task); - } +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); +#else + nnx_dispatch(dev, task); +#endif - ne16_nnx_resolve_wait(dev, task); + nnx_resolve_wait(dev, task); - ne16_nnx_term(dev); + nnx_term(dev); - ne16_gvsoc_log_deactivate(dev); +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + nnx_gvsoc_log_deactivate(dev); +#endif } void execute_nnx_layer(void *args) { - ne16_task_t task; + nnx_task_t task; task_prepare(&task); task_execute(&task); } diff --git a/test/conf.toml b/test/conf.toml index 1222f1d..c24055a 100644 --- a/test/conf.toml +++ b/test/conf.toml @@ -22,7 +22,7 @@ # Ne16TestClasses.py:Ne16TestConf().check_valid() # Input dimensions -in_height = 3 +in_height = 4 in_width = 3 in_channel = 8 diff --git a/test/conftest.py b/test/conftest.py index 6c2c15b..3c0a316 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -18,7 +18,17 @@ import os from typing import Union -from Ne16TestClasses import Ne16Test, Ne16TestGenerator + +import pydantic +import pytest + +from Ne16MemoryLayout import Ne16MemoryLayout +from Ne16TestConf import Ne16TestConf +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import NnxTest, NnxTestGenerator + +_SUPPORTED_ACCELERATORS = ["ne16", "neureka"] def pytest_addoption(parser): @@ -39,6 +49,13 @@ def 
pytest_addoption(parser): default=False, help="Recursively search for tests in given test directories.", ) + parser.addoption( + "-A", + "--accelerator", + choices=_SUPPORTED_ACCELERATORS, + default="ne16", + help="Choose an accelerator to test. Default: ne16", + ) parser.addoption( "--regenerate", action="store_true", @@ -54,7 +71,7 @@ def pytest_addoption(parser): def _find_test_dirs(path: Union[str, os.PathLike]): - return [dirpath for dirpath, _, _ in os.walk(path) if Ne16Test.is_test_dir(dirpath)] + return [dirpath for dirpath, _, _ in os.walk(path) if NnxTest.is_test_dir(dirpath)] def pytest_generate_tests(metafunc): @@ -62,6 +79,18 @@ def pytest_generate_tests(metafunc): recursive = metafunc.config.getoption("recursive") regenerate = metafunc.config.getoption("regenerate") timeout = metafunc.config.getoption("timeout") + nnxName = metafunc.config.getoption("accelerator") + + if nnxName == "ne16": + nnxMemoryLayoutCls = Ne16MemoryLayout + nnxTestConfCls = Ne16TestConf + elif nnxName == "neureka": + nnxMemoryLayoutCls = NeurekaMemoryLayout + nnxTestConfCls = NeurekaTestConf + else: + assert ( + False + ), f"Given accelerator {nnxName} not supported. 
Supported accelerators: {_SUPPORTED_ACCELERATORS}" if recursive: tests_dirs = test_dirs @@ -69,12 +98,28 @@ for tests_dir in tests_dirs: test_dirs.extend(_find_test_dirs(tests_dir)) - # (Re)Generate test data + # Load valid tests + nnxTestAndNames = [] for test_dir in test_dirs: - test = Ne16Test.load(test_dir) - if not test.is_valid() or regenerate: - test = Ne16TestGenerator.from_conf(test.conf) - test.save_data(test_dir) + try: + test = NnxTest.load(nnxTestConfCls, test_dir) + # (Re)generate data + if not test.is_valid() or regenerate: + test = NnxTestGenerator.from_conf(test.conf) + test.save_data(test_dir) + nnxTestAndNames.append((test, test_dir)) + except pydantic.ValidationError as e: + _ = e + nnxTestAndNames.append( + pytest.param( + (None, test_dir), + marks=pytest.mark.skipif( + True, reason=f"Invalid test {test_dir}: {e.errors()}" + ), + ) + ) - metafunc.parametrize("path", test_dirs) + metafunc.parametrize("nnxTestAndName", nnxTestAndNames) metafunc.parametrize("timeout", [timeout]) + metafunc.parametrize("nnxName", [nnxName]) + metafunc.parametrize("nnxMemoryLayoutCls", [nnxMemoryLayoutCls]) diff --git a/test/requirements-dev.txt b/test/requirements-dev.txt index fa0a75a..0956e5e 100644 --- a/test/requirements-dev.txt +++ b/test/requirements-dev.txt @@ -1,2 +1,3 @@ pyright black +isort diff --git a/test/test.py b/test/test.py index 39709b6..1893cdf 100644 --- a/test/test.py +++ b/test/test.py @@ -16,13 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 +import locale import os import re -from typing import Union, Optional, Tuple -import locale import subprocess -from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator from pathlib import Path +from typing import Dict, Optional, Tuple, Type, Union + +from Ne16MemoryLayout import Ne16MemoryLayout +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator HORIZONTAL_LINE = "\n" + "-" * 100 + "\n" @@ 
-49,17 +52,29 @@ def captured_output( def execute_command( - cmd: str, timeout: int = 30, cflags: Optional[str] = None + cmd: str, + timeout: int = 30, + cflags: Optional[str] = None, + envflags: Optional[Dict[str, str]] = None, ) -> Tuple[bool, str, str, Optional[str]]: - app_cflags = 'APP_CFLAGS="' + " ".join(cflags) + '" ' if cflags else "" - cmd = cmd + app_cflags + env = os.environ.copy() + if cflags: + env["APP_CFLAGS"] = '"' + " ".join(cflags) + '"' + if envflags: + for key, value in envflags.items(): + env[key] = value status = None stdout = None try: proc = subprocess.run( - cmd.split(), check=True, capture_output=True, text=True, timeout=timeout + cmd.split(), + check=True, + capture_output=True, + text=True, + timeout=timeout, + env=env, ) status = True msg = "OK" @@ -94,28 +109,35 @@ def assert_message( return retval -def test(path: str, timeout: int): - test_name = path - test = Ne16Test.load(path) - - Ne16TestHeaderGenerator().generate(test_name, test) +def test( + nnxTestAndName: Tuple[NnxTest, str], + timeout: int, + nnxName: str, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], +): + nnxTest, nnxTestName = nnxTestAndName + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + nnxTestName, nnxTest + ) Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" - passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout) + passed, msg, stdout, stderr = execute_command( + cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName} + ) - assert passed, assert_message(msg, test_name, cmd, stdout, stderr) + assert passed, assert_message(msg, nnxTestName, cmd, stdout, stderr) match_success = re.search(r"> Success! No errors found.", stdout) match_fail = re.search(r"> Failure! 
Found (\d*)/(\d*) errors.", stdout) assert match_success or match_fail, assert_message( - "No regexes matched.", test_name, cmd, stdout + "No regexes matched.", nnxTestName, cmd, stdout ) assert not match_fail, assert_message( f"Errors found: {match_fail.group(1)}/{match_fail.group(2)}", - test_name, + nnxTestName, cmd, stdout, ) diff --git a/test/testgen.py b/test/testgen.py index e748f2e..521aecc 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -16,28 +16,61 @@ # # SPDX-License-Identifier: Apache-2.0 -import os import argparse import json +import os +from typing import Optional, Set, Type, Union + import toml -from typing import Optional, Union, Set -from Ne16TestClasses import ( - Ne16TestConf, - Ne16TestGenerator, - Ne16Test, - Ne16TestHeaderGenerator, + +from Ne16MemoryLayout import Ne16MemoryLayout +from Ne16TestConf import Ne16TestConf +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import ( + NnxTest, + NnxTestConf, + NnxTestGenerator, + NnxTestHeaderGenerator, ) -def headers_gen(args, test: Optional[Ne16Test] = None): +def headers_gen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], + test: Optional[NnxTest] = None, +): if test is None: - test = Ne16Test.load(args.test_dir) + test = NnxTest.load(nnxTestConfCls, args.test_dir) + assert test is not None if not test.is_valid(): - test = Ne16TestGenerator.from_conf(test.conf) - Ne16TestHeaderGenerator().generate(args.test_dir, test) - - -def test_gen(args): + test = NnxTestGenerator.from_conf(test.conf) + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + args.test_dir, test + ) + + +def print_tensors(test: NnxTest): + print("INPUT TENSOR:") + print(test.input) + print("WEIGHT TENSOR:") + print(test.weight) + print("SCALE TENSOR:") + print(test.scale) + print("BIAS TENSOR:") + print(test.bias) + print("GLOBAL SHIFT TENSOR:") + 
print(test.global_shift) + print("EXPECTED OUTPUT TENSOR:") + print(test.output) + + +def test_gen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], +): if args.conf.endswith(".toml"): test_conf_dict = toml.load(args.conf) elif args.conf.endswith(".json"): @@ -49,37 +82,71 @@ def test_gen(args): ) exit(-1) - test_conf = Ne16TestConf.model_validate(test_conf_dict) - test = Ne16TestGenerator.from_conf(test_conf) + test_conf = nnxTestConfCls.model_validate(test_conf_dict) + test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors) if not args.skip_save: test.save(args.test_dir) if args.headers: - headers_gen(args, test) - - -def _regen(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - test = Ne16Test.load(path) - test = Ne16TestGenerator.regenerate(test, regen_tensors) + headers_gen(args, nnxMemoryLayoutCls, nnxTestConfCls, test) + if args.print_tensors: + print_tensors(test) + + +def _regen( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + test = NnxTest.load(nnxTestConfCls, path) + test = NnxTestGenerator.regenerate(test, regen_tensors) test.save(path) -def _regen_recursive(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - if Ne16Test.is_test_dir(path): - _regen(path, regen_tensors) +def _regen_recursive( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + if NnxTest.is_test_dir(path): + _regen(path, regen_tensors, nnxTestConfCls) return for dirpath, _, _ in os.walk(path): - _regen_recursive(dirpath, regen_tensors) + _regen_recursive(dirpath, regen_tensors, nnxTestConfCls) -def test_regen(args): +def test_regen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], +): + _ = nnxMemoryLayoutCls regen_tensors = set(args.tensors + ["output"]) for test_dir in 
args.test_dirs: if args.recurse: - _regen_recursive(test_dir, regen_tensors) + _regen_recursive(test_dir, regen_tensors, nnxTestConfCls) else: - _regen(test_dir, regen_tensors) + _regen(test_dir, regen_tensors, nnxTestConfCls) + + +def add_common_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "-t", + "--test-dir", + type=str, + dest="test_dir", + required=True, + help="Path to the test.", + ) + + parser.add_argument( + "-a", + "--accelerator", + choices=["ne16", "neureka"], + default="ne16", + help="Choose an accelerator. Default: ne16", + ) parser = argparse.ArgumentParser( @@ -91,14 +158,7 @@ def test_regen(args): parser_header = subparsers.add_parser( "headers", description="Generate headers for a single test." ) -parser_header.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test." "basename.", -) +add_common_arguments(parser_header) parser_header.set_defaults(func=headers_gen) parser_test = subparsers.add_parser( @@ -112,14 +172,6 @@ def test_regen(args): required=True, help="Path to the configuration file.", ) -parser_test.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test. " "basename.", -) parser_test.add_argument( "--headers", action="store_true", default=False, help="Generate headers." ) @@ -130,6 +182,14 @@ def test_regen(args): dest="skip_save", help="Skip saving the test.", ) +parser_test.add_argument( + "--print-tensors", + action="store_true", + default=False, + dest="print_tensors", + help="Print tensor values to stdout.", +) +add_common_arguments(parser_test) parser_test.set_defaults(func=test_gen) parser_regen = subparsers.add_parser("regen", description="Regenerate test tensors.") @@ -138,25 +198,27 @@ def test_regen(args): type=str, nargs="?", default=[], - help="Tensors that should be regenerated. 
Output " "included by default.", -) -parser_regen.add_argument( - "-t", - "--test-dir", - action="append", - dest="test_dirs", - required=True, - help="Path to the test.", + help="Tensors that should be regenerated. Output included by default.", ) parser_regen.add_argument( "-r", "--recursive", action="store_true", default=False, - help="Recursively search for test directiories " "inside given test directories.", + help="Recursively search for test directories inside given test directories.", ) +add_common_arguments(parser_regen) parser_regen.set_defaults(func=test_regen) args = parser.parse_args() -args.func(args) +if args.accelerator == "ne16": + nnxMemoryLayoutCls = Ne16MemoryLayout + nnxTestConfCls = Ne16TestConf +elif args.accelerator == "neureka": + nnxMemoryLayoutCls = NeurekaMemoryLayout + nnxTestConfCls = NeurekaTestConf +else: + assert False, f"Unsupported accelerator {args.accelerator}." + +args.func(args, nnxMemoryLayoutCls, nnxTestConfCls) diff --git a/test/tests/test_102/conf.json b/test/tests/test_102/conf.json new file mode 100644 index 0000000..d6d0c17 --- /dev/null +++ b/test/tests/test_102/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 4, + "in_width": 3, + "in_channel": 8, + "out_channel": 8, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_103/conf.json b/test/tests/test_103/conf.json new file mode 100644 index 0000000..3eff547 --- /dev/null +++ b/test/tests/test_103/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { 
+ "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_104/conf.json b/test/tests/test_104/conf.json new file mode 100644 index 0000000..d6d00e4 --- /dev/null +++ b/test/tests/test_104/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_105/conf.json b/test/tests/test_105/conf.json new file mode 100644 index 0000000..0f34422 --- /dev/null +++ b/test/tests/test_105/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 40, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_106/conf.json b/test/tests/test_106/conf.json new file mode 100644 index 0000000..0b98f3a --- /dev/null +++ b/test/tests/test_106/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 
0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_107/conf.json b/test/tests/test_107/conf.json new file mode 100644 index 0000000..2f8951c --- /dev/null +++ b/test/tests/test_107/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_108/conf.json b/test/tests/test_108/conf.json new file mode 100644 index 0000000..7842aaa --- /dev/null +++ b/test/tests/test_108/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_109/conf.json b/test/tests/test_109/conf.json new file mode 100644 index 0000000..a6b71c9 --- /dev/null +++ b/test/tests/test_109/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, 
+ "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_110/conf.json b/test/tests/test_110/conf.json new file mode 100644 index 0000000..622efc4 --- /dev/null +++ b/test/tests/test_110/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_111/conf.json b/test/tests/test_111/conf.json new file mode 100644 index 0000000..d6714c4 --- /dev/null +++ b/test/tests/test_111/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_112/conf.json b/test/tests/test_112/conf.json new file mode 100644 index 0000000..1991c59 --- /dev/null +++ b/test/tests/test_112/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + 
"out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 1, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_113/conf.json b/test/tests/test_113/conf.json new file mode 100644 index 0000000..1dce097 --- /dev/null +++ b/test/tests/test_113/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 1 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_114/conf.json b/test/tests/test_114/conf.json new file mode 100644 index 0000000..c1ce5c3 --- /dev/null +++ b/test/tests/test_114/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 1, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_115/conf.json b/test/tests/test_115/conf.json new file mode 100644 index 0000000..19153ba --- /dev/null +++ b/test/tests/test_115/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, 
+ "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 1, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/util/hwpe.c b/util/hwpe.c index 53c1ace..0430081 100644 --- a/util/hwpe.c +++ b/util/hwpe.c @@ -31,11 +31,11 @@ #define HWPE_TASK_REG_OFFSET 8 inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { - *(dev->base_addr + reg) = value; + dev->base_addr[reg] = value; } inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) { - return *(dev->base_addr + reg); + return dev->base_addr[reg]; } inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { diff --git a/util/pulp_nnx_util.c b/util/pulp_nnx_util.c index 34db512..0107fc1 100644 --- a/util/pulp_nnx_util.c +++ b/util/pulp_nnx_util.c @@ -20,14 +20,16 @@ #include "pulp_nnx_util.h" -inline int divnceil(const int dividend, const int divisor) { - return ((dividend - 1) / divisor) + 1; +inline int nnx_calculate_number_of_tiles(const int dim_size, + const int tile_size) { + return ((dim_size - 1) / tile_size) + 1; } -inline int remainder(const int dividend, const int divisor) { - return ((dividend - 1) % divisor) + 1; +inline int nnx_calculate_last_tile_size(const int dim_size, + const int tile_size) { + return ((dim_size - 1) % tile_size) + 1; } -inline uint32_t concat_half(const uint16_t high, const uint16_t low) { +inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) { return ((uint32_t)high << 16) | low; } diff --git a/util/pulp_nnx_util.h b/util/pulp_nnx_util.h index 638e5d9..d167f6d 100644 --- a/util/pulp_nnx_util.h +++ b/util/pulp_nnx_util.h @@ -24,26 +24,28 @@ #include /** - * divnceil 
+ * nnx_calculate_number_of_tiles + * - * Does integer division and ceiling of it. + * Calculates the number of tiles needed to go through a dimension. + * It does it by dividing the dimension size by the tile size and taking the + ceiling of the result. */ -int divnceil(const int dividend, const int divisor); +int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size); /** - * remainder + * nnx_calculate_last_tile_size * - * Calculates the remainder but if the remainder should be 0, - * returns divisor. Used for calculation of the last `remainding` - * iteration of the tile. + * Calculates the size of the last executed tile by calculating the remainder of + * the dim_size and the tile_size. In case the remainder is 0, it returns the + * full tile_size. */ -int remainder(const int dividend, const int divisor); +int nnx_calculate_last_tile_size(const int dim_size, const int tile_size); /** * concat_half * * Concatenate 2 16-bit numbers into a 32-bit number. */ -uint32_t concat_half(const uint16_t high, const uint16_t low); +uint32_t nnx_concat_half(const uint16_t high, const uint16_t low); #endif // __NNX_UTIL_H__