diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b595682..4c7b267 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,25 +20,41 @@ stages: - lint - test -format_python: +python_format: stage: lint tags: - python-lint script: - black --check . -static_check_python: +python_sort_imports: + stage: lint + tags: + - python-lint + script: + - isort --check test + +python_static_check: stage: lint tags: - python-lint script: - pyright . -run_test0: +run_ne16_test: stage: test tags: - gap9-sdk artifacts: untracked: true script: - - cd test && pytest test.py --test-dir tests --recursive + - cd test && pytest test.py --test-dir tests --recursive -A ne16 + +run_neureka_test: + stage: test + tags: + - siracusa-sdk + artifacts: + untracked: true + script: + - cd test && pytest test.py --test-dir tests --recursive -A neureka diff --git a/CHANGELOG.md b/CHANGELOG.md index 48a4461..84b516f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## [Unreleased] + +### Added + +- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels +- Support for kernels without normalization and quantization for NE16 +- isort check +- publication citation + +### Changed + +- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant` +- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension +- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE` + +### Removed + +- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2` +- `mode` attribute from `ne16_quant_t` structure + ## [0.3.0] - 2024-01-14 ### Added diff --git a/README.md b/README.md index be8c9be..1671dc7 100644 --- a/README.md +++ b/README.md @@ -39,51 +39,22 @@ _Note: The accelerator can provide additional helper functions if needed._ ## 
Accelerators -### NE16 - -Github repo [link](https://github.com/pulp-platform/ne16). - -#### Implemented features - -- [x] Convolution w/ kernel shape 1x1 -- [x] Convolution w/ kernel shape 3x3 -- [x] Depthwise convolution w/ kernel shape 3x3 -- [x] Stride 1x1 -- [x] Stride 2x2 -- [ ] Normalization and quantization - - [x] With - - [ ] Without - - [x] Relu (w/ and w/o) - - [x] Bias (w/ and w/o) - - [ ] Per-channel shift - - [x] Per-layer shift - - [ ] Rounding -- [ ] Input type - - [x] uint8 - - [ ] uint16 -- [ ] Output type - - [x] int8 - - [x] uint8 (only w/ Relu) - - [ ] int32 - - [ ] uint32 (only w/ Relu) -- [ ] Scale type - - [x] uint8 - - [ ] uint16 - - [ ] uint32 -- [x] Bias type - - [x] int32 -- [ ] Weight type - - [x] int8 - - [ ] int2-7 - -### Neureka - -**Untested and considered broken.** +- [NE16](ne16/README.md) +- [Neureka](neureka/README.md) ## Testing You can find information about testing in the dedicated [README](test/README.md). +### Environment + +The library was tested with following pairs of SDKs and compilers: + +| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash | +| --- | --------------- | -------- | -------------------- | +| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 | +| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) | + ## Contributing Bug reports and feature requests should be reported through issues. @@ -93,15 +64,38 @@ All the development should be done through forks and merged onto the `dev` branc The library will follow the [Semantic Versioning](https://semver.org/). -## Citing +## Publication + +
+If you use PULP-NNX in your work, you can cite us: + +``` +@inproceedings{10.1145/3607889.3609092, + author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco}, + title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study}, + year = {2024}, + isbn = {9798400702907}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + url = {https://doi.org/10.1145/3607889.3609092}, + doi = {10.1145/3607889.3609092}, + abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.}, + booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems}, + pages = {9–10}, + numpages = {2}, + keywords = {TinyML, MCUs, deep learning, HW accelerators}, + location = {, Hamburg, Germany, }, + series = {CASES '23 Companion} +} +``` -*TBA* +
## Contributors * Luka Macan <[luka.macan@unibo.it](mailto:luka.macan@unibo.it)> * Francesco Conti <[fconti@unibo.it](mailto:fconti@unibo.it)> -* Arpan Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)> +* Arpan Suravi Prasad <[prasadar@iis.ee.ethz.ch](mailto:prasadar@iis.ee.ethz.ch)> ## License diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h index eff9a60..97e6e2e 100644 --- a/inc/pulp_nnx_ne16.h +++ b/inc/pulp_nnx_ne16.h @@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev); /** ne16_nnx_dispatch * * Dispatch a task to the accelerator. - * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0. + * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. */ int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task); @@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task); */ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); - /* Additional helper functions */ /** ne16_nnx_dispatch_stride2x2 @@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). * Works only if the k_out is divisible by 2. 
*/ -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker); +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker); diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h new file mode 100644 index 0000000..25ef4a8 --- /dev/null +++ b/inc/pulp_nnx_neureka.h @@ -0,0 +1,61 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka.h" +#include "neureka_siracusa_bsp.h" +#include "neureka_task.h" +#include + +/* PULP-NNX interface */ + +void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf); +void neureka_nnx_term(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_check + * + * Check whether you can dispatch to the accelerator. + */ +int neureka_nnx_dispatch_check(neureka_dev_t *dev); + +/** neureka_nnx_dispatch_wait + * + * Block until you can dispatch to the accelerator. 
+ */ +void neureka_nnx_dispatch_wait(neureka_dev_t *dev); + +/** neureka_nnx_dispatch + * + * Dispatch a task to the accelerator. + * Fails with return code 1 if the task cannot be dispatched. Otherwise returns + * 0. + */ +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_check + * + * Check whether the task has been resolved. + */ +int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task); + +/** neureka_nnx_resolve_wait + * + * Block until you can resolve the task. + */ +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task); diff --git a/ne16/README.md b/ne16/README.md new file mode 100644 index 0000000..9f05956 --- /dev/null +++ b/ne16/README.md @@ -0,0 +1,36 @@ +# NE16 + +## Docs + +- Github repo [link](https://github.com/pulp-platform/ne16). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [x] Stride 2x2 +- [ ] Normalization and quantization + - [x] With + - [x] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [ ] Input type + - [x] uint8 + - [ ] uint16 +- [ ] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [x] int32 +- [ ] Scale type + - [x] uint8 + - [ ] uint16 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c index 97859b4..d92a7d5 100644 --- a/ne16/hal/ne16.c +++ b/ne16/hal/ne16.c @@ -23,8 +23,6 @@ #define NE16_STATUS_EMPTY (0x000) #define NE16_STATUS_FULL (0x101) -inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; } - inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) { uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); return (status & 0x1) + ((status >> 8) & 0x1); diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h index c4c3a19..88ebee7 100644 --- a/ne16/hal/ne16.h +++ 
b/ne16/hal/ne16.h @@ -24,11 +24,12 @@ #include "hwpe.h" #include +#define NE16_TASK_QUEUE_SIZE (2) + typedef struct ne16_dev_t { hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ } ne16_dev_t; -int ne16_task_queue_size(ne16_dev_t *dev); int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev); int ne16_task_queue_empty(ne16_dev_t *dev); int ne16_task_queue_full(ne16_dev_t *dev); diff --git a/ne16/hal/ne16_task.c b/ne16/hal/ne16_task.c index 0ba54d5..f8408da 100644 --- a/ne16/hal/ne16_task.c +++ b/ne16/hal/ne16_task.c @@ -22,9 +22,9 @@ #include "ne16_task_defs.h" #include "pulp_nnx_util.h" -inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width) { +uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { uint32_t tile_padding = padding; if (i_height > 0) { tile_padding &= ~(0xf << 28); @@ -41,41 +41,65 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride) { - const uint32_t flag_mode16 = - input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; - - *task = (ne16_task_t){ - .outbytes = output_bits / 8, - .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 - : NE16_WEIGHT_D0_STRIDE_MODE8, - .qw = weights_bits, - .stride_shift = stride == 2 ? 1 : 0, - .output_channel_throughput = depthwise ? NE16_INPUT_CHANNEL_THROUGHPUT - : NE16_OUTPUT_CHANNEL_THROUGHPUT, - .kernel_shape = kernel_shape, - .depthwise = depthwise, - .data = {0}}; - - const int flag_stride2x2 = stride == 2 ? 
NE16_FLAG_STRIDE_2x2 : 0; +void ne16_task_init(ne16_task_t *task) { *task = (ne16_task_t){.data = {0}}; } +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = + depthwise ? NE16_SUBTILE_INPUT_CHANNEL : NE16_SUBTILE_OUTPUT_CHANNEL; const int flag_mode = kernel_shape == 1 ? NE16_FLAG_MODE_1x1 : depthwise == 1 ? NE16_FLAG_MODE_3x3_DW : NE16_FLAG_MODE_3x3; + const int flag_stride2x2 = stride == 2 ? NE16_FLAG_STRIDE_2x2 : 0; + + task->data.cfg.conf0 &= ~(NE16_MASK_FLAG_MODE | NE16_MASK_FLAG_STRIDE_2x2); + task->data.cfg.conf0 |= flag_mode | flag_stride2x2; +} + +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weight_bits) { + const uint32_t flag_mode16 = + input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; + + ne16_quant_mode_e quantMode; + if (output_bits == 16) { + quantMode = quantMode16Bit; + } else if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->weight_d0_stride = + flag_mode16 ? 
NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8; + task->qw = weight_bits; + task->data.cfg.conf0 &= ~(NE16_MASK_QUANT_MODE | NE16_MASK_FLAG_MODE16 | + NE16_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | flag_mode16 | (weight_bits - 1); +} + +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm) { + task->data.cfg.conf0 &= + ~(NE16_MASK_QUANT_FUNCTION | NE16_MASK_SHIFT_AMOUNT | + NE16_MASK_FLAG_ROUNDING | NE16_MASK_NORM_MODE | + NE16_MASK_FLAG_NORM_BIAS | NE16_MASK_FLAG_NORM_SHIFT); task->data.cfg.conf0 |= - NE16_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | quant.flag_rounding << NE16_SHIFT_ROUNDING | - norm.mode | norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode | - flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2; + NE16_FLAG_NORM_QUANT | quant.function | (quant.shift_amount << 16) | + quant.flag_rounding << NE16_SHIFT_FLAG_ROUNDING | norm.mode | + norm.flag_bias << NE16_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NE16_SHIFT_FLAG_NORM_SHIFT; +} - task->data.cfg.weight_offset_factor = weights_offset_factor; +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NE16_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; } /** ne16_pad_ptr @@ -84,21 +108,18 @@ void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, * it was the start to the padded data. * Necessary for input pointer when it's padded. 
*/ -inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { - return ptr - (padding_top * width + padding_left) * channel * bits / 8; +uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride, + const uint8_t padding_top, const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * width_stride; } -inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { +void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, + uint32_t w_in_stride, uint8_t padding_top, + uint8_t padding_left, uint32_t output_ptr, + uint32_t weights_ptr, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr) { task->data.infeat_ptr = - ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); + ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left); task->data.outfeat_ptr = output_ptr; task->data.weights_ptr = weights_ptr; task->data.scale_ptr = scale_ptr; @@ -107,100 +128,101 @@ inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, } void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride) { - const uint32_t num_k_in = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = + nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL); const ne16_stride_t input_stride = { - .d0 = k_in_stride, - .d1 = k_in_stride * w_in_stride, - .d2 = task->depthwise ? 
0 - : k_in_stride * NE16_FILTER_BUFFER_SIZE * - NE16_FILTER_BUFFER_SIZE}; + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; task->data.cfg.input_stride = input_stride; - // WARNING: Stride works only for even output channel sizes (divisible by 2) - const ne16_stride_t output_stride = { - .d0 = 32, - .d1 = (k_out_stride * task->outbytes) >> task->stride_shift, - .d2 = - (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift}; + const ne16_stride_t output_stride = {.d0 = NE16_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; if (task->kernel_shape == 1) { task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw; task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else if (!task->depthwise) { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride * task->qw * num_k_in; - task->data.cfg.weights_stride.d2 = 0; } else { task->data.cfg.weights_stride.d0 = NE16_FILTER_SIZE * NE16_FILTER_SIZE * task->weight_d0_stride; task->data.cfg.weights_stride.d1 = 0; - task->data.cfg.weights_stride.d2 = 0; } + task->data.cfg.weights_stride.d2 = 0; } void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { - const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput); - const uint16_t num_Ki = divnceil(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t num_Ho = divnceil(h_out, NE16_FILTER_SIZE); - const uint16_t num_Wo = divnceil(w_out, NE16_FILTER_SIZE); - - const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput); - const uint16_t rem_Ki = remainder(k_in, NE16_INPUT_CHANNEL_THROUGHPUT); - const uint16_t rem_Ho = 
remainder(h_out, NE16_FILTER_SIZE); - const uint16_t rem_Wo = remainder(w_out, NE16_FILTER_SIZE); + const uint16_t num_Ko = + nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel); + const uint16_t num_Ki = + nnx_calculate_number_of_tiles(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t num_Ho = + nnx_calculate_number_of_tiles(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = + nnx_calculate_number_of_tiles(w_out, NE16_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = + nnx_calculate_last_tile_size(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = + nnx_calculate_last_tile_size(k_in, NE16_SUBTILE_INPUT_CHANNEL); + const uint16_t rem_Ho = + nnx_calculate_last_tile_size(h_out, NE16_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = + nnx_calculate_last_tile_size(w_out, NE16_SUBTILE_OUTPUT_WIDTH); const uint16_t rem_Hi = (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; const uint16_t rem_Wi = (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; const ne16_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; + .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki), + .HoWo = nnx_concat_half(num_Ho, num_Wo)}, + .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki), + .HoWo = nnx_concat_half(rem_Ho, rem_Wo), + .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}}; task->data.cfg.subtile = subtile; } -inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value) { +void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | 
(value & 0xff); } -inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left) { +void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, padding_right); ne16_task_set_padding(task, padding_top, padding_bottom, padding_left, @@ -209,18 +231,20 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t 
padding_bottom, const uint8_t padding_right, const uint8_t padding_left) { const uint8_t stride = 2; - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); + // WARNING: works only for even output channel stride (divisible by 2) + ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride >> 1, + w_out_stride >> 1); ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); + k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, + 0); const uint8_t padding_bottom_new = (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom; diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h index df16b6c..69bc78c 100644 --- a/ne16/hal/ne16_task.h +++ b/ne16/hal/ne16_task.h @@ -60,7 +60,6 @@ typedef enum ne16_quant_function_e { typedef struct ne16_quant_t { // Shift amount must be in range 0x00-0x1F unsigned shift_amount; - ne16_quant_mode_e mode; ne16_quant_function_e function; int flag_rounding; } ne16_quant_t; @@ -110,38 +109,46 @@ typedef struct ne16_task_data_t { typedef struct ne16_task_t { ne16_task_data_t data; - uint8_t outbytes; uint8_t weight_d0_stride; uint8_t qw; - uint8_t stride_shift; - uint8_t output_channel_throughput; + uint8_t subtile_output_channel; uint8_t kernel_shape; uint8_t depthwise; uint8_t id; } ne16_task_t; -void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const ne16_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, ne16_quant_t quant, - ne16_norm_t norm, const uint8_t stride); +void ne16_task_init(ne16_task_t *task); +void ne16_task_set_op_to_conv(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void ne16_task_set_bits(ne16_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t 
weight_bits); +void ne16_task_set_norm_quant(ne16_task_t *task, ne16_quant_t quant, + ne16_norm_t norm); +void ne16_task_set_weight_offset(ne16_task_t *task, + ne16_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width); uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, const uint8_t padding_left); + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left); void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, - uint32_t k_in, uint8_t bits_in, uint8_t padding_top, + uint32_t w_in_stride, uint8_t padding_top, uint8_t padding_left, uint32_t output_ptr, uint32_t weights_ptr, uint32_t scale_ptr, uint32_t shift_ptr, uint32_t bias_ptr); +/** ne16_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); + const uint32_t h_out_stride, + const uint32_t w_out_stride); void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, @@ -152,19 +159,32 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left); +/** ne16_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. 
There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t k_in, const uint32_t h_in_stride, + const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, + const uint32_t h_out_stride, + const uint32_t w_out_stride, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); +/** ne16_task_set_dims_stride2x2 + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the NE16 requires the channels to be contiguous. + */ void ne16_task_set_dims_stride2x2( ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t k_in, const uint32_t h_in_stride, const uint32_t w_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint32_t h_out_stride, const uint32_t w_out_stride, const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, const uint8_t padding_bottom, const uint8_t padding_right, const uint8_t padding_left); diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h index 803e30e..d3d7297 100644 --- a/ne16/hal/ne16_task_defs.h +++ b/ne16/hal/ne16_task_defs.h @@ -25,8 +25,13 @@ #define NE16_FILTER_SIZE (3) #define NE16_FILTER_BUFFER_SIZE (5) -#define NE16_INPUT_CHANNEL_THROUGHPUT (16) -#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32) +#define NE16_SUBTILE_INPUT_HEIGHT (5) +#define 
NE16_SUBTILE_INPUT_WIDTH (5) +#define NE16_SUBTILE_INPUT_CHANNEL (16) +#define NE16_SUBTILE_OUTPUT_HEIGHT (3) +#define NE16_SUBTILE_OUTPUT_WIDTH (3) +#define NE16_SUBTILE_OUTPUT_CHANNEL (32) +#define NE16_OUTPUT_BANDWIDTH_BYTES (32) #define NE16_WEIGHT_D0_STRIDE_MODE8 (2) #define NE16_WEIGHT_D0_STRIDE_MODE16 (1) @@ -59,12 +64,6 @@ #define NE16_REG_FILTER_MASKING 22 #define NE16_REG_CONF0 23 -/* SHIFT */ - -#define NE16_SHIFT_FLAG_NORM_BIAS (25) -#define NE16_SHIFT_FLAG_NORM_SHIFT (24) -#define NE16_SHIFT_ROUNDING (11) - /* CONF0 FLAGS */ #define NE16_FLAG_NORM_BIAS (1 << 25) @@ -81,7 +80,7 @@ #define NE16_NORM_MODE_8BIT (0 << 12) #define NE16_NORM_MODE_16BIT (1 << 12) #define NE16_NORM_MODE_32BIT (2 << 12) -#define NE16_FLAG_ROUND (1 << 11) +#define NE16_FLAG_ROUNDING (1 << 11) #define NE16_FLAG_STRIDE_2x2 (1 << 8) #define NE16_FLAG_LINEAR_MODE (1 << 7) #define NE16_FLAG_MODE_3x3 (0 << 5) @@ -91,10 +90,26 @@ #define NE16_FLAG_MODE_BASIC (0 << 3) #define NE16_FLAG_MODE16 (1 << 3) +/* SHIFT */ + +#define NE16_SHIFT_FLAG_NORM_BIAS (25) +#define NE16_SHIFT_FLAG_NORM_SHIFT (24) +#define NE16_SHIFT_FLAG_ROUNDING (11) + /* Masks */ -#define NE16_MASK_QUANT_FUNCTION (1 << 23) -#define NE16_MASK_QUANT_MODE (3 << 21) +#define NE16_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NE16_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NE16_MASK_QUANT_FUNCTION (0x1 << 23) +#define NE16_MASK_QUANT_MODE (0x3 << 21) +#define NE16_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NE16_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NE16_MASK_NORM_MODE (0x3 << 12) +#define NE16_MASK_FLAG_ROUNDING (0x1 << 11) +#define NE16_MASK_FLAG_STRIDE_2x2 (0x1 << 8) +#define NE16_MASK_FLAG_MODE (0x3 << 5) +#define NE16_MASK_FLAG_MODE16 (0x1 << 3) +#define NE16_MASK_FLAG_WEIGHT_BITS (0x7 << 0) /* PADDING */ diff --git a/neureka/README.md b/neureka/README.md new file mode 100644 index 0000000..9c83f4e --- /dev/null +++ b/neureka/README.md @@ -0,0 +1,34 @@ +# Neureka + +## Docs + +Github repo 
[link](https://github.com/siracusa-soc/ne). + +## Implemented features + +- [x] Convolution w/ kernel shape 1x1 +- [x] Convolution w/ kernel shape 3x3 +- [x] Depthwise convolution w/ kernel shape 3x3 +- [ ] Normalization and quantization + - [x] With + - [x] Without + - [x] Relu (w/ and w/o) + - [x] Bias (w/ and w/o) + - [ ] Per-channel shift + - [x] Per-layer shift + - [ ] Rounding +- [x] Input type + - [x] uint8 + - [x] int8 +- [x] Output type + - [x] int8 + - [x] uint8 (only w/ Relu) + - [x] int32 +- [ ] Scale type + - [x] uint8 + - [ ] uint32 +- [x] Bias type + - [x] int32 +- [ ] Weight type + - [x] int8 + - [ ] int2-7 diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c new file mode 100644 index 0000000..57136fd --- /dev/null +++ b/neureka/bsp/neureka_siracusa_bsp.c @@ -0,0 +1,78 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_siracusa_bsp.h" +#include + +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \ + (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + \ + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS) +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 +#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff +#define NEUREKA_SIRACUSA_MAX_STALL (8) +#define NEUREKA_SIRACUSA_EVENT (1 << 12) +#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000) +#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000) + +void neureka_siracusa_hci_setpriority_neureka() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_setpriority_core() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void neureka_siracusa_hci_reset_max_stall() { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &= + ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) { + *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |= + max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf) { + neureka_siracusa_hci_setpriority_neureka(); + neureka_siracusa_hci_set_max_stall(conf->max_stall); +} + +void neureka_siracusa_close() { + neureka_siracusa_hci_reset_max_stall(); + neureka_siracusa_hci_setpriority_core(); +} + +void neureka_siracusa_event_wait_and_clear() { + eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT); +} + +static const neureka_dev_t neureka_siracusa_dev = { + .hwpe_dev = (struct 
hwpe_dev_t){ + .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}}; + +const neureka_dev_t *neureka_siracusa_get_dev() { + return &neureka_siracusa_dev; +} diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h new file mode 100644 index 0000000..be75a20 --- /dev/null +++ b/neureka/bsp/neureka_siracusa_bsp.h @@ -0,0 +1,67 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_siracusa_BSP_H__ +#define __NEUREKA_siracusa_BSP_H__ + +#include "neureka.h" +#include + +/** + * neureka_siracusa_setpriority_neureka + * + * Set HCI interconnect bus priority to prioritize neureka. + */ +void neureka_siracusa_hci_setpriority_neureka(); + +/** + * neureka_siracusa_setpriority_core + * + * Set HCI bus priority to prioritize cores. + */ +void neureka_siracusa_hci_setpriority_core(); + +/** + * neureka_siracusa_hci_reset_maxstall + * + * Reset the HCI bus maxstall parameter. + * TODO: Check if it disables it also or just resets? + */ +void neureka_siracusa_hci_reset_max_stall(); + +/** + * neureka_siracusa_hci_set_maxstall + * + * Set the HCI bus maxstall. Maxstall defines how many cycles + * will the HCI bus stall the lower priority master, i.e. neureka or core, + * before letting it do a transaction. 
+ */ +void neureka_siracusa_hci_set_max_stall(uint32_t max_stall); + +typedef struct neureka_siracusa_conf_t { + int max_stall; +} neureka_siracusa_conf_t; + +void neureka_siracusa_open(neureka_siracusa_conf_t *conf); +void neureka_siracusa_close(); +void neureka_siracusa_event_wait_and_clear(); +const neureka_dev_t *neureka_siracusa_get_dev(); + +#endif // !__NEUREKA_siracusa_BSP_H__ diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h new file mode 100644 index 0000000..37eeab0 --- /dev/null +++ b/neureka/gvsoc/neureka_gvsoc.h @@ -0,0 +1,54 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_GVSOC_H__ +#define __NEUREKA_GVSOC_H__ + +#include "neureka.h" +#include "neureka_task.h" + +#define NEUREKA_REG_GVSOC_LOG_LEVEL 24 +#define NEUREKA_REG_GVSOC_LOG_FORMAT 25 + +typedef enum neureka_gvsoc_log_format_e { + NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0, + NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3 +} neureka_gvsoc_log_format_e; + +typedef enum neureka_gvsoc_log_level_e { + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0, + NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1, + NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2, + NEUREKA_GVSOC_LOG_LEVEL_ALL = 3 +} neureka_gvsoc_log_level_e; + +static void neureka_gvsoc_log_activate(neureka_dev_t *dev, + neureka_gvsoc_log_level_e log_level, + neureka_gvsoc_log_format_e format) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level); + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format); +} + +static void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) { + hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, + NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END); +} + +#endif // __NEUREKA_GVSOC_H__ diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c similarity index 56% rename from neureka/inc/pulp_nnx_error_codes.h rename to neureka/hal/neureka.c index dc71575..dc829d9 100644 --- a/neureka/inc/pulp_nnx_error_codes.h +++ b/neureka/hal/neureka.c @@ -18,15 +18,20 @@ * SPDX-License-Identifier: Apache-2.0 */ -#ifndef __NE16_ERROR_CODES_H__ -#define __NE16_ERROR_CODES_H__ +#include "neureka.h" -typedef enum { - success = 0, - weightBitwidthOutOfBounds, - unsupportedWeightOffsetMode, - unsupportedFeatureBitwidth, - dimensionMismatch -} nnx_error_code; +#define NEUREKA_STATUS_EMPTY (0x000) +#define NEUREKA_STATUS_FULL (0x101) -#endif // __NE16_ERROR_CODES_H__ \ No newline at end of file +inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) { + uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); + return 
(status & 0x1) + ((status >> 8) & 0x1); +} + +inline int neureka_task_queue_empty(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY; +} + +inline int neureka_task_queue_full(neureka_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL; +} diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h similarity index 62% rename from neureka/src/pulp_nnx_util.c rename to neureka/hal/neureka.h index daaaf2b..eae77a1 100644 --- a/neureka/src/pulp_nnx_util.c +++ b/neureka/hal/neureka.h @@ -18,13 +18,20 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "pulp_nnx_util.h" -#include "pulp_nnx_hal.h" +#ifndef __NEUREKA_H__ +#define __NEUREKA_H__ -void nnx_activate_gvsoc_logging(int log_level) { - NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level); -} +#include "hwpe.h" +#include -void nnx_deactivate_gvsoc_logging() { - NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0); -} +#define NEUREKA_TASK_QUEUE_SIZE (2) + +typedef struct neureka_dev_t { + hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ +} neureka_dev_t; + +int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev); +int neureka_task_queue_empty(neureka_dev_t *dev); +int neureka_task_queue_full(neureka_dev_t *dev); + +#endif // __NEUREKA_H__ diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c new file mode 100644 index 0000000..501b2b9 --- /dev/null +++ b/neureka/hal/neureka_task.c @@ -0,0 +1,239 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "neureka_task.h" +#include "neureka_task_defs.h" +#include "pulp_nnx_util.h" + +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width) { + uint32_t tile_padding = padding; + if (i_height > 0) { + tile_padding &= ~(0xf << 28); + } + if (i_width < n_width - 1) { + tile_padding &= ~(0xf << 24); + } + if (i_height < n_height - 1) { + tile_padding &= ~(0xf << 20); + } + if (i_width > 0) { + tile_padding &= ~(0xf << 16); + } + return tile_padding; +} + +void neureka_task_init(neureka_task_t *task) { + *task = (neureka_task_t){.data = {0}}; +} + +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, + const uint8_t stride) { + task->depthwise = depthwise; + task->kernel_shape = kernel_shape; + task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_OUTPUT_CHANNEL; + task->subtile_input_channel = kernel_shape == 3 + ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 + : NEUREKA_SUBTILE_INPUT_CHANNEL_1x1; + + const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1 + : depthwise == 1 ? 
NEUREKA_FLAG_MODE_3x3_DW + : NEUREKA_FLAG_MODE_3x3; + + task->data.cfg.conf0 &= ~(NEUREKA_MASK_FLAG_MODE); + task->data.cfg.conf0 |= flag_mode; +} + +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits) { + neureka_quant_mode_e quantMode; + if (output_bits == 8) { + quantMode = quantMode8Bit; + } else { + quantMode = quantMode32Bit; + } + + task->qw = weight_bits; + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_MODE | NEUREKA_MASK_FLAG_WEIGHT_BITS); + task->data.cfg.conf0 |= quantMode | (weight_bits - 1); +} + +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm) { + task->data.cfg.conf0 &= + ~(NEUREKA_MASK_QUANT_FUNCTION | NEUREKA_MASK_SHIFT_AMOUNT | + NEUREKA_MASK_NORM_MODE | NEUREKA_MASK_FLAG_NORM_BIAS | + NEUREKA_MASK_FLAG_NORM_SHIFT); + task->data.cfg.conf0 |= NEUREKA_FLAG_NORM_QUANT | quant.function | + (quant.shift_amount << 16) | norm.mode | + norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | + norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT; +} + +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_WEIGHT_OFFSET_MODE; + task->data.cfg.conf0 |= weight_offset_mode; + task->data.cfg.weight_offset_factor = weight_offset; +} + +void neureka_task_set_input_signed(neureka_task_t *task) { + task->data.cfg.conf0 |= NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_input_unsigned(neureka_task_t *task) { + task->data.cfg.conf0 &= ~NEUREKA_FLAG_INPUT_SIGNED; +} + +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source) { + task->data.cfg.conf0 &= ~NEUREKA_MASK_FLAG_WEIGHT_SOURCE; + task->data.cfg.conf0 |= weight_source; +} + +/** neureka_pad_ptr + * + * Calculate the pointer to the start of the ptr as if + * it was the start to the padded data. 
+ * Necessary for input pointer when it's padded. + */ +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * width_stride; +} + +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { + task->data.infeat_ptr = + neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left); + task->data.outfeat_ptr = output_ptr; + task->data.weights_ptr = weights_ptr; + task->data.scale_ptr = scale_ptr; + task->data.scale_shift_ptr = shift_ptr; + task->data.scale_bias_ptr = bias_ptr; +} + +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride) { + const uint32_t num_k_in = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + + const neureka_stride_t input_stride = { + .d0 = w_in_stride, .d1 = h_in_stride, .d2 = 0}; + task->data.cfg.input_stride = input_stride; + + const neureka_stride_t output_stride = {.d0 = NEUREKA_OUTPUT_BANDWIDTH_BYTES, + .d1 = w_out_stride, + .d2 = h_out_stride}; + task->data.cfg.output_stride = output_stride; + + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES; + if (task->kernel_shape == 1) { // 1x1 + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in; + } else if (!task->depthwise) { // 3x3 + task->data.cfg.weights_stride.d1 = + NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in; + } else { // 3x3 depthwise + task->data.cfg.weights_stride.d1 = 0; + } + task->data.cfg.weights_stride.d2 = 0; +} + +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, 
const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right) { + const uint16_t num_Ko = + nnx_calculate_number_of_tiles(k_out, task->subtile_output_channel); + const uint16_t num_Ki = + nnx_calculate_number_of_tiles(k_in, task->subtile_input_channel); + const uint16_t num_Ho = + nnx_calculate_number_of_tiles(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t num_Wo = + nnx_calculate_number_of_tiles(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + + const uint16_t rem_Ko = + nnx_calculate_last_tile_size(k_out, task->subtile_output_channel); + const uint16_t rem_Ki = + nnx_calculate_last_tile_size(k_in, task->subtile_input_channel); + const uint16_t rem_Ho = + nnx_calculate_last_tile_size(h_out, NEUREKA_SUBTILE_OUTPUT_HEIGHT); + const uint16_t rem_Wo = + nnx_calculate_last_tile_size(w_out, NEUREKA_SUBTILE_OUTPUT_WIDTH); + const uint16_t rem_Hi = + (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; + const uint16_t rem_Wi = + (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - padding_right; + + const neureka_subtile_t subtile = { + .number = {.KoKi = nnx_concat_half(num_Ko, num_Ki), + .HoWo = nnx_concat_half(num_Ho, num_Wo)}, + .remainder = {.KoKi = nnx_concat_half(rem_Ko, rem_Ki), + .HoWo = nnx_concat_half(rem_Ho, rem_Wo), + .HiWi = nnx_concat_half(rem_Hi, rem_Wi)}}; + task->data.cfg.subtile = subtile; +} + +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value) { + task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | + ((bottom & 0xf) << 20) | ((left & 0xf) << 16) | + (value & 0xff); +} + +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left) { + task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | + ((bottom & 0xff) << 8) | ((left & 0xff) << 0); +} + +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left) { + neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride, + w_out_stride); + neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, + padding_right); + neureka_task_set_padding(task, padding_top, padding_bottom, padding_left, + padding_right, 0); +} diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h new file mode 100644 index 0000000..2d06468 --- /dev/null +++ b/neureka/hal/neureka_task.h @@ -0,0 +1,187 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_TASK_H__ +#define __NEUREKA_TASK_H__ + +#include "neureka_task_defs.h" +#include + +typedef enum neureka_task_flag_e { + neurekaTaskFlagFalse = 0, + neurekaTaskFlagTrue = 1 +} neureka_task_flag_e; + +typedef enum neureka_weight_source_e { + neurekaWeightSourceTcdm = NEUREKA_FLAG_WEIGHT_SOURCE_TCDM, + neurekaWeightSourceWmem = NEUREKA_FLAG_WEIGHT_SOURCE_WMEM +} neureka_weight_source_e; + +typedef enum neureka_weight_offset_mode_e { + weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, + weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE +} neureka_weight_offset_mode_e; + +typedef enum { + normMode8Bit = NEUREKA_NORM_MODE_8BIT, + normMode32Bit = NEUREKA_NORM_MODE_32BIT +} neureka_norm_mode_e; + +typedef struct neureka_norm_t { + neureka_norm_mode_e mode; + int flag_bias; + int flag_shift; +} neureka_norm_t; + +typedef enum neureka_quant_mode_e { + quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, + quantMode32Bit = NEUREKA_QUANT_MODE_32BIT +} neureka_quant_mode_e; + +typedef enum neureka_quant_function_e { + quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, + quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU +} neureka_quant_function_e; + +typedef struct neureka_quant_t { + // Shift amount must be in range 0x00-0x1F + unsigned shift_amount; + neureka_quant_function_e function; + int flag_rounding; +} neureka_quant_t; + +typedef struct neureka_stride_t { + uint32_t d0; + uint32_t d1; + uint32_t d2; +} 
neureka_stride_t; + +typedef struct neureka_subtile_remainder_t { + uint32_t KoKi; + uint32_t HoWo; + uint32_t HiWi; +} neureka_subtile_remainder_t; + +typedef struct neureka_subtile_number_t { + uint32_t KoKi; + uint32_t HoWo; +} neureka_subtile_number_t; + +typedef struct neureka_subtile_t { + neureka_subtile_remainder_t remainder; + neureka_subtile_number_t number; +} neureka_subtile_t; + +typedef struct neureka_cfg_t { + neureka_stride_t input_stride; + neureka_stride_t output_stride; + neureka_stride_t weights_stride; + neureka_subtile_t subtile; + uint32_t padding; + uint32_t weight_offset_factor; + uint32_t filter_mask; + uint32_t conf0; +} neureka_cfg_t; + +typedef struct neureka_task_data_t { + uint32_t weights_ptr; + uint32_t infeat_ptr; + uint32_t outfeat_ptr; + uint32_t scale_ptr; + uint32_t scale_shift_ptr; + uint32_t scale_bias_ptr; + neureka_cfg_t cfg; +} neureka_task_data_t; + +typedef struct neureka_task_t { + neureka_task_data_t data; + uint8_t qw; + uint8_t subtile_output_channel; + uint8_t subtile_input_channel; + uint8_t kernel_shape; + uint8_t depthwise; + uint8_t id; +} neureka_task_t; + +void neureka_task_init(neureka_task_t *task); +void neureka_task_set_op_to_conv(neureka_task_t *task, + const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t stride); +void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits, + const uint8_t output_bits, + const uint8_t weight_bits); +void neureka_task_set_norm_quant(neureka_task_t *task, neureka_quant_t quant, + neureka_norm_t norm); +void neureka_task_set_weight_offset( + neureka_task_t *task, neureka_weight_offset_mode_e weight_offset_mode, + const int32_t weight_offset); +void neureka_task_set_input_signed(neureka_task_t *task); +void neureka_task_set_input_unsigned(neureka_task_t *task); +void neureka_task_set_weight_source(neureka_task_t *task, + neureka_weight_source_e weight_source); +uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t 
i_width, uint32_t n_height, + uint32_t n_width); +uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t width_stride, const uint8_t padding_top, + const uint8_t padding_left); +void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t w_in_stride, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr); +/** neureka_task_set_strides + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. + */ +void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_in_stride, + const uint32_t w_in_stride, + const uint32_t h_out_stride, + const uint32_t w_out_stride); +void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, + const uint8_t padding_bottom, + const uint8_t padding_right); +void neureka_task_set_padding(neureka_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); +void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top, + const uint8_t right, const uint8_t bottom, + const uint8_t left); +/** neureka_task_set_dims + * + * All the strides variables are strides between elements alongside that + * dimension and expressed in bytes. There is no stride variable for the channel + * dimension because the N-EUREKA requires the channels to be contiguous. 
+ */ +void neureka_task_set_dims( + neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t h_in_stride, const uint32_t w_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t h_out_stride, const uint32_t w_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, const uint8_t padding_left); + +#endif // !__NEUREKA_TASK_H__ diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h new file mode 100644 index 0000000..fa08289 --- /dev/null +++ b/neureka/hal/neureka_task_defs.h @@ -0,0 +1,124 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NEUREKA_DEFS_H__ +#define __NEUREKA_DEFS_H__ + +/* ARHITECTURE */ + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28) + +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6) +#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) + +#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32) + +/* TASK REGISTERS */ + +// job configuration +#define NEUREKA_REG_WEIGHTS_PTR 0 +#define NEUREKA_REG_INFEAT_PTR 1 +#define NEUREKA_REG_OUTFEAT_PTR 2 +#define NEUREKA_REG_SCALE_PTR 3 +#define NEUREKA_REG_SCALE_SHIFT_PTR 4 +#define NEUREKA_REG_SCALE_BIAS_PTR 5 +#define NEUREKA_REG_INFEAT_D0_STRIDE 6 +#define NEUREKA_REG_INFEAT_D1_STRIDE 7 +#define NEUREKA_REG_INFEAT_D2_STRIDE 8 +#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9 +#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10 +#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11 +#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12 +#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13 +#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14 +#define NEUREKA_REG_SUBTILE_REMAINDER_0 15 +#define NEUREKA_REG_SUBTILE_REMAINDER_1 16 +#define NEUREKA_REG_SUBTILE_REMAINDER_2 17 +#define NEUREKA_REG_SUBTILE_NUMBER_0 18 +#define NEUREKA_REG_SUBTILE_NUMBER_1 19 +#define NEUREKA_REG_PADDING 20 +#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21 +#define NEUREKA_REG_FILTER_MASKING 22 +#define NEUREKA_REG_CONF0 23 + +/* SHIFT */ + +#define NEUREKA_SHIFT_FLAG_INPUT_SIGNED (26) +#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) +#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) +#define NEUREKA_SHIFT_QUANT_SHIFT (16) + +/* CONF0 FLAGS */ + +#define NEUREKA_FLAG_INPUT_SIGNED (1 << 26) +#define NEUREKA_FLAG_NORM_BIAS (1 << 25) +#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) +#define 
NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) +#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) +#define NEUREKA_QUANT_MODE_8BIT (0 << 21) +#define NEUREKA_QUANT_MODE_32BIT (2 << 21) +// conf0[20:16] - quantization shift amount +#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE \ + (1 << 15) // Unimplemented in gvsoc +#define NEUREKA_FLAG_STREAMIN (1 << 14) +#define NEUREKA_NORM_MODE_8BIT (0 << 12) +#define NEUREKA_NORM_MODE_32BIT (2 << 12) +#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) +#define NEUREKA_FLAG_WEIGHT_SOURCE_WMEM (1 << 9) +#define NEUREKA_FLAG_WEIGHT_SOURCE_TCDM (0 << 9) +#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) // not tested +#define NEUREKA_FLAG_MODE_3x3 (0 << 5) +#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) +#define NEUREKA_FLAG_MODE_1x1 (2 << 5) +#define NEUREKA_FLAG_NORM_QUANT (1 << 4) + +/* Masks */ + +#define NEUREKA_MASK_FLAG_INPUT_SIGNED (0x1 << 26) +#define NEUREKA_MASK_FLAG_NORM_BIAS (0x1 << 25) +#define NEUREKA_MASK_FLAG_NORM_SHIFT (0x1 << 24) +#define NEUREKA_MASK_QUANT_FUNCTION (0x1 << 23) +#define NEUREKA_MASK_QUANT_MODE (0x3 << 21) +#define NEUREKA_MASK_SHIFT_AMOUNT (0x1f << 16) +#define NEUREKA_MASK_WEIGHT_OFFSET_MODE (0x1 << 15) +#define NEUREKA_MASK_NORM_MODE (0x3 << 12) +#define NEUREKA_MASK_FLAG_ACTIVATION_PREFETCH (0x1 << 10) +#define NEUREKA_MASK_FLAG_WEIGHT_SOURCE (0x1 << 9) +#define NEUREKA_MASK_FLAG_MODE (0x3 << 5) +#define NEUREKA_MASK_FLAG_WEIGHT_BITS (0x7 << 0) + +/* PADDING */ + +#define NEUREKA_DONT_PAD (0) +#define NEUREKA_MAX_PAD (2) + +/* NORM */ +#define NEUREKA_NORM_MAX_LEN (32) + +#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h deleted file mode 100644 index e8ecba5..0000000 --- a/neureka/inc/pulp_nnx_defs.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_DEFS_H__ -#define __NEUREKA_DEFS_H__ - -/* ARHITECTURE */ - -#define NEUREKA_FILTER_SIZE (6) -#define NEUREKA_FILTER_BUFFER_SIZE (8) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_CONTEXT_SIZE (2) -#define NEUREKA_WEIGHT_BANDWIDTH (256) - -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16) - -/* REGISTER MAP */ - -#define NEUREKA_EVT0 12 -#define NEUREKA_EVT1 13 -#define NEUREKA_BASE_ADDR 0x00201000 -#define WEIGHT_MEM_BASE 0x10400000 -#define SRAM_OFFSET 0x00400000 -#define MRAM_OFFSET 0x00000000 - -// Cluster -#define CLUSTER_CTRL_BASE_ADDR 0x00200000 -#define CLUSTER_CTRL_HWPE_OFFS 0x18 -#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800 - -/* REGISTER OFFSETS */ - -// commands -#define NEUREKA_TRIGGER 0x00 -#define NEUREKA_ACQUIRE 0x04 -#define NEUREKA_FINISHED 0x08 -#define NEUREKA_STATUS 0x0C -#define NEUREKA_RUNNING_JOB 0x10 -#define NEUREKA_SOFT_CLEAR 0x14 -#define NEUREKA_SWSYNC 0x18 -#define NEUREKA_URISCY_IMEM 0x1C - -// job configuration -#define NEUREKA_REGISTER_OFFSET 0x20 - -#define NEUREKA_REG_WEIGHTS_PTR 0x00 -#define NEUREKA_REG_INFEAT_PTR 0x04 -#define NEUREKA_REG_OUTFEAT_PTR 
0x08 -#define NEUREKA_REG_SCALE_PTR 0x0C -#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10 -#define NEUREKA_REG_SCALE_BIAS_PTR 0x14 -#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18 -#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C -#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20 -#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24 -#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28 -#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C -#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30 -#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34 -#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38 -#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C -#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40 -#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44 -#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48 -#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C -#define NEUREKA_REG_PADDING 0x50 -#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54 -#define NEUREKA_REG_FILTER_MASKING 0x58 -#define NEUREKA_REG_CONF0 0x5C - -// Simulation only -#define NEUREKA_REG_GVSOC_TRACE 0x60 - -/* SHIFT */ - -#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) -#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) -#define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) - -/* CONF0 FLAGS */ - -#define NEUREKA_FLAG_NORM_BIAS (1 << 25) -#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) -#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) -#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) -#define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) -#define NEUREKA_QUANT_MODE_32BIT (2 << 21) -// conf0[20:16] - quantization shift amount -#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) -#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) -#define NEUREKA_FLAG_STREAMIN (1 << 14) -#define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) -#define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) -#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define 
NEUREKA_FLAG_STRIDED_MODE (1 << 8) -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) -#define NEUREKA_FLAG_MODE_3x3 (0 << 5) -#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) -#define NEUREKA_FLAG_MODE_1x1 (2 << 5) -#define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) - -/* Masks */ - -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) - -/* Miscellaneous */ - -// Padding -#define MAX_PAD (0xf) - -// Normalization -#define NEUREKA_NORM_MAX_LEN (32) -#define NO_NORM(length) \ - { \ - .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \ - .length = length, .mode = normMode32Bit \ - } - -// Quantization -#define NO_QUANT \ - { \ - .shift_amount = 0, .mode = quantMode32Bit, \ - .function = quantFunctionIdentity \ - } - -// GVSOC trace levels -#define NEUREKA_TRACE_LEVEL_JOB_START_END 0 -#define NEUREKA_TRACE_LEVEL_CONFIG 1 -#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2 -#define NEUREKA_TRACE_LEVEL_ALL 3 - -// null -#define NEUREKA_NULL ((void *)0) -#define NEUREKA_STATUS_FULL (0x101) - -#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h deleted file mode 100644 index 40bcec0..0000000 --- a/neureka/inc/pulp_nnx_hal.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_H__ -#define __NEUREKA_H__ - -#include - -#include "pulp_nnx_defs.h" -#include "pulp_nnx_error_codes.h" - -#define NEUREKA_CG_ENABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \ - CLUSTER_CTRL_HWPE_CG_EN_MASK -#define NEUREKA_CG_DISABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \ - ~CLUSTER_CTRL_HWPE_CG_EN_MASK - -#define NEUREKA_WRITE(offset, value) \ - *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value) -#define NEUREKA_WRITE_BE(offset, value, be) \ - *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value) -#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) - -#define NEUREKA_WRITE_IO_REG(offset, value) \ - NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value)) -#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \ - NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be)) -#define NEUREKA_READ_IO_REG(offset) \ - NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset)) - -#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0) -#define NEUREKA_BARRIER() \ - do { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BUSYWAIT() \ - do { \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; -#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; - -#define DIVNCEIL(A, B) (((A - 1) / B) + 1) -#define REMAINDER(A, B) (((A - 1) % B) + 1) -#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff)) - -#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE - 
-#define FLAG_USED (1) -#define FLAG_UNUSED (0) - -typedef enum { - weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, - weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE -} nnx_weight_offset_mode_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - uint16_t n_weights; - uint32_t bitwidth; - int32_t offset_factor; - nnx_weight_offset_mode_e offset_mode; -} nnx_weights_t; - -typedef enum { - featureBitwidth8Bit = 8, - featureBitwidth16Bit = 16, - featureBitwidth32Bit = 32 -} nnx_feature_bitwidth_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - nnx_feature_bitwidth_e bitwidth; -} nnx_feature_t; - -typedef enum { - normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, - normMode32Bit = NEUREKA_NORM_MODE_32BIT -} nnx_norm_mode_e; - -typedef struct { - nnx_norm_mode_e mode; - int flag_bias; - int flag_shift; -} nnx_norm_t; - -typedef enum { - quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, - quantMode32Bit = NEUREKA_QUANT_MODE_32BIT -} nnx_quant_mode_e; - -typedef enum { - quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, - quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU -} nnx_quant_function_e; - -// TODO: add rounding to quant. Should also be an enum? Best boolean... 
-typedef struct { - // Shift amount must be in range 0x00-0x1F - unsigned shift_amount; - nnx_quant_mode_e mode; - nnx_quant_function_e function; - int flag_rounding; -} nnx_quant_t; - -typedef struct { - uint32_t d0; - uint32_t d1; - uint32_t d2; -} nnx_stride_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; - uint32_t HiWi; -} nnx_subtile_remainder_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; -} nnx_subtile_number_t; - -typedef struct { - nnx_subtile_remainder_t remainder; - nnx_subtile_number_t number; -} nnx_subtile_t; - -typedef struct { - nnx_stride_t input_stride; - nnx_stride_t output_stride; - nnx_stride_t weights_stride; - nnx_subtile_t subtile; - uint32_t padding; - uint32_t weight_offset_factor; - uint32_t filter_mask; - uint32_t conf0; -} nnx_cfg_t; - -typedef struct { - uint32_t weights_ptr; - uint32_t infeat_ptr; - uint32_t outfeat_ptr; - uint32_t scale_ptr; - uint32_t scale_shift_ptr; - uint32_t scale_bias_ptr; - nnx_cfg_t cfg; -} nnx_task_t; - -int nnx_job_id(); -int nnx_empty(); -int nnx_full(); -void nnx_soft_clear(); -int nnx_acquire(); -void nnx_offload(nnx_task_t *task); -void nnx_offload_ptr(nnx_task_t *task); -void nnx_run_async(); -void nnx_run_blocking(); -void nnx_commit(); -void nnx_wait_empty(); -void nnx_wait_not_full(); -void nnx_wait_on_id(int id); -void nnx_busywait(); - -void nnx_task_init(nnx_task_t *task); -int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom, - uint32_t left, uint16_t value); -int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant); -void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom, - uint8_t left); -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t 
output); -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); - -#endif /* __NEUREKA_H__ */ diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h deleted file mode 100644 index f29ff3e..0000000 --- a/neureka/inc/pulp_nnx_util.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __PULP_NNX_UTIL__ -#define __PULP_NNX_UTIL__ - -void nnx_activate_gvsoc_logging(int use_dec); -void nnx_deactivate_gvsoc_logging(); - -#endif /* __PULP_NNX_UTIL__ */ diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c deleted file mode 100644 index 1d99691..0000000 --- a/neureka/src/pulp_nnx_hal.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "pulp_nnx_hal.h" -#include "pmsis.h" - -static int qw, weight_d0_stride, outbytes; - -// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and -// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise -// the compiler is not able to correctly factorize the NEUREKA base in case -// several accesses are done, ending up with twice more code - -// __builtin_pulp_OffsetedX not defined - needs further investigation... (too -// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK... 
- -int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); } - -int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; } - -int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); } - -void nnx_soft_clear() { - NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0); - for (volatile int i = 0; i < 10; i++) - ; -} - -int nnx_acquire() { - int job_id = -1; - NEUREKA_BARRIER_ACQUIRE(job_id); - return job_id; -} - -void nnx_offload(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_offload_ptr(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < 6; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); } - -void nnx_run_blocking() { - nnx_run_async(); - nnx_wait_empty(); -} - -void nnx_commit() { - NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger -} - -void nnx_busywait() { NEUREKA_BUSYWAIT(); } - -void nnx_wait_empty() { - while (!nnx_empty()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_not_full() { - while (nnx_full()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_on_id(const int id) { - while (nnx_job_id() <= id) { - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); - }; -} - -void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); } - -int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right, - const uint32_t bottom, const uint32_t left, - const uint16_t value) { - uint32_t padding = 0; - uint32_t flags = 0; - - if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) { - return 1; - } - - cfg->padding = - (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value; - - return 0; -} - -int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm, - const nnx_quant_t quant) { - if (quant.shift_amount > 31) { - printf("ERROR! 
quant.shift_amount > 31\n"); - return 1; - } - - if (quant.mode == quantMode16Bit) { - printf("ERROR! quant.mode == quantMode16Bit\n"); - return 1; - } - - BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | - norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT); - - return 0; -} - -void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right, - const uint8_t bottom, const uint8_t left) { - cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) | - ((uint32_t)bottom << 8) | ((uint32_t)left << 0); -} - -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho; - const int rem_Wi = rem_Wo; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_in, - .d1 = k_in * w_out, - .d2 = k_in * 3 * 3 // copying arpan - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - 
cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = weight_d0_stride * qw, - .d1 = weight_d0_stride * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height != output.height || input.width != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = {.d0 = k_in, - .d1 = k_in * (w_out + 2), - .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE * - NEUREKA_FILTER_BUFFER_SIZE}; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3, - .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights, 
- const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ki = num_Ko; - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ki = rem_Ko; - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_out, - .d1 = k_out * (w_out + 2), - .d2 = 0 // Unused - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride, - .d1 = 0, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return 
weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != output.depth) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c index 7ab0e99..f9799fc 100644 --- a/src/pulp_nnx_ne16.c +++ b/src/pulp_nnx_ne16.c @@ -79,25 +79,20 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, uint32_t size_j, uint32_t size_k, uint32_t stride_j, uint32_t stride_k, uint32_t overlap_i, uint32_t overlap_j, - uint32_t offset_i, uint32_t offset_j, - uint8_t data_size) { - return ptr + - (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * - data_size / 8 + - (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; + uint32_t offset_i, uint32_t offset_j) { + return ptr + (i * (size_i - overlap_i) - offset_i) * stride_j + + (j * (size_j - overlap_j) - offset_j) * stride_k; } -void ne16_nnx_dispatch_stride2x2( - ne16_dev_t *dev, 
ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker) { +void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task, + const uint32_t w_in, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t h_ker, + const uint8_t w_ker) { const uint8_t stride = 2; - const uint8_t bits = 8; - const uint32_t n_h = divnceil(h_out, stride); - const uint32_t n_w = divnceil(w_out, stride); + const uint32_t n_h = nnx_calculate_number_of_tiles(h_out, stride); + const uint32_t n_w = nnx_calculate_number_of_tiles(w_out, stride); const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0; const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0; @@ -109,15 +104,15 @@ void ne16_nnx_dispatch_stride2x2( for (int i = 0; i < n_h; i++) { for (int j = 0; j < n_w; j++) { - task->data.infeat_ptr = - _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, - w_in_stride, k_in_stride, h_ker - stride, - w_ker - stride, i == 0 ? 0 : input_height_offset, - j == 0 ? 0 : input_width_offset, bits); - task->data.outfeat_ptr = - _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, - k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, - j == 0 ? 0 : output_width_offset, bits); + task->data.infeat_ptr = _get_tile_ptr( + input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, + task->data.cfg.input_stride.d1, task->data.cfg.input_stride.d0, + h_ker - stride, w_ker - stride, i == 0 ? 0 : input_height_offset, + j == 0 ? 0 : input_width_offset); + task->data.outfeat_ptr = _get_tile_ptr( + output_base, i, j, 2, 2, k_out, task->data.cfg.output_stride.d2 << 1, + task->data.cfg.output_stride.d1 << 1, 0, 0, + i == 0 ? 
0 : output_height_offset, j == 0 ? 0 : output_width_offset); task->data.cfg.padding = ne16_get_tile_padding(tile_padding, i, j, n_h, n_w); diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c new file mode 100644 index 0000000..0abb845 --- /dev/null +++ b/src/pulp_nnx_neureka.c @@ -0,0 +1,76 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pulp_nnx_neureka.h" +#include "hwpe.h" +#include "neureka.h" +#include "pulp_nnx_util.h" +#include +#include +#include + +void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf) { + neureka_siracusa_open(conf); + hwpe_soft_clear(&dev->hwpe_dev); +} + +void neureka_nnx_term(neureka_dev_t *dev) { + hwpe_soft_clear(&dev->hwpe_dev); + neureka_siracusa_close(); +} + +int neureka_nnx_dispatch_check(neureka_dev_t *dev) { + return !neureka_task_queue_full(dev); +} + +void neureka_nnx_dispatch_wait(neureka_dev_t *dev) { + while (!neureka_nnx_dispatch_check(dev)) { + neureka_siracusa_event_wait_and_clear(); + } +} + +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) { + if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) { + return 1; + } + hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data, + (int)(sizeof(neureka_task_data_t) / 4)); + hwpe_task_queue_release_and_run(&dev->hwpe_dev); + return 0; +} + +int 
neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + // GVSOC model has a broken running_id so resolve_check + // conservativly looks if the task queue is empty. + return neureka_task_queue_empty(dev); +#else + uint8_t prev_task_id = task->id - 1; + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !neureka_task_queue_empty(dev))); +#endif +} + +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { + while (!neureka_nnx_resolve_check(dev, task)) { + neureka_siracusa_event_wait_and_clear(); + } +} diff --git a/test/.isort.cfg b/test/.isort.cfg new file mode 100644 index 0000000..127bf37 --- /dev/null +++ b/test/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +profile=black +line_length=88 +skip_gitignore=true diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index 5abb204..07dc597 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -48,8 +48,9 @@ def define(self, name, expr): if isinstance(expr, str): expr = f'"{expr}"' elif isinstance(expr, bool): - expr = int(expr) - expr = f"({expr})" + expr = f"({int(expr)})" + else: + expr = f"({expr})" return f"#define {name.upper()} {expr}\n" def vector_size(self, data): @@ -158,7 +159,7 @@ def generate_vector_source(self, name, size, _type, init=None, golden=None): if golden is not None: render += self.render_vector( - "golden_" + name, "PI_L1 " + _type, size, init=golden + "golden_" + name, "PI_L2 " + _type, size, init=golden ) render += self.check(name) diff --git a/test/Ne16.py b/test/Ne16.py deleted file mode 100644 index 6de5ab5..0000000 --- a/test/Ne16.py +++ /dev/null @@ -1,94 +0,0 @@ -# Luka Macan -# -# Copyright 2023 ETH Zurich and University of Bologna -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import numpy.typing as npt -from TestClasses import IntegerType - - -class Ne16: - ACCUMULATOR_TYPE = IntegerType(name="int32") - - _CIN_SUBTILE = 16 - - @staticmethod - def weight_unroll( - weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False - ) -> npt.NDArray[np.uint8]: - """Unroll weight into expected memory format - - Expected weight shape is (Cout, Cin, H, W). - The output shape is: (Cout, Cin_major, Bits, H x W, Cin_minor_bytes), - where Cin_major is the ceil(Cin / CIN_SUBTILE) and Cin_minor has to be padded with 0 to CIN_SUBTILE. - """ - if depthwise: - weight = weight.transpose(1, 0, 2, 3) # Swap Cout and Cin - - Cout, Cin, H, W = weight.shape - - # Pad Cin to be divisible with CIN_SUBTILE - if Cin % Ne16._CIN_SUBTILE != 0: - Cin_pad = Ne16._CIN_SUBTILE - Cin % Ne16._CIN_SUBTILE - weight = np.pad( - weight, - ((0, 0), (0, Cin_pad), (0, 0), (0, 0)), - "constant", - constant_values=0, - ) - - # Reshape into (Cout, Cin_major, Cin_minor, Flattened spatial, 1) - # The 1 at the end is required by the unpacking - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - weight = weight.reshape(Cout, Cin_major, Cin_minor, H * W, 1) - - # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] - # (Cout, Cin_major, Cin_minor, Flattened spatial, Bits) - weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") - - # Shuffle bits so that the final shape is: - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor) - weight = weight.transpose(0, 1, 4, 3, 2) - - # Prepare for packing - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes, 8) - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) - weight = np.stack(np.split(weight, Cin_minor_bytes, axis=-1), axis=-2) - - # Pack - # (Cout, Cin_major, Bits, Flattened spatial, Cin_minor_bytes) - weight = np.packbits(weight, axis=-1, bitorder="little") - - return weight.flatten() - - @staticmethod - def weight_roll(weight: np.ndarray, bits: int, Cout: int, Cin: int, H: int, W: int): - """Reverse of weight_roll""" - Cin_major = int(np.ceil(Cin / Ne16._CIN_SUBTILE)) - Cin_minor = Ne16._CIN_SUBTILE - Cin_minor_bytes = int(np.ceil(Cin_minor / 8)) - - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor_bytes, 1) - weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") - weight = weight.reshape(Cout, Cin_major, bits, H * W, Cin_minor) - weight = weight.transpose(0, 1, 4, 3, 2) - weight = np.packbits(weight, axis=-1, bitorder="little") - weight = weight.reshape(Cout, Cin_major * Cin_minor, H, W) - weight = weight[:, :Cin, :, :] - - return weight diff --git a/test/Ne16MemoryLayout.py b/test/Ne16MemoryLayout.py new file mode 100644 index 0000000..30729ab --- /dev/null +++ b/test/Ne16MemoryLayout.py @@ -0,0 +1,99 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import numpy.typing as npt + + +class Ne16MemoryLayout: + _CIN_SUBTILE = 16 + + @staticmethod + def weightEncode( + weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False + ) -> npt.NDArray[np.uint8]: + """Unroll weight into expected memory format + + Expected weight shape is (cout, cin, height, width). + The output shape is: (cout, cinMajor, Bits, height x width, cinMinorBytes), + where cinMajor is the ceil(cin / CIN_SUBTILE) and cinMinor has to be padded with 0 to CIN_SUBTILE. + """ + if depthwise: + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin + + cout, cin, height, width = weight.shape + + # Pad cin to be divisible with CIN_SUBTILE + if cin % Ne16MemoryLayout._CIN_SUBTILE != 0: + cinPad = Ne16MemoryLayout._CIN_SUBTILE - cin % Ne16MemoryLayout._CIN_SUBTILE + weight = np.pad( + weight, + ((0, 0), (0, cinPad), (0, 0), (0, 0)), + "constant", + constant_values=0, + ) + cin = cin + cinPad + + # Reshape into (cout, cinMajor, cinMinor, flattened spatial, 1) + # The 1 at the end is required by the unpacking + cinMajor = cin // Ne16MemoryLayout._CIN_SUBTILE + cinMinor = Ne16MemoryLayout._CIN_SUBTILE + weight = weight.reshape(cout, cinMajor, cinMinor, height * width, 1) + + # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] + # (cout, cinMajor, cinMinor, flattened spatial, Bits) + weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + + # Shuffle bits so that the final shape is: + # (cout, cinMajor, Bits, flattened spatial, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + + # Prepare for packing + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes, 8) + cinMinorBytes = int(np.ceil(cinMinor / 8)) + weight = np.stack(np.split(weight, cinMinorBytes, axis=-1), axis=-2) + + # Pack + # (cout, cinMajor, Bits, flattened spatial, cinMinorBytes) + weight = np.packbits(weight, axis=-1, bitorder="little") + + return weight.flatten() + + @staticmethod + def weightDecode( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, + width: int, + ) -> npt.NDArray[np.uint8]: + """Reverse of weight_roll""" + cinMajor = int(np.ceil(cin / Ne16MemoryLayout._CIN_SUBTILE)) + cinMinor = Ne16MemoryLayout._CIN_SUBTILE + cinMinorBytes = int(np.ceil(cinMinor / 8)) + + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinorBytes, 1) + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") + weight = weight.reshape(cout, cinMajor, bits, height * width, cinMinor) + weight = weight.transpose(0, 1, 4, 3, 2) + weight = np.packbits(weight, axis=-1, bitorder="little") + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] + + return weight diff --git a/test/Ne16TestConf.py b/test/Ne16TestConf.py new file mode 100644 index 0000000..f2e66ad --- /dev/null +++ b/test/Ne16TestConf.py @@ -0,0 +1,111 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from NnxTestClasses import NnxTestConf +from TestClasses import IntegerType, KernelShape, Stride, implies + + +class Ne16TestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1) or v == Stride( + height=2, width=2 + ), f"Unsupported stride {v}. Supported 1x1 and 2x2." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. 
Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("in_type", v, ["uint8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + Ne16TestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + Ne16TestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_out_channel_stride_with_stride_2x2(self) -> Ne16TestConf: + assert implies( + self.stride == Stride(height=2, width=2), + self.out_channel * (self.out_type._bits // 8) % 2 == 0, + ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" + return self + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise_kernel_shape(self) -> Ne16TestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." 
+ return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_norm_quant(self) -> Ne16TestConf: + assert implies( + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + ) + return self diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py new file mode 100644 index 0000000..08b3601 --- /dev/null +++ b/test/NeuralEngineFunctionalModel.py @@ -0,0 +1,123 @@ +from typing import Optional + +import torch +import torch.nn.functional as F + +from TestClasses import IntegerType, Padding, Stride + + +class NeuralEngineFunctionalModel: + ACCUMULATOR_TYPE = IntegerType(name="int32") + + @staticmethod + def _cast( + tensor: torch.Tensor, _type: IntegerType, saturate: bool = False + ) -> torch.Tensor: + if saturate: + return tensor.clamp(_type.min, _type.max) + else: + return tensor & ((1 << _type._bits) - 1) + + def _norm_quant( + self, + tensor: torch.Tensor, + scale: torch.Tensor, + bias: Optional[torch.Tensor], + global_shift: torch.Tensor, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_bias: bool, + has_relu: bool, + ) -> torch.Tensor: + # Scale accumulators are in 48bit, so keeping the data in 64bit + tensor = tensor * scale + assert tensor.dtype == torch.int64 + + if has_bias: + assert bias is not None + assert bias_type is not None + # Saturating cast to int32 + tensor = NeuralEngineFunctionalModel._cast( + tensor, bias_type, saturate=True + ).type(torch.int32) + + tensor = tensor + bias + tensor = NeuralEngineFunctionalModel._cast( + tensor, bias_type, saturate=False + ).type(torch.int32) + + if has_relu: + tensor = F.relu(tensor) + + tensor = tensor >> global_shift + + # Saturate into out_type + tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True) 
+ + return tensor + + def convolution( + self, + input: torch.Tensor, + weight: torch.Tensor, + scale: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + global_shift: Optional[torch.Tensor], + padding: Padding, + stride: Stride, + depthwise: bool, + out_type: IntegerType, + bias_type: Optional[IntegerType], + has_norm_quant: bool, + has_bias: bool, + has_relu: bool, + verbose: bool = False, + **kwargs, + ) -> torch.Tensor: + _ = kwargs + + input_padded = F.pad( + input, + ( + padding.left, + padding.right, + padding.top, + padding.bottom, + ), + "constant", + 0, + ) + + # Accumulators are 32bit non-saturating. + # Calculate in higher precision (int64) + output = F.conv2d( + input=input_padded, + weight=weight, + stride=(stride.height, stride.width), + groups=weight.shape[0] if depthwise else 1, + ).type(torch.int64) + + # Cast to accumulator type + output = NeuralEngineFunctionalModel._cast( + output, NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, saturate=False + ).type(torch.int32) + + if verbose: + print("INTERMEDIATE RESULTS (pre-normalization/requant):") + print(output) + + if has_norm_quant: + assert scale is not None + assert global_shift is not None + output = self._norm_quant( + output, + scale, + bias, + global_shift, + out_type, + bias_type, + has_bias, + has_relu, + ) + + return output diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py new file mode 100644 index 0000000..80a2786 --- /dev/null +++ b/test/NeurekaMemoryLayout.py @@ -0,0 +1,158 @@ +# Luka Macan +# Arpan Suravi Prasad +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import numpy.typing as npt + +from TestClasses import IntegerType + + +class NeurekaMemoryLayout: + _WEIGHT_BANDWIDTH = 256 + _CIN_SUBTILE_1x1 = 32 + _CIN_SUBTILE_3x3 = 28 + + @staticmethod + def weightEncode( + weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False + ) -> npt.NDArray[np.uint8]: + """Unroll weight into expected memory format + + Expected weight shape is (cout, cin, H, W). + The produced memory layout depends on the weight kernel shape: + - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits), + - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits), + where cinMajor is the ceil(cin / cin subtile ) and cinMinor has to be padded with 0 to cin subtile . + """ + if depthwise: + weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin + + cout, cin, height, width = weight.shape + cinSubtile = ( + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + ) + + # Pad cin to be divisible with CIN_SUBTILE + if cin % cinSubtile != 0: + cinPad = cinSubtile - cin % cinSubtile + weight = np.pad( + weight, + ((0, 0), (0, cinPad), (0, 0), (0, 0)), + "constant", + constant_values=0, + ) + + # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1) + # The 1 at the end is required by the unpacking + cinMajor = int(np.ceil(cin / cinSubtile)) + weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1) + + # Unpack 'bits' bits in little order, e.g. 
bits=4: 3 => [1, 1, 0, 0] + # (cout, cinMajor, cinSubtile, Flattened spatial, Bits) + weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little") + + # Shuffle bits so that the final shape is: + # (cout, cinMajor, Bits, Flattened spatial, cinSubtile) + weight = weight.transpose(0, 1, 4, 3, 2) + + # Pack dimensions to fit into weight bandwidth + if height == 3 and width == 3: + # (cout * cinMajor * Bits, H * W * cinSubtile) + weight = weight.reshape(-1, height * width * cinSubtile) + # Pad only the last dimension to weight bandwidth size + # (-1, Weight Bandwidth) + weight = np.pad( + weight, + ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])), + "constant", + constant_values=0, + ) + elif height == 1 and width == 1: + # Tile cinSubtile into tiles of size 4 + # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = weight.reshape( + cout, cinMajor, bits, height * width, cinSubtile // 4, 4 + ) # cout, cinMajor, bits, 1, 8, 4 + # Pad bits to 8 + if bits < 8: + # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile) + weight = np.pad( + weight, + ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)), + mode="constant", + constant_values=0, + ) + # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile) + weight = weight.transpose(0, 1, 3, 4, 2, 5) + # (-1, Weight Bandwidth) + weight = weight.reshape( + cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH + ) # cout*cinMajor, 256b + + # Prepare for packing + # (-1, Weight Bandwidth Bytes, 8) + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) + weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2) + + # Pack bits + # (-1, Weight Bandwidth Bytes) + weight = np.packbits(weight, axis=-1, bitorder="little") + + return weight.flatten() + + @staticmethod + def weightDecode( + weight: npt.NDArray[np.uint8], + bits: int, + cout: int, + cin: int, + height: int, 
+ width: int, + ) -> npt.NDArray[np.uint8]: + """Reverse of weightEncode""" + cinSubtile = ( + NeurekaMemoryLayout._CIN_SUBTILE_3x3 + if height == 3 + else NeurekaMemoryLayout._CIN_SUBTILE_1x1 + ) + cinMajor = int(np.ceil(cin / cinSubtile)) + cinMinor = cinSubtile + weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8)) + + weight = weight.reshape(-1, weightBandwidthBytes, 1) + weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little") + weight = weight.reshape(-1, NeurekaMemoryLayout._WEIGHT_BANDWIDTH) + + if height == 3 and width == 3: + weight = weight[:, : height * width * cinMinor] + weight = weight.reshape( + cout, cinMajor, bits, height * width, cinMinor + ).transpose(0, 1, 4, 3, 2) + elif height == 1 and width == 1: + weight = weight[:, : height * width * cinMinor * 8] + weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose( + 0, 1, 2, 4, 3 + ) + weight = np.packbits(weight, axis=-1, bitorder="little") + weight = weight.reshape(cout, cinMajor * cinMinor, height, width) + weight = weight[:, :cin, :, :] + + return weight diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py new file mode 100644 index 0000000..f878e68 --- /dev/null +++ b/test/NeurekaTestConf.py @@ -0,0 +1,101 @@ +# Luka Macan +# +# Copyright 2023 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import List, Optional, Union + +from pydantic import field_validator, model_validator + +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from NnxTestClasses import NnxTestConf +from TestClasses import IntegerType, KernelShape, Stride, implies + + +class NeurekaTestConf(NnxTestConf): + @field_validator("kernel_shape") + @classmethod + def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: + assert v == KernelShape(height=1, width=1) or v == KernelShape( + height=3, width=3 + ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." + return v + + @field_validator("stride") + @classmethod + def check_valid_stride(cls, v: Stride) -> Stride: + assert v == Stride(height=1, width=1), f"Unsupported stride {v}. Supported 1x1." + return v + + @staticmethod + def _check_type( + name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] + ) -> None: + assert ( + _type in allowed_types + ), f"Unsupported {name} type {_type}. 
Supported types: {allowed_types}" + + @field_validator("in_type") + @classmethod + def check_valid_in_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("in_type", v, ["uint8", "int8"]) + return v + + @field_validator("out_type") + @classmethod + def check_valid_out_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("out_type", v, ["uint8", "int8", "int32"]) + return v + + @field_validator("weight_type") + @classmethod + def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: + NeurekaTestConf._check_type("weight_type", v, ["int8"]) + return v + + @field_validator("scale_type") + @classmethod + def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("scale_type", v, ["uint8", "uint32"]) + return v + + @field_validator("bias_type") + @classmethod + def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: + if v is not None: + NeurekaTestConf._check_type("bias_type", v, ["int32"]) + return v + + @model_validator(mode="after") # type: ignore + def check_valid_depthwise_kernel_shape(self) -> NeurekaTestConf: + assert implies( + self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) + ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_norm_quant(self) -> NeurekaTestConf: + assert implies( + not self.has_norm_quant, + self.out_type == NeuralEngineFunctionalModel.ACCUMULATOR_TYPE, + ), ( + f"Without quantization, the output type has to be equal to the " + f"accumulator type {NeuralEngineFunctionalModel.ACCUMULATOR_TYPE}. 
Given output type {self.out_type}" + ) + return self diff --git a/test/Ne16TestClasses.py b/test/NnxTestClasses.py similarity index 53% rename from test/Ne16TestClasses.py rename to test/NnxTestClasses.py index d99e829..a7aaa00 100644 --- a/test/Ne16TestClasses.py +++ b/test/NnxTestClasses.py @@ -17,18 +17,21 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import List, Union, Optional, Set, Tuple -import torch -import numpy as np -import torch.nn.functional as F + import os -from Ne16 import Ne16 +from typing import Callable, Optional, Set, Tuple, Type, Union + +import numpy as np +import numpy.typing as npt +import torch +from pydantic import BaseModel, PositiveInt, field_validator, model_validator + from HeaderWriter import HeaderWriter -from TestClasses import implies, KernelShape, Padding, Stride, IntegerType -from pydantic import BaseModel, field_validator, model_validator, PositiveInt +from NeuralEngineFunctionalModel import NeuralEngineFunctionalModel +from TestClasses import IntegerType, KernelShape, Padding, Stride, implies -class Ne16TestConf(BaseModel): +class NnxTestConf(BaseModel): in_height: PositiveInt in_width: PositiveInt in_channel: PositiveInt @@ -46,74 +49,8 @@ class Ne16TestConf(BaseModel): has_bias: bool has_relu: bool - @field_validator("kernel_shape") - @classmethod - def check_valid_kernel_shape(cls, v: KernelShape) -> KernelShape: - assert v == KernelShape(height=1, width=1) or v == KernelShape( - height=3, width=3 - ), f"Unsupported kernel shape {v}. Supported 1x1 and 3x3." - return v - - @field_validator("stride") - @classmethod - def check_valid_stride(cls, v: Stride) -> Stride: - assert v == Stride(height=1, width=1) or v == Stride( - height=2, width=2 - ), f"Unsupported stride {v}. Supported 1x1 and 2x2." 
- return v - - @staticmethod - def _check_type( - name: str, _type: IntegerType, allowed_types: List[Union[IntegerType, str]] - ) -> None: - assert ( - _type in allowed_types - ), f"Unsupported {name} type {_type}. Supported types: {allowed_types}" - - @field_validator("in_type") - @classmethod - def check_valid_in_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("in_type", v, ["uint8"]) - return v - - @field_validator("out_type") - @classmethod - def check_valid_out_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("out_type", v, ["uint8", "int8"]) - return v - - @field_validator("weight_type") - @classmethod - def check_valid_weight_type(cls, v: IntegerType) -> IntegerType: - Ne16TestConf._check_type("weight_type", v, ["int8"]) - return v - - @field_validator("scale_type") - @classmethod - def check_valid_scale_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("scale_type", v, ["uint8", "uint32"]) - return v - - @field_validator("bias_type") - @classmethod - def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType]: - if v is not None: - Ne16TestConf._check_type("bias_type", v, ["int32"]) - return v - @model_validator(mode="after") # type: ignore - def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: - assert implies( - self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 - ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" - return self - - @model_validator(mode="after") # type: ignore - def check_valid_depthwise(self) -> Ne16TestConf: - assert implies( - self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) - ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." 
+ def check_valid_depthwise_channels(self) -> NnxTestConf: assert implies(self.depthwise, self.in_channel == self.out_channel), ( f"Input and output channel should be the same in a depthwise layer. " f"input channel: {self.in_channel}, output channel: {self.out_channel}" @@ -121,21 +58,15 @@ def check_valid_depthwise(self) -> Ne16TestConf: return self @model_validator(mode="after") # type: ignore - def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: + def check_valid_padding_with_kernel_shape_1x1(self) -> NnxTestConf: assert implies( self.kernel_shape == KernelShape(height=1, width=1), self.padding == Padding(top=0, bottom=0, left=0, right=0), ), f"No padding on 1x1 kernel. Given padding {self.padding}" return self - @field_validator("has_norm_quant") - @classmethod - def check_valid_has_norm_quant(cls, v: bool) -> bool: - assert v == True, f"Untested without has_norm_quant." - return v - @model_validator(mode="after") # type: ignore - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> NnxTestConf: if self.has_norm_quant: assert self.scale_type is not None, "Scale type was not provided." if self.has_bias: @@ -143,25 +74,31 @@ def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: return self @model_validator(mode="after") # type: ignore - def check_valid_out_type_with_flags(self) -> Ne16TestConf: - assert implies( - not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE - ), ( - f"Without quantization, the output type has to be equal to the " - f"accumulator type {Ne16.ACCUMULATOR_TYPE}. Given output type {self.out_type}" + def check_has_relu_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_relu, self.has_norm_quant), ( + f"Relu flag can only be enabled when norm_quant is enabled. 
" + f"Given has_relu {self.has_relu} and has_norm_quant {self.has_norm_quant}" ) - assert implies( - self.has_norm_quant, - (self.has_relu and not self.out_type._signed) - or (not self.has_relu and self.out_type._signed), - ), ( + return self + + @model_validator(mode="after") # type: ignore + def check_has_bias_with_norm_quant(self) -> NnxTestConf: + assert implies(self.has_bias, self.has_norm_quant), ( + f"Bias flag can only be enabled when norm_quant is enabled. " + f"Given has_bias {self.has_bias} and has_norm_quant {self.has_norm_quant}" + ) + return self + + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_relu(self) -> NnxTestConf: + assert self.has_relu ^ self.out_type._signed, ( f"Output type has to be unsigned when there is relu, otherwise signed. " f"Given output type {self.out_type} and has_relu {self.has_relu}" ) return self -class Ne16Test: +class NnxTest: _CONF_NAME = "conf.json" _INPUT_NAME = "input.pt" _OUTPUT_NAME = "output.pt" @@ -172,7 +109,7 @@ class Ne16Test: def __init__( self, - conf: Ne16TestConf, + conf: NnxTestConf, input: Optional[torch.Tensor], output: Optional[torch.Tensor], weight: Optional[torch.Tensor], @@ -188,7 +125,7 @@ def __init__( self.bias = bias self.global_shift = global_shift - def is_valid(self): + def is_valid(self) -> bool: return all( [ self.input is not None, @@ -203,22 +140,22 @@ def is_valid(self): def save_conf(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - with open(os.path.join(path, Ne16Test._CONF_NAME), "w") as fp: + with open(os.path.join(path, NnxTest._CONF_NAME), "w") as fp: fp.write(self.conf.model_dump_json(indent=4)) def save_data(self, path: Union[str, os.PathLike]) -> None: os.makedirs(path, exist_ok=True) - torch.save(self.input, os.path.join(path, Ne16Test._INPUT_NAME)) - torch.save(self.output, os.path.join(path, Ne16Test._OUTPUT_NAME)) - torch.save(self.weight, os.path.join(path, Ne16Test._WEIGHT_NAME)) + torch.save(self.input, 
os.path.join(path, NnxTest._INPUT_NAME)) + torch.save(self.output, os.path.join(path, NnxTest._OUTPUT_NAME)) + torch.save(self.weight, os.path.join(path, NnxTest._WEIGHT_NAME)) if self.scale is not None: - torch.save(self.scale, os.path.join(path, Ne16Test._SCALE_NAME)) + torch.save(self.scale, os.path.join(path, NnxTest._SCALE_NAME)) if self.bias is not None: - torch.save(self.bias, os.path.join(path, Ne16Test._BIAS_NAME)) + torch.save(self.bias, os.path.join(path, NnxTest._BIAS_NAME)) if self.global_shift is not None: torch.save( - self.global_shift, os.path.join(path, Ne16Test._GLOBAL_SHIFT_NAME) + self.global_shift, os.path.join(path, NnxTest._GLOBAL_SHIFT_NAME) ) def save(self, path: Union[str, os.PathLike]) -> None: @@ -228,154 +165,111 @@ def save(self, path: Union[str, os.PathLike]) -> None: @staticmethod def is_test_dir(path: Union[str, os.PathLike]) -> bool: fileset = set(os.listdir(path)) - required_fileset = set([Ne16Test._CONF_NAME]) + required_fileset = set([NnxTest._CONF_NAME]) return required_fileset.issubset(fileset) @classmethod - def load(cls, path: Union[str, os.PathLike]) -> "Ne16Test": - assert Ne16Test.is_test_dir( + def load(cls, confCls: Type[NnxTestConf], path: Union[str, os.PathLike]) -> NnxTest: + assert NnxTest.is_test_dir( path ), f"ERROR: Test {path} does not contain the necessary files." 
- with open(os.path.join(path, Ne16Test._CONF_NAME), "r") as fp: - conf = Ne16TestConf.model_validate_json(fp.read()) + with open(os.path.join(path, NnxTest._CONF_NAME), "r") as fp: + conf = confCls.model_validate_json(fp.read()) def load_if_exist(filename: str) -> Optional[torch.Tensor]: filepath = os.path.join(path, filename) return torch.load(filepath) if os.path.isfile(filepath) else None - input = load_if_exist(Ne16Test._INPUT_NAME) - output = load_if_exist(Ne16Test._OUTPUT_NAME) - weight = load_if_exist(Ne16Test._WEIGHT_NAME) - scale = load_if_exist(Ne16Test._SCALE_NAME) - bias = load_if_exist(Ne16Test._BIAS_NAME) - global_shift = load_if_exist(Ne16Test._GLOBAL_SHIFT_NAME) + input = load_if_exist(NnxTest._INPUT_NAME) + output = load_if_exist(NnxTest._OUTPUT_NAME) + weight = load_if_exist(NnxTest._WEIGHT_NAME) + scale = load_if_exist(NnxTest._SCALE_NAME) + bias = load_if_exist(NnxTest._BIAS_NAME) + global_shift = load_if_exist(NnxTest._GLOBAL_SHIFT_NAME) return cls(conf, input, output, weight, scale, bias, global_shift) -class Ne16TestGenerator: +class NnxTestGenerator: _DEFAULT_SEED = 0 @staticmethod - def _global_shift( - tensor: torch.Tensor, out_type: IntegerType, has_relu: bool + def _calculate_global_shift( + tensor: torch.Tensor, out_type: IntegerType ) -> torch.Tensor: - if has_relu: - # only adjust positive values - tensor = tensor[tensor > 0] - + """Calculate global shift so that the output values are in the range of out_type""" s = tensor.type(torch.float64).std() target_s = 2 ** (out_type._bits - 1) - global_shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32) - - return global_shift + return torch.ceil(torch.log2(s / target_s)).type(torch.int32) @staticmethod - def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]): + def _random_data(_type: IntegerType, shape: Tuple): return torch.randint(_type.min, _type.max, size=shape) - @staticmethod - def _cast( - tensor: torch.Tensor, _type: IntegerType, saturate: bool = False - ) 
-> torch.Tensor: - if saturate: - return tensor.clamp(_type.min, _type.max) - else: - return tensor & ((1 << _type._bits) - 1) - @staticmethod def from_conf( - conf: Ne16TestConf, + conf: NnxTestConf, input: Optional[torch.Tensor] = None, weight: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, global_shift: Optional[torch.Tensor] = None, - ) -> Ne16Test: - torch.manual_seed(Ne16TestGenerator._DEFAULT_SEED) + verbose: bool = False, + ) -> NnxTest: + torch.manual_seed(NnxTestGenerator._DEFAULT_SEED) + + input_shape = (1, conf.in_channel, conf.in_height, conf.in_width) + weight_shape = ( + conf.out_channel, + 1 if conf.depthwise else conf.in_channel, + conf.kernel_shape.height, + conf.kernel_shape.width, + ) + scale_shape = (1, conf.out_channel, 1, 1) + bias_shape = (1, conf.out_channel, 1, 1) if input is None: - input = Ne16TestGenerator._random_data( + input = NnxTestGenerator._random_data( _type=conf.in_type, - shape=(1, conf.in_channel, conf.in_height, conf.in_width), + shape=input_shape, ) - input_padded = F.pad( - input, - ( - conf.padding.left, - conf.padding.right, - conf.padding.top, - conf.padding.bottom, - ), - "constant", - 0, - ) - if weight is None: - weight = Ne16TestGenerator._random_data( + weight = NnxTestGenerator._random_data( _type=conf.weight_type, - shape=( - conf.out_channel, - 1 if conf.depthwise else conf.in_channel, - conf.kernel_shape.height, - conf.kernel_shape.width, - ), + shape=weight_shape, ) - # Accumulators are 32bit non-saturating. 
- # Calculate in higher precision (int64) - output = F.conv2d( - input=input_padded, - weight=weight, - stride=(conf.stride.height, conf.stride.width), - groups=conf.in_channel if conf.depthwise else 1, - ).type(torch.int64) - # Use only the lower 32bits - output = Ne16TestGenerator._cast( - output, Ne16.ACCUMULATOR_TYPE, saturate=False - ).type(torch.int32) - if conf.has_norm_quant: if scale is None: assert conf.scale_type is not None - scale = Ne16TestGenerator._random_data( - conf.scale_type, shape=(1, conf.out_channel, 1, 1) + scale = NnxTestGenerator._random_data( + conf.scale_type, shape=scale_shape ) - # Scale accumulators are in 48bit, so keeping the data in 64bit - output = scale * output - assert output.dtype == torch.int64 - - if conf.has_bias: - # Saturating cast to int32 + if conf.has_bias and bias is None: assert conf.bias_type is not None - output = Ne16TestGenerator._cast( - output, conf.bias_type, saturate=True - ).type(torch.int32) - - if bias is None: - bias = Ne16TestGenerator._random_data( - conf.bias_type, shape=(1, conf.out_channel, 1, 1) - ).type(torch.int32) - output = output + bias - output = Ne16TestGenerator._cast( - output, conf.bias_type, saturate=False + bias = NnxTestGenerator._random_data( + conf.bias_type, shape=bias_shape ).type(torch.int32) - - if conf.has_relu: - output = F.relu(output) - if global_shift is None: - global_shift = Ne16TestGenerator._global_shift( - output, conf.out_type, conf.has_relu + global_shift = torch.Tensor([0]).type(torch.int32) + output = NeuralEngineFunctionalModel().convolution( + input, + weight, + scale, + bias, + global_shift, + verbose=verbose, + **conf.__dict__, ) - output = output >> global_shift + NnxTestGenerator._calculate_global_shift(output, conf.out_type) - # Saturate into out_type - output = Ne16TestGenerator._cast(output, conf.out_type, saturate=True) + output = NeuralEngineFunctionalModel().convolution( + input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__ + ) - 
return Ne16Test( + return NnxTest( conf=conf, input=input, output=output, @@ -386,28 +280,38 @@ def from_conf( ) @staticmethod - def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test: + def regenerate(test: NnxTest, regen_tensors: Set[str]) -> NnxTest: test_tensors = set(["input", "output", "weight", "scale", "bias"]) load_tensors = test_tensors - regen_tensors kwargs = {tensor: getattr(test, tensor) for tensor in load_tensors} - return Ne16TestGenerator.from_conf(test.conf, **kwargs) + return NnxTestGenerator.from_conf(test.conf, **kwargs) -class Ne16TestHeaderGenerator: +class NnxTestHeaderGenerator: DEFAULT_HEADERS_DIR = "app/gen" - def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None): + def __init__( + self, + weightEncode: Callable[ + [npt.NDArray[np.uint8], int, bool], npt.NDArray[np.uint8] + ], + headers_dir: Optional[Union[str, os.PathLike]] = None, + ): if headers_dir is None: - headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR + headers_dir = NnxTestHeaderGenerator.DEFAULT_HEADERS_DIR self.header_writer = HeaderWriter(headers_dir) + # function that takes the weights in CoutCinK format, bitwidth, and a depthwise flag, + # and returns a numpy array of dtype=np.uint8 of data in a layout correct for the accelerator + self.weightEncode = weightEncode - def generate(self, test_name: str, test: Ne16Test): + def generate(self, test_name: str, test: NnxTest): assert test.input is not None and test.output is not None _, in_channel, in_height, in_width = test.input.shape _, out_channel, out_height, out_width = test.output.shape # Render input in_ctype = test.conf.in_type.ctype() + in_signed = test.conf.in_type._signed in_data = test.input.permute(0, 2, 3, 1).ravel() self.header_writer.generate_vector_files( "input", _type=in_ctype, size=in_data.numel(), init=in_data @@ -431,10 +335,10 @@ def generate(self, test_name: str, test: Ne16Test): weight_offset = -(2 ** (weight_bits - 1)) weight_out_ch, weight_in_ch, weight_ks_h, 
weight_ks_w = test.weight.shape weight_data: np.ndarray = test.weight.numpy() - weight_offset - weight_init = Ne16.weight_unroll( + weight_init = self.weightEncode( weight_data.astype(np.uint8), weight_type._bits, - depthwise=test.conf.depthwise, + test.conf.depthwise, ) self.header_writer.generate_vector_files( "weight", _type="uint8_t", size=weight_init.size, init=weight_init @@ -470,13 +374,14 @@ def generate(self, test_name: str, test: Ne16Test): "height": in_height, "width": in_width, "channel": in_channel, - "bits": 8, + "signed": in_signed, + "bits": test.conf.in_type._bits, }, "output": { "height": out_height, "width": out_width, "channel": out_channel, - "bits": 8, + "bits": test.conf.out_type._bits, }, "weight": { "height": weight_ks_h, @@ -486,8 +391,16 @@ def generate(self, test_name: str, test: Ne16Test): "bits": weight_bits, "offset": weight_offset, }, - "scale": {"bits": 8}, - "bias": {"bits": 32}, + "scale": { + "bits": test.conf.scale_type._bits + if test.conf.scale_type is not None + else 0 + }, + "bias": { + "bits": test.conf.bias_type._bits + if test.conf.bias_type is not None + else 0 + }, "padding": { "top": test.conf.padding.top, "bottom": test.conf.padding.bottom, diff --git a/test/README.md b/test/README.md index c3d29c5..8442493 100644 --- a/test/README.md +++ b/test/README.md @@ -35,3 +35,9 @@ $ pytest test.py --help - [testgen.py](testgen.py): collection of helper tools for individual tests For more information you can run the script with the `-h` flag. + +## Application + +The Makefile in the `app/` uses a flag `ACCELERATOR` to decide which accelerator to use. +The choices are _ne16_ or _neureka_. +You can either export it or run it like `ACCELERATOR= make clean all run`. 
diff --git a/test/TestClasses.py b/test/TestClasses.py index c10641c..c6267d6 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -16,15 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from functools import cached_property import re -from typing import Any, Dict, Literal, Optional, TYPE_CHECKING +from functools import cached_property +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional + from pydantic import ( BaseModel, - model_serializer, - model_validator, NonNegativeInt, PositiveInt, + model_serializer, + model_validator, ) diff --git a/test/app/Makefile b/test/app/Makefile index 14f30fd..ca65892 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -40,6 +40,8 @@ INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp INC_DIRS += gen/inc INC_FLAGS += $(addprefix -I,$(INC_DIRS)) +APP_CFLAGS += $(INC_FLAGS) + # Source files @@ -58,7 +60,9 @@ APP_SRCS += $(wildcard gen/src/*.c) # Flags -APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto -APP_LDFLAGS += -flto +ACCELERATOR_UPPERCASE := $(shell echo $(ACCELERATOR) | tr [:lower:] [:upper:]) +APP_CFLAGS += -DNNX_ACCELERATOR=\"$(ACCELERATOR)\" -DNNX_$(ACCELERATOR_UPPERCASE) + +APP_CFLAGS += -O2 -w -Wall -Werror include $(RULES_DIR)/pmsis_rules.mk diff --git a/test/app/src/main.c b/test/app/src/main.c index cc67050..7cce4bf 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -29,8 +29,9 @@ int main() { struct pi_cluster_conf cl_conf; struct pi_cluster_task cl_task; - printf("\n"); - printf("Test %s starting\n", TEST_NAME); + printf("\nTest " TEST_NAME " starting\n"); + + printf("\nAccelerator: " NNX_ACCELERATOR "\n"); printf("\n"); layer_info(); @@ -43,13 +44,13 @@ int main() { } pi_cluster_send_task_to_cl( &cl_dev, pi_cluster_task(&cl_task, execute_nnx_layer, NULL)); - pi_cluster_close(&cl_dev); - - printf("\n"); - printf("Test %s finished\n", TEST_NAME); printf("\n"); check_output(); + pi_cluster_close(&cl_dev); + + printf("\nTest " TEST_NAME " finished\n"); + return 0; } 
diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index ffd93a1..0d98ff6 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -19,12 +19,89 @@ */ #include "nnx_layer.h" +#include + +#ifdef NNX_NE16 + #include "ne16.h" #include "ne16_gvsoc.h" #include "ne16_pulp_bsp.h" #include "ne16_task.h" #include "pulp_nnx_ne16.h" -#include + +typedef ne16_norm_mode_e nnx_norm_mode_e; +typedef ne16_quant_t nnx_quant_t; +typedef ne16_norm_t nnx_norm_t; +typedef ne16_task_t nnx_task_t; +typedef ne16_dev_t nnx_dev_t; +typedef ne16_pulp_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue ne16TaskFlagTrue +#define nnxTaskFlagFalse ne16TaskFlagFalse + +#define nnx_task_init ne16_task_init +#define nnx_task_set_op_to_conv ne16_task_set_op_to_conv +#define nnx_task_set_bits ne16_task_set_bits +#define nnx_task_set_norm_quant ne16_task_set_norm_quant +#define nnx_task_set_weight_offset ne16_task_set_weight_offset +#define nnx_task_set_dims ne16_task_set_dims +#define nnx_task_set_dims_stride2x2 ne16_task_set_dims_stride2x2 +#define nnx_task_set_ptrs ne16_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL NE16_GVSOC_LOG_LEVEL_ALL +#define NNX_GVSOC_LOG_FORMAT NE16_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate ne16_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate ne16_gvsoc_log_deactivate + +#define nnx_bsp_get_dev ne16_pulp_get_dev + +#define nnx_init ne16_nnx_init +#define nnx_dispatch_wait ne16_nnx_dispatch_wait +#define nnx_dispatch_stride2x2 ne16_nnx_dispatch_stride2x2 +#define nnx_dispatch ne16_nnx_dispatch +#define nnx_resolve_wait ne16_nnx_resolve_wait +#define nnx_term ne16_nnx_term + +#elif defined NNX_NEUREKA + +#include "neureka.h" +#include "neureka_gvsoc.h" +#include "neureka_siracusa_bsp.h" +#include "neureka_task.h" +#include "pulp_nnx_neureka.h" + +typedef neureka_norm_mode_e nnx_norm_mode_e; +typedef neureka_quant_t nnx_quant_t; +typedef neureka_norm_t nnx_norm_t; +typedef neureka_task_t nnx_task_t; +typedef neureka_dev_t 
nnx_dev_t; +typedef neureka_siracusa_conf_t nnx_bsp_conf_t; + +#define nnxTaskFlagTrue neurekaTaskFlagTrue +#define nnxTaskFlagFalse neurekaTaskFlagFalse + +#define nnx_task_init neureka_task_init +#define nnx_task_set_op_to_conv neureka_task_set_op_to_conv +#define nnx_task_set_bits neureka_task_set_bits +#define nnx_task_set_norm_quant neureka_task_set_norm_quant +#define nnx_task_set_weight_offset neureka_task_set_weight_offset +#define nnx_task_set_dims neureka_task_set_dims +#define nnx_task_set_ptrs neureka_task_set_ptrs + +#define NNX_GVSOC_LOG_LEVEL NEUREKA_GVSOC_LOG_LEVEL_ALL +#define NNX_GVSOC_LOG_FORMAT NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL +#define nnx_gvsoc_log_activate neureka_gvsoc_log_activate +#define nnx_gvsoc_log_deactivate neureka_gvsoc_log_deactivate + +#define nnx_bsp_get_dev neureka_siracusa_get_dev + +#define nnx_init neureka_nnx_init +#define nnx_dispatch_wait neureka_nnx_dispatch_wait +#define nnx_dispatch neureka_nnx_dispatch +#define nnx_resolve_wait neureka_nnx_resolve_wait +#define nnx_term neureka_nnx_term + +#endif // NNX_NE16 || NNX_NEUREKA // Generated headers #include "bias.h" @@ -34,73 +111,109 @@ #include "scale.h" #include "weight.h" -static void task_prepare(ne16_task_t *task) { - ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, - WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, - (ne16_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, - .function = HAS_RELU ? quantFunctionRelu - : quantFunctionIdentity, - .flag_rounding = ne16TaskFlagFalse}, - (ne16_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? 
ne16TaskFlagTrue - : ne16TaskFlagFalse, - .flag_shift = ne16TaskFlagFalse}, - STRIDE_HEIGHT); - - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_task_set_dims_stride2x2( - task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, - OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } else { - ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } - - ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, - INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, - (uint32_t)weight, (uint32_t)scale, NULL, +static void task_prepare(nnx_task_t *task) { + nnx_task_init(task); + nnx_task_set_op_to_conv(task, WEIGHT_HEIGHT, GROUPS > 1, STRIDE_HEIGHT); + nnx_task_set_bits(task, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS); + +#if HAS_NORM_QUANT == 1 +#if SCALE_BITS == 8 + const nnx_norm_mode_e normMode = normMode8Bit; +#elif SCALE_BITS == 32 + const nnx_norm_mode_e normMode = normMode32Bit; +#endif + + nnx_task_set_norm_quant( + task, + (nnx_quant_t){.shift_amount = OUTSHIFT, + .function = + HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, + .flag_rounding = nnxTaskFlagFalse}, + (nnx_norm_t){.mode = normMode, + .flag_bias = HAS_BIAS ? 
nnxTaskFlagTrue : nnxTaskFlagFalse, + .flag_shift = nnxTaskFlagFalse}); +#endif // HAS_NORM_QUANT + + nnx_task_set_weight_offset(task, weightOffsetModeLayerWise, WEIGHT_OFFSET); + +#ifdef NNX_NEUREKA +#ifdef NEUREKA_WEIGHT_SOURCE_WMEM + neureka_task_set_weight_source(task, neurekaWeightSourceWmem); +#else + neureka_task_set_weight_source(task, neurekaWeightSourceTcdm); +#endif +#if INPUT_SIGNED == 1 + neureka_task_set_input_signed(task); +#else + neureka_task_set_input_unsigned(task); +#endif +#endif + + const uint32_t w_in_stride = INPUT_CHANNEL * INPUT_BITS / 8; + const uint32_t h_in_stride = INPUT_WIDTH * w_in_stride; + const uint32_t w_out_stride = OUTPUT_CHANNEL * OUTPUT_BITS / 8; + const uint32_t h_out_stride = OUTPUT_WIDTH * w_out_stride; + +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, w_out_stride, + WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); +#else + nnx_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, h_in_stride, w_in_stride, + OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, h_out_stride, + w_out_stride, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, + PADDING_LEFT); +#endif + + nnx_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, w_in_stride, + PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight, +#if HAS_NORM_QUANT == 1 + (uint32_t)scale, NULL, #if HAS_BIAS == 1 - (uint32_t)bias + (uint32_t)bias +#else + NULL +#endif #else - NULL + NULL, NULL, NULL #endif ); } -static void task_execute(ne16_task_t *task) { - ne16_dev_t *dev = ne16_pulp_get_dev(); +static void task_execute(nnx_task_t *task) { + nnx_dev_t *dev = nnx_bsp_get_dev(); - ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG, - NE16_GVSOC_LOG_FORMAT_HEXADECIMAL); +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + nnx_gvsoc_log_activate(dev, NNX_GVSOC_LOG_LEVEL, 
NNX_GVSOC_LOG_FORMAT); +#endif - ne16_pulp_conf_t conf = {.max_stall = 8}; - ne16_nnx_init(dev, &conf); + nnx_bsp_conf_t conf = {.max_stall = 8}; + nnx_init(dev, &conf); - ne16_nnx_dispatch_wait(dev); + nnx_dispatch_wait(dev); - if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { - ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, - WEIGHT_HEIGHT, WEIGHT_WIDTH); - } else { - ne16_nnx_dispatch(dev, task); - } +#if STRIDE_HEIGHT == 2 && STRIDE_WIDTH == 2 + nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, + OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, + WEIGHT_WIDTH); +#else + nnx_dispatch(dev, task); +#endif - ne16_nnx_resolve_wait(dev, task); + nnx_resolve_wait(dev, task); - ne16_nnx_term(dev); + nnx_term(dev); - ne16_gvsoc_log_deactivate(dev); +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + nnx_gvsoc_log_deactivate(dev); +#endif } void execute_nnx_layer(void *args) { - ne16_task_t task; + nnx_task_t task; task_prepare(&task); task_execute(&task); } diff --git a/test/conf.toml b/test/conf.toml index 1222f1d..c24055a 100644 --- a/test/conf.toml +++ b/test/conf.toml @@ -22,7 +22,7 @@ # Ne16TestClasses.py:Ne16TestConf().check_valid() # Input dimensions -in_height = 3 +in_height = 4 in_width = 3 in_channel = 8 diff --git a/test/conftest.py b/test/conftest.py index 6c2c15b..3c0a316 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -18,7 +18,17 @@ import os from typing import Union -from Ne16TestClasses import Ne16Test, Ne16TestGenerator + +import pydantic +import pytest + +from Ne16MemoryLayout import Ne16MemoryLayout +from Ne16TestConf import Ne16TestConf +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import NnxTest, NnxTestGenerator + +_SUPPORTED_ACCELERATORS = ["ne16", "neureka"] def pytest_addoption(parser): @@ -39,6 +49,13 @@ def 
pytest_addoption(parser): default=False, help="Recursively search for tests in given test directories.", ) + parser.addoption( + "-A", + "--accelerator", + choices=_SUPPORTED_ACCELERATORS, + default="ne16", + help="Choose an accelerator to test. Default: ne16", + ) parser.addoption( "--regenerate", action="store_true", @@ -54,7 +71,7 @@ def pytest_addoption(parser): def _find_test_dirs(path: Union[str, os.PathLike]): - return [dirpath for dirpath, _, _ in os.walk(path) if Ne16Test.is_test_dir(dirpath)] + return [dirpath for dirpath, _, _ in os.walk(path) if NnxTest.is_test_dir(dirpath)] def pytest_generate_tests(metafunc): @@ -62,6 +79,18 @@ def pytest_generate_tests(metafunc): recursive = metafunc.config.getoption("recursive") regenerate = metafunc.config.getoption("regenerate") timeout = metafunc.config.getoption("timeout") + nnxName = metafunc.config.getoption("accelerator") + + if nnxName == "ne16": + nnxMemoryLayoutCls = Ne16MemoryLayout + nnxTestConfCls = Ne16TestConf + elif nnxName == "neureka": + nnxMemoryLayoutCls = NeurekaMemoryLayout + nnxTestConfCls = NeurekaTestConf + else: + assert ( + False + ), f"Given accelerator {nnxName} not supported. 
Supported accelerators: {_SUPPORTED_ACCELERATORS}" if recursive: tests_dirs = test_dirs @@ -69,12 +98,28 @@ def pytest_generate_tests(metafunc): for tests_dir in tests_dirs: test_dirs.extend(_find_test_dirs(tests_dir)) - # (Re)Generate test data + # Load valid tests + nnxTestAndNames = [] for test_dir in test_dirs: - test = Ne16Test.load(test_dir) - if not test.is_valid() or regenerate: - test = Ne16TestGenerator.from_conf(test.conf) - test.save_data(test_dir) + try: + test = NnxTest.load(nnxTestConfCls, test_dir) + # (Re)generate data + if not test.is_valid() or regenerate: + test = NnxTestGenerator.from_conf(test.conf) + test.save_data(test_dir) + nnxTestAndNames.append((test, test_dir)) + except pydantic.ValidationError as e: + _ = e + nnxTestAndNames.append( + pytest.param( + (None, test_dir), + marks=pytest.mark.skipif( + True, reason=f"Invalid test {test_dir}: {e.errors}" + ), + ) + ) - metafunc.parametrize("path", test_dirs) + metafunc.parametrize("nnxTestAndName", nnxTestAndNames) metafunc.parametrize("timeout", [timeout]) + metafunc.parametrize("nnxName", [nnxName]) + metafunc.parametrize("nnxMemoryLayoutCls", [nnxMemoryLayoutCls]) diff --git a/test/requirements-dev.txt b/test/requirements-dev.txt index fa0a75a..0956e5e 100644 --- a/test/requirements-dev.txt +++ b/test/requirements-dev.txt @@ -1,2 +1,3 @@ pyright black +isort diff --git a/test/test.py b/test/test.py index 39709b6..1893cdf 100644 --- a/test/test.py +++ b/test/test.py @@ -16,13 +16,16 @@ # # SPDX-License-Identifier: Apache-2.0 +import locale import os import re -from typing import Union, Optional, Tuple -import locale import subprocess -from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator from pathlib import Path +from typing import Dict, Optional, Tuple, Type, Union + +from Ne16MemoryLayout import Ne16MemoryLayout +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NnxTestClasses import NnxTest, NnxTestConf, NnxTestHeaderGenerator HORIZONTAL_LINE = "\n" + "-" * 100 + "\n" @@ 
-49,17 +52,29 @@ def captured_output( def execute_command( - cmd: str, timeout: int = 30, cflags: Optional[str] = None + cmd: str, + timeout: int = 30, + cflags: Optional[str] = None, + envflags: Optional[Dict[str, str]] = None, ) -> Tuple[bool, str, str, Optional[str]]: - app_cflags = 'APP_CFLAGS="' + " ".join(cflags) + '" ' if cflags else "" - cmd = cmd + app_cflags + env = os.environ + if cflags: + env["APP_CFLAGS"] = '"' + " ".join(cflags) + '"' + if envflags: + for key, value in envflags.items(): + env[key] = value status = None stdout = None try: proc = subprocess.run( - cmd.split(), check=True, capture_output=True, text=True, timeout=timeout + cmd.split(), + check=True, + capture_output=True, + text=True, + timeout=timeout, + env=env, ) status = True msg = "OK" @@ -94,28 +109,35 @@ def assert_message( return retval -def test(path: str, timeout: int): - test_name = path - test = Ne16Test.load(path) - - Ne16TestHeaderGenerator().generate(test_name, test) +def test( + nnxTestAndName: Tuple[NnxTest, str], + timeout: int, + nnxName: str, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], +): + nnxTest, nnxTestName = nnxTestAndName + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + nnxTestName, nnxTest + ) Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" - passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout) + passed, msg, stdout, stderr = execute_command( + cmd=cmd, timeout=timeout, envflags={"ACCELERATOR": nnxName} + ) - assert passed, assert_message(msg, test_name, cmd, stdout, stderr) + assert passed, assert_message(msg, nnxTestName, cmd, stdout, stderr) match_success = re.search(r"> Success! No errors found.", stdout) match_fail = re.search(r"> Failure! 
Found (\d*)/(\d*) errors.", stdout) assert match_success or match_fail, assert_message( - "No regexes matched.", test_name, cmd, stdout + "No regexes matched.", nnxTestName, cmd, stdout ) assert not match_fail, assert_message( f"Errors found: {match_fail.group(1)}/{match_fail.group(2)}", - test_name, + nnxTestName, cmd, stdout, ) diff --git a/test/testgen.py b/test/testgen.py index e748f2e..521aecc 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -16,28 +16,61 @@ # # SPDX-License-Identifier: Apache-2.0 -import os import argparse import json +import os +from typing import Optional, Set, Type, Union + import toml -from typing import Optional, Union, Set -from Ne16TestClasses import ( - Ne16TestConf, - Ne16TestGenerator, - Ne16Test, - Ne16TestHeaderGenerator, + +from Ne16MemoryLayout import Ne16MemoryLayout +from Ne16TestConf import Ne16TestConf +from NeurekaMemoryLayout import NeurekaMemoryLayout +from NeurekaTestConf import NeurekaTestConf +from NnxTestClasses import ( + NnxTest, + NnxTestConf, + NnxTestGenerator, + NnxTestHeaderGenerator, ) -def headers_gen(args, test: Optional[Ne16Test] = None): +def headers_gen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], + test: Optional[NnxTest] = None, +): if test is None: - test = Ne16Test.load(args.test_dir) + test = NnxTest.load(nnxTestConfCls, args.test_dir) + assert test is not None if not test.is_valid(): - test = Ne16TestGenerator.from_conf(test.conf) - Ne16TestHeaderGenerator().generate(args.test_dir, test) - - -def test_gen(args): + test = NnxTestGenerator.from_conf(test.conf) + NnxTestHeaderGenerator(nnxMemoryLayoutCls.weightEncode).generate( + args.test_dir, test + ) + + +def print_tensors(test: NnxTest): + print("INPUT TENSOR:") + print(test.input) + print("WEIGHT TENSOR:") + print(test.weight) + print("SCALE TENSOR:") + print(test.scale) + print("BIAS TENSOR:") + print(test.bias) + print("GLOBAL SHIFT TENSOR:") + 
print(test.global_shift) + print("EXPECTED OUTPUT TENSOR:") + print(test.output) + + +def test_gen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], +): if args.conf.endswith(".toml"): test_conf_dict = toml.load(args.conf) elif args.conf.endswith(".json"): @@ -49,37 +82,71 @@ def test_gen(args): ) exit(-1) - test_conf = Ne16TestConf.model_validate(test_conf_dict) - test = Ne16TestGenerator.from_conf(test_conf) + test_conf = nnxTestConfCls.model_validate(test_conf_dict) + test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors) if not args.skip_save: test.save(args.test_dir) if args.headers: - headers_gen(args, test) - - -def _regen(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - test = Ne16Test.load(path) - test = Ne16TestGenerator.regenerate(test, regen_tensors) + headers_gen(args, nnxMemoryLayoutCls, nnxTestConfCls, test) + if args.print_tensors: + print_tensors(test) + + +def _regen( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + test = NnxTest.load(nnxTestConfCls, path) + test = NnxTestGenerator.regenerate(test, regen_tensors) test.save(path) -def _regen_recursive(path: Union[str, os.PathLike], regen_tensors: Set[str]) -> None: - if Ne16Test.is_test_dir(path): - _regen(path, regen_tensors) +def _regen_recursive( + path: Union[str, os.PathLike], + regen_tensors: Set[str], + nnxTestConfCls: Type[NnxTestConf], +) -> None: + if NnxTest.is_test_dir(path): + _regen(path, regen_tensors, nnxTestConfCls) return for dirpath, _, _ in os.walk(path): - _regen_recursive(dirpath, regen_tensors) + _regen_recursive(dirpath, regen_tensors, nnxTestConfCls) -def test_regen(args): +def test_regen( + args, + nnxMemoryLayoutCls: Union[Type[Ne16MemoryLayout], Type[NeurekaMemoryLayout]], + nnxTestConfCls: Type[NnxTestConf], +): + _ = nnxMemoryLayoutCls regen_tensors = set(args.tensors + ["output"]) for test_dir in 
args.test_dirs: if args.recurse: - _regen_recursive(test_dir, regen_tensors) + _regen_recursive(test_dir, regen_tensors, nnxTestConfCls) else: - _regen(test_dir, regen_tensors) + _regen(test_dir, regen_tensors, nnxTestConfCls) + + +def add_common_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "-t", + "--test-dir", + type=str, + dest="test_dir", + required=True, + help="Path to the test.", + ) + + parser.add_argument( + "-a", + "--accelerator", + choices=["ne16", "neureka"], + default="ne16", + help="Choose an accelerator. Default: ne16", + ) parser = argparse.ArgumentParser( @@ -91,14 +158,7 @@ def test_regen(args): parser_header = subparsers.add_parser( "headers", description="Generate headers for a single test." ) -parser_header.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test." "basename.", -) +add_common_arguments(parser_header) parser_header.set_defaults(func=headers_gen) parser_test = subparsers.add_parser( @@ -112,14 +172,6 @@ def test_regen(args): required=True, help="Path to the configuration file.", ) -parser_test.add_argument( - "-t", - "--test-dir", - type=str, - dest="test_dir", - required=True, - help="Path to the test. " "basename.", -) parser_test.add_argument( "--headers", action="store_true", default=False, help="Generate headers." ) @@ -130,6 +182,14 @@ def test_regen(args): dest="skip_save", help="Skip saving the test.", ) +parser_test.add_argument( + "--print-tensors", + action="store_true", + default=False, + dest="print_tensors", + help="Print tensor values to stdout.", +) +add_common_arguments(parser_test) parser_test.set_defaults(func=test_gen) parser_regen = subparsers.add_parser("regen", description="Regenerate test tensors.") @@ -138,25 +198,27 @@ def test_regen(args): type=str, nargs="?", default=[], - help="Tensors that should be regenerated. 
Output " "included by default.", -) -parser_regen.add_argument( - "-t", - "--test-dir", - action="append", - dest="test_dirs", - required=True, - help="Path to the test.", + help="Tensors that should be regenerated. Output included by default.", ) parser_regen.add_argument( "-r", "--recursive", action="store_true", default=False, - help="Recursively search for test directiories " "inside given test directories.", + help="Recursively search for test directiories inside given test directories.", ) +add_common_arguments(parser_regen) parser_regen.set_defaults(func=test_regen) args = parser.parse_args() -args.func(args) +if args.accelerator == "ne16": + nnxMemoryLayoutCls = Ne16MemoryLayout + nnxTestConfCls = Ne16TestConf +elif args.accelerator == "neureka": + nnxMemoryLayoutCls = NeurekaMemoryLayout + nnxTestConfCls = NeurekaTestConf +else: + assert False, f"Unsupported accelerator {args.accelerator}." + +args.func(args, nnxMemoryLayoutCls, nnxTestConfCls) diff --git a/test/tests/test_102/conf.json b/test/tests/test_102/conf.json new file mode 100644 index 0000000..d6d0c17 --- /dev/null +++ b/test/tests/test_102/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 4, + "in_width": 3, + "in_channel": 8, + "out_channel": 8, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_103/conf.json b/test/tests/test_103/conf.json new file mode 100644 index 0000000..3eff547 --- /dev/null +++ b/test/tests/test_103/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { 
+ "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_104/conf.json b/test/tests/test_104/conf.json new file mode 100644 index 0000000..d6d00e4 --- /dev/null +++ b/test/tests/test_104/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 25, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_105/conf.json b/test/tests/test_105/conf.json new file mode 100644 index 0000000..0f34422 --- /dev/null +++ b/test/tests/test_105/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 20, + "in_width": 15, + "in_channel": 40, + "out_channel": 40, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "int8", + "out_type": "uint8", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": true, + "has_bias": true, + "has_relu": true +} \ No newline at end of file diff --git a/test/tests/test_106/conf.json b/test/tests/test_106/conf.json new file mode 100644 index 0000000..0b98f3a --- /dev/null +++ b/test/tests/test_106/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 
0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_107/conf.json b/test/tests/test_107/conf.json new file mode 100644 index 0000000..2f8951c --- /dev/null +++ b/test/tests/test_107/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 17, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_108/conf.json b/test/tests/test_108/conf.json new file mode 100644 index 0000000..7842aaa --- /dev/null +++ b/test/tests/test_108/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_109/conf.json b/test/tests/test_109/conf.json new file mode 100644 index 0000000..a6b71c9 --- /dev/null +++ b/test/tests/test_109/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, 
+ "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": true, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_110/conf.json b/test/tests/test_110/conf.json new file mode 100644 index 0000000..622efc4 --- /dev/null +++ b/test/tests/test_110/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_111/conf.json b/test/tests/test_111/conf.json new file mode 100644 index 0000000..d6714c4 --- /dev/null +++ b/test/tests/test_111/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 1, + "width": 1 + }, + "depthwise": false, + "stride": { + "height": 2, + "width": 2 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_112/conf.json b/test/tests/test_112/conf.json new file mode 100644 index 0000000..1991c59 --- /dev/null +++ b/test/tests/test_112/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + 
"out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 1, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_113/conf.json b/test/tests/test_113/conf.json new file mode 100644 index 0000000..1dce097 --- /dev/null +++ b/test/tests/test_113/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 0, + "left": 0, + "right": 1 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_114/conf.json b/test/tests/test_114/conf.json new file mode 100644 index 0000000..c1ce5c3 --- /dev/null +++ b/test/tests/test_114/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, + "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 0, + "bottom": 1, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/test/tests/test_115/conf.json b/test/tests/test_115/conf.json new file mode 100644 index 0000000..19153ba --- /dev/null +++ b/test/tests/test_115/conf.json @@ -0,0 +1,29 @@ +{ + "in_height": 15, 
+ "in_width": 34, + "in_channel": 33, + "out_channel": 33, + "padding": { + "top": 1, + "bottom": 0, + "left": 0, + "right": 0 + }, + "kernel_shape": { + "height": 3, + "width": 3 + }, + "depthwise": false, + "stride": { + "height": 1, + "width": 1 + }, + "in_type": "uint8", + "out_type": "int32", + "weight_type": "int8", + "scale_type": "uint8", + "bias_type": "int32", + "has_norm_quant": false, + "has_bias": false, + "has_relu": false +} \ No newline at end of file diff --git a/util/hwpe.c b/util/hwpe.c index 53c1ace..0430081 100644 --- a/util/hwpe.c +++ b/util/hwpe.c @@ -31,11 +31,11 @@ #define HWPE_TASK_REG_OFFSET 8 inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { - *(dev->base_addr + reg) = value; + dev->base_addr[reg] = value; } inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) { - return *(dev->base_addr + reg); + return dev->base_addr[reg]; } inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { diff --git a/util/pulp_nnx_util.c b/util/pulp_nnx_util.c index 34db512..0107fc1 100644 --- a/util/pulp_nnx_util.c +++ b/util/pulp_nnx_util.c @@ -20,14 +20,16 @@ #include "pulp_nnx_util.h" -inline int divnceil(const int dividend, const int divisor) { - return ((dividend - 1) / divisor) + 1; +inline int nnx_calculate_number_of_tiles(const int dim_size, + const int tile_size) { + return ((dim_size - 1) / tile_size) + 1; } -inline int remainder(const int dividend, const int divisor) { - return ((dividend - 1) % divisor) + 1; +inline int nnx_calculate_last_tile_size(const int dim_size, + const int tile_size) { + return ((dim_size - 1) % tile_size) + 1; } -inline uint32_t concat_half(const uint16_t high, const uint16_t low) { +inline uint32_t nnx_concat_half(const uint16_t high, const uint16_t low) { return ((uint32_t)high << 16) | low; } diff --git a/util/pulp_nnx_util.h b/util/pulp_nnx_util.h index 638e5d9..d167f6d 100644 --- a/util/pulp_nnx_util.h +++ b/util/pulp_nnx_util.h @@ -24,26 +24,28 @@ #include /** - * divnceil 
+ * nnx_calculate_number_of_tiles * - * Does integer division and ceiling of it. + * Calculates the number of iterations to go through a dimension. + * It does it by dividing the dimension by the tile size and doing a ceiling of + * the result. */ -int divnceil(const int dividend, const int divisor); +int nnx_calculate_number_of_tiles(const int dim_size, const int tile_size); /** - * remainder + * nnx_calculate_last_tile_size * - * Calculates the remainder but if the remainder should be 0, - * returns divisor. Used for calculation of the last `remainding` - * iteration of the tile. + * Calculates the size of the last executed tile by calculating the remainder of + * the dim_size and the tile_size. In case the remainder is 0, it returns the + * full tile_size. */ -int remainder(const int dividend, const int divisor); +int nnx_calculate_last_tile_size(const int dim_size, const int tile_size); /** * concat_half * * Concatenate 2 16-bit numbers into a 32-bit number. */ -uint32_t concat_half(const uint16_t high, const uint16_t low); +uint32_t nnx_concat_half(const uint16_t high, const uint16_t low); #endif // __NNX_UTIL_H__