From bf51fdc7eb306d18fef17552692bbf7fbe449a55 Mon Sep 17 00:00:00 2001
From: Francesco Conti
Date: Sat, 27 Jul 2024 15:11:28 +0200
Subject: [PATCH 1/5] add support for < 8 bits in weight_type

not tested end-to-end yet.
---
 test/NeurekaTestConf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/NeurekaTestConf.py b/test/NeurekaTestConf.py
index f878e68..a8f89a7 100644
--- a/test/NeurekaTestConf.py
+++ b/test/NeurekaTestConf.py
@@ -65,7 +65,7 @@ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
     @field_validator("weight_type")
     @classmethod
     def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
-        NeurekaTestConf._check_type("weight_type", v, ["int8"])
+        NeurekaTestConf._check_type("weight_type", v, ["int8", "int7", "int6", "int5", "int4", "int3", "int2"])
         return v
 
     @field_validator("scale_type")

From 140fc2c7b0de8c596e4fa922c7dd69947afda1de Mon Sep 17 00:00:00 2001
From: Francesco Conti
Date: Wed, 21 Aug 2024 11:48:36 +0200
Subject: [PATCH 2/5] print verbose debug info in Hex format

---
 test/NeuralEngineFunctionalModel.py | 39 ++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py
index 32bcc5a..5938554 100644
--- a/test/NeuralEngineFunctionalModel.py
+++ b/test/NeuralEngineFunctionalModel.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn.functional as F
+import numpy as np
 
 from TestClasses import IntegerType, Padding, Stride
 
@@ -9,6 +10,15 @@ class NeuralEngineFunctionalModel:
     ACCUMULATOR_TYPE = IntegerType(name="int32")
 
+    @staticmethod
+    def _tensor_to_hex(tensor):
+        int_tensor = np.asarray(torch.floor(tensor).to(torch.int64))
+        int_tensor[int_tensor < 0] = 0xffffffff + (int_tensor[int_tensor < 0]+1)
+        hex_tensor = np.empty(int_tensor.shape, dtype=object)
+        for idx in np.ndindex(int_tensor.shape):
+            hex_tensor[idx] = hex(int_tensor[idx].item())
+        return hex_tensor
+
     @staticmethod
     def _cast(
         tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
     ) -> torch.Tensor:
@@ -36,7 +46,10 @@ def _norm_quant(
 
         if verbose:
             print("INTERMEDIATE RESULTS (after scale):")
-            print(tensor)
+            current_threshold = np.get_printoptions()['threshold']
+            np.set_printoptions(threshold=np.inf)
+            print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
+            np.set_printoptions(threshold=current_threshold)
 
         if has_bias:
             assert bias is not None
@@ -54,7 +67,10 @@ def _norm_quant(
 
         if verbose:
             print("INTERMEDIATE RESULTS (after bias):")
-            print(tensor)
+            current_threshold = np.get_printoptions()['threshold']
+            np.set_printoptions(threshold=np.inf)
+            print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
+            np.set_printoptions(threshold=current_threshold)
 
         if has_relu:
             tensor = F.relu(tensor)
@@ -63,7 +79,10 @@ def _norm_quant(
 
         if verbose:
             print("INTERMEDIATE RESULTS (after shift):")
-            print(tensor)
+            current_threshold = np.get_printoptions()['threshold']
+            np.set_printoptions(threshold=np.inf)
+            print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
+            np.set_printoptions(threshold=current_threshold)
 
         # Saturate into out_type
         tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
@@ -102,6 +121,15 @@ def convolution(
             0,
         )
 
+        if verbose:
+            print("INPUTS (padded):")
+            current_threshold = np.get_printoptions()['threshold']
+            np.set_printoptions(threshold=np.inf)
+            print(NeuralEngineFunctionalModel._tensor_to_hex(input_padded))
+            print("WEIGHTS (padded):")
+            print(NeuralEngineFunctionalModel._tensor_to_hex(weight))
+            np.set_printoptions(threshold=current_threshold)
+
         # Accumulators are 32bit non-saturating.
         # Calculate in higher precision (int64)
         output = F.conv2d(
@@ -118,7 +146,10 @@ def convolution(
 
         if verbose:
             print("INTERMEDIATE RESULTS (pre-normalization/requant):")
-            print(output)
+            current_threshold = np.get_printoptions()['threshold']
+            np.set_printoptions(threshold=np.inf)
+            print(NeuralEngineFunctionalModel._tensor_to_hex(output))
+            np.set_printoptions(threshold=current_threshold)
 
         if has_norm_quant:
             assert scale is not None

From 973cd49c56f898027b02eaf13663fdff79855c84 Mon Sep 17 00:00:00 2001
From: Francesco Conti
Date: Wed, 21 Aug 2024 11:49:03 +0200
Subject: [PATCH 3/5] Tentative fix for QW<8 bit

This fixes layout + runtime for QW<8 bit. Tested only on pointwise and only
on the special scenario of synthetic weights, for now.
---
 neureka/hal/neureka_task.c  |  2 +-
 test/NeurekaMemoryLayout.py | 11 ++---------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index d31c934..d9209f5 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -169,7 +169,7 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
   if (task->kernel_shape == 1) { // 1x1
     task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1;
     task->data.cfg.weights_stride.d1 =
-        NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 * num_k_in;
+        (NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in;
   } else if (!task->depthwise) { // 3x3
     task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
     task->data.cfg.weights_stride.d1 =
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py
index 61b3ad8..ca51d4a 100644
--- a/test/NeurekaMemoryLayout.py
+++ b/test/NeurekaMemoryLayout.py
@@ -88,15 +88,8 @@ def weightEncode(
         elif height == 1 and width == 1:
             # (cout * cinMajor, Bits * cinSubtile)
             weight = weight.reshape(-1, bits * cinSubtile)
-            # Pad only the last dimension to weight bandwidth size
-            # (-1, Weight Bandwidth)
-            weight = np.pad(
-                weight,
-                ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH_1x1 - weight.shape[-1])),
-                "constant",
-                constant_values=0,
-            )
-            weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH_1x1 / 8))
+            # No padding needed here
+            weightBandwidthBytes = int(np.ceil(bits * cinSubtile / 8))
 
         # Prepare for packing
         # (-1, Weight Bandwidth Bytes, 8)

From 13dd71f5729e2bd7a5fe03a18aa702f56ff9d832 Mon Sep 17 00:00:00 2001
From: Francesco Conti
Date: Wed, 21 Aug 2024 12:00:49 +0200
Subject: [PATCH 4/5] Clean up synthetic weights/inputs generation integration in pulp-nnx

---
 test/NnxTestClasses.py | 47 ++++++++++++++++++++++++++++++------------
 test/testgen.py        | 16 +-------------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py
index 41d5131..8d4eed1 100644
--- a/test/NnxTestClasses.py
+++ b/test/NnxTestClasses.py
@@ -48,6 +48,8 @@ class NnxTestConf(BaseModel):
     has_norm_quant: bool
     has_bias: bool
     has_relu: bool
+    synthetic_weights: bool
+    synthetic_inputs: bool
 
     @model_validator(mode="after")  # type: ignore
     def check_valid_depthwise_channels(self) -> NnxTestConf:
@@ -116,6 +118,8 @@ def __init__(
         scale: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
        global_shift: Optional[torch.Tensor] = torch.Tensor([0]),
+        synthetic_weights: Optional[bool] = False,
+        synthetic_inputs: Optional[bool] = False,
     ) -> None:
         self.conf = conf
         self.input = input
@@ -124,6 +128,8 @@ def __init__(
         self.scale = scale
         self.bias = bias
         self.global_shift = global_shift
+        self.synthetic_weights = synthetic_weights
+        self.synthetic_inputs = synthetic_inputs
 
     def is_valid(self) -> bool:
         return all(
@@ -243,20 +249,30 @@ def from_conf(
             bias_shape = (1, conf.out_channel, 1, 1)
 
         if input is None:
-            input = NnxTestGenerator._random_data(
-                _type=conf.in_type,
-                shape=input_shape,
-            )
+            if conf.synthetic_inputs:
+                input = torch.zeros((1, conf.in_channel, conf.in_height, conf.in_width), dtype=torch.int64)
+                for i in range(conf.in_channel):
+                    input[:, i,0,0] = i
+            else:
+                input = NnxTestGenerator._random_data(
+                    _type=conf.in_type,
+                    shape=input_shape,
+                )
 
         if weight is None:
-            weight_mean = NnxTestGenerator._DEFAULT_WEIGHT_MEAN
-            weight_std = NnxTestGenerator._DEFAULT_WEIGHT_STDEV * (1<<(conf.weight_type._bits-1)-1)
-            weight = NnxTestGenerator._random_data_normal(
-                mean = weight_mean,
-                std = weight_std,
-                _type=conf.weight_type,
-                shape=weight_shape,
-            )
+            if conf.synthetic_weights:
+                weight = torch.zeros((conf.out_channel, 1 if conf.depthwise else conf.in_channel, conf.kernel_shape.height, conf.kernel_shape.width), dtype=torch.int64)
+                for i in range(0, min(weight.shape[0], weight.shape[1])):
+                    weight[i,i,0,0] = 1
+            else:
+                weight_mean = NnxTestGenerator._DEFAULT_WEIGHT_MEAN
+                weight_std = NnxTestGenerator._DEFAULT_WEIGHT_STDEV * (1<<(conf.weight_type._bits-1)-1)
+                weight = NnxTestGenerator._random_data_normal(
+                    mean = weight_mean,
+                    std = weight_std,
+                    _type=conf.weight_type,
+                    shape=weight_shape,
+                )
 
         if conf.has_norm_quant:
             if scale is None:
@@ -306,6 +322,8 @@ def from_conf(
             scale=scale,
             bias=bias,
             global_shift=global_shift,
+            synthetic_inputs=conf.synthetic_inputs,
+            synthetic_weights=conf.synthetic_weights,
         )
 
     @staticmethod
@@ -361,7 +379,10 @@ def generate(self, test_name: str, test: NnxTest):
         weight_type = test.conf.weight_type
         weight_bits = weight_type._bits
         assert weight_bits > 1 and weight_bits <= 8
-        weight_offset = -(2 ** (weight_bits - 1))
+        if test.synthetic_weights:
+            weight_offset = 0
+        else:
+            weight_offset = -(2 ** (weight_bits - 1))
         weight_out_ch, weight_in_ch, weight_ks_h, weight_ks_w = test.weight.shape
         weight_data: np.ndarray = test.weight.numpy() - weight_offset
         weight_init = self.weightEncode(
diff --git a/test/testgen.py b/test/testgen.py
index c128ff4..e5378e4 100644
--- a/test/testgen.py
+++ b/test/testgen.py
@@ -86,21 +86,7 @@ def test_gen(
         exit(-1)
 
     test_conf = nnxTestConfCls.model_validate(test_conf_dict)
-    if test_conf_dict['synthetic_weights']:
-        import torch
-        weight = torch.zeros((test_conf.out_channel, 1 if test_conf.depthwise else test_conf.in_channel, test_conf.kernel_shape.height, test_conf.kernel_shape.width), dtype=torch.int64)
-        for i in range(0, min(weight.shape[0], weight.shape[1])):
-            weight[i,i,0,0] = 1
-    else:
-        weight = None
-    if test_conf_dict['synthetic_inputs']:
-        import torch
-        inputs = torch.zeros((1, test_conf.in_channel, test_conf.in_height, test_conf.in_width), dtype=torch.int64)
-        for i in range(test_conf.in_channel):
-            inputs[:, i,0,0] = i
-    else:
-        inputs = None
-    test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors, weight=weight, input=inputs)
+    test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors)
     if not args.skip_save:
         test.save(args.test_dir)
     if args.headers:

From c20c03cefddc6d996124e42c058fa13a54c42b8d Mon Sep 17 00:00:00 2001
From: Francesco Conti
Date: Wed, 21 Aug 2024 15:57:05 +0200
Subject: [PATCH 5/5] Fix corner cases global shift gen

In some corner cases, the global shift factor was generated as a number < 0
(down to -inf...). This makes no sense, so now the global shift must be 0 at
a minimum.
---
 test/NnxTestClasses.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/NnxTestClasses.py b/test/NnxTestClasses.py
index 8d4eed1..7e0e3a0 100644
--- a/test/NnxTestClasses.py
+++ b/test/NnxTestClasses.py
@@ -213,7 +213,11 @@ def _calculate_global_shift(
         """Calculate global shift so that the output values are in the range of out_type"""
         s = tensor.type(torch.float64).std()
         target_s = 2 ** (out_type._bits - 1)
-        return torch.ceil(torch.log2(s / target_s)).type(torch.int32)
+        shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32)
+        if shift < 1:
+            return torch.zeros((1,)).type(torch.int32)
+        else:
+            return shift
 
     @staticmethod
     def _random_data(_type: IntegerType, shape: Tuple, extremes: Tuple = None):
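
Illustration (not part of the patch series above): a minimal sketch of the clamped global-shift arithmetic that PATCH 5/5 introduces in NnxTestGenerator._calculate_global_shift. The tensors below are invented example data; the point is that when the pre-quantization spread is already small (e.g. with the identity-like synthetic weights), log2(s / target_s) goes negative, or to -inf when the std is 0, so the shift is now clamped at 0 instead of being returned as a negative number.

import torch

def calculate_global_shift(tensor: torch.Tensor, out_bits: int) -> torch.Tensor:
    # Same arithmetic as the patched _calculate_global_shift:
    # shift = ceil(log2(std / 2**(out_bits - 1))), never below 0.
    s = tensor.type(torch.float64).std()
    target_s = 2 ** (out_bits - 1)
    shift = torch.ceil(torch.log2(s / target_s)).type(torch.int32)
    return shift if shift >= 1 else torch.zeros((1,)).type(torch.int32)

# Invented accumulator data, int8 output (target_s = 128):
small_acc = torch.randint(-4, 4, (1, 8, 3, 3)).type(torch.float64)
print(calculate_global_shift(small_acc, 8))        # std << 128 -> negative log2 -> clamped to 0
print(calculate_global_shift(small_acc * 1e4, 8))  # large accumulators -> positive shift, as before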