From 973cd49c56f898027b02eaf13663fdff79855c84 Mon Sep 17 00:00:00 2001
From: Francesco Conti <f.conti@unibo.it>
Date: Wed, 21 Aug 2024 11:49:03 +0200
Subject: [PATCH] Tentative fix for QW < 8 bits

This fixes the weight memory layout and the runtime weight stride for
weight bit-widths (QW) below 8 bits. So far it has been tested only on
pointwise (1x1) convolutions, and only with synthetic weights.
---
 neureka/hal/neureka_task.c  |  2 +-
 test/NeurekaMemoryLayout.py | 11 ++---------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
index d31c934..d9209f5 100644
--- a/neureka/hal/neureka_task.c
+++ b/neureka/hal/neureka_task.c
@@ -169,7 +169,7 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
   if (task->kernel_shape == 1) { // 1x1
     task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1;
     task->data.cfg.weights_stride.d1 =
-        NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 * num_k_in;
+        (NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in;
   } else if (!task->depthwise) { // 3x3
     task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
     task->data.cfg.weights_stride.d1 =
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py
index 61b3ad8..ca51d4a 100644
--- a/test/NeurekaMemoryLayout.py
+++ b/test/NeurekaMemoryLayout.py
@@ -88,15 +88,8 @@ def weightEncode(
         elif height == 1 and width == 1:
             # (cout * cinMajor, Bits * cinSubtile)
             weight = weight.reshape(-1, bits * cinSubtile)
-            # Pad only the last dimension to weight bandwidth size
-            # (-1, Weight Bandwidth)
-            weight = np.pad(
-                weight,
-                ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH_1x1 - weight.shape[-1])),
-                "constant",
-                constant_values=0,
-            )
-            weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH_1x1 / 8))
+            # No padding needed here
+            weightBandwidthBytes = int(np.ceil(bits * cinSubtile / 8))
 
         # Prepare for packing
         # (-1, Weight Bandwidth Bytes, 8)