From 5bee5482e05701fc50cd68fc6826e017517c23d1 Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 10 Dec 2024 11:01:48 +0100
Subject: [PATCH] [software] Move the port-conflict optimized matmul to
 matmul_i32p

---
 software/apps/baremetal/matmul_i32/main.c     |   3 +-
 software/apps/baremetal/matrix_mul/main.c     |  68 +-
 software/apps/baremetal/tests/main.c          |   3 +-
 software/apps/matmul_i32_conflict_opt/main.c  | 208 -----
 .../omp/omp_parallel_for_benchmark/main.c     |  70 +-
 .../kernels/baremetal/mat_mul_conflict_opt.h  | 828 ------------------
 .../kernels/baremetal/mempool_matmul_i32p.h   | 617 +++++++++++++
 7 files changed, 699 insertions(+), 1098 deletions(-)
 delete mode 100644 software/apps/matmul_i32_conflict_opt/main.c
 delete mode 100644 software/kernels/baremetal/mat_mul_conflict_opt.h

diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c
index 3713dcabe..b6744a49c 100644
--- a/software/apps/baremetal/matmul_i32/main.c
+++ b/software/apps/baremetal/matmul_i32/main.c
@@ -13,9 +13,10 @@
 #include "runtime.h"
 #include "synchronization.h"
 
+#include "data_matmul_i32.h"
+
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_i32p.h"
-#include "data_matmul_i32.h"
 
 int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
diff --git a/software/apps/baremetal/matrix_mul/main.c b/software/apps/baremetal/matrix_mul/main.c
index 094817929..a1b6626ac 100644
--- a/software/apps/baremetal/matrix_mul/main.c
+++ b/software/apps/baremetal/matrix_mul/main.c
@@ -7,7 +7,6 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
@@ -15,19 +14,21 @@
 
 // Define Matrix dimensions:
 // C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
-#define N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
-#define P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
 // Specify how the matrices A and B should be initialized
 // The entries will follow this format:
 // a(i,j) = A_a*i + A_b*j + A_c
 // b(i,j) = B_a*i + B_b*j + B_c
 // The result will be the following matrix
-// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * N
-//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * (N*(N-1))/2
-//        + (A_b*B_a) * (N*(N-1)*(2*N-1))/6
-// Note: To keep the code simpler, we use indices that go from 0 to N-1 instead
-// of 1 to N as the mathematicians do. Hence, for A, i=[0,M-1] j=[0,M-1]
+// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * matrix_N
+//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) *
+//        (matrix_N*(matrix_N-1))/2
+//        + (A_b*B_a) * (matrix_N*(matrix_N-1)*(2*matrix_N-1))/6
+// Note: To keep the code simpler, we use indices that go from 0 to matrix_N-1
+// instead of 1 to matrix_N as the mathematicians do. Hence, for A,
+// i=[0,matrix_M-1] j=[0,matrix_N-1]
 #define A_a 1
 #define A_b 1
 #define A_c -32
@@ -37,10 +38,11 @@
 // Enable verbose printing
 // #define VERBOSE
 
+#include "baremetal/mempool_matmul_i32p.h"
 int32_t volatile init __attribute__((section(".l2"))) = 0;
-int32_t a[M * N] __attribute__((section(".l1")));
-int32_t b[N * P] __attribute__((section(".l1")));
-int32_t c[M * P] __attribute__((section(".l1")));
+int32_t a[matrix_M * matrix_N] __attribute__((section(".l1")));
+int32_t b[matrix_N * matrix_P] __attribute__((section(".l1")));
+int32_t c[matrix_M * matrix_P] __attribute__((section(".l1")));
 
 // Initialize the matrices in parallel
 void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
@@ -61,10 +63,13 @@ int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
   // Parallelize over rows
   for (int32_t i = 0; i < (int32_t)num_rows; ++i) {
     for (int32_t j = 0; j < (int32_t)num_columns; ++j) {
-      int32_t lin = (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * N;
-      int32_t qua =
-          ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * (N * (N - 1))) / 2;
-      int32_t cub = ((ab * ba) * (N * (N - 1) * (2 * N - 1))) / 6;
+      int32_t lin =
+          (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * matrix_N;
+      int32_t qua = ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) *
+                     (matrix_N * (matrix_N - 1))) /
+                    2;
+      int32_t cub =
+          ((ab * ba) * (matrix_N * (matrix_N - 1) * (2 * matrix_N - 1))) / 6;
       int32_t golden = lin + qua + cub;
       if (matrix[i * (int32_t)num_columns + j] != golden) {
         return (i + j) == 0 ? -1 : i * (int32_t)num_columns + j;
@@ -100,14 +105,14 @@ int main() {
   // #endif
 
   // Initialize Matrices
-  init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores);
+  init_matrix(a, matrix_M, matrix_N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(b, matrix_N, matrix_P, B_a, B_b, B_c, core_id, num_cores);
 
 #ifdef VERBOSE
   mempool_barrier(num_cores);
   if (core_id == 0) {
-    print_matrix(a, M, N);
-    print_matrix(b, N, P);
+    print_matrix(a, matrix_M, matrix_N);
+    print_matrix(b, matrix_N, matrix_P);
   }
 #endif
 
@@ -121,20 +126,24 @@ int main() {
     mempool_start_benchmark();
     switch (i) {
     case 0:
-      mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                       num_cores);
       break;
     case 1:
-      mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_unrolled_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                                num_cores);
       break;
     case 2:
-      mat_mul_asm_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_asm_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                           num_cores);
       break;
     case 3:
-      mat_mul_parallel_finegrained(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_parallel_finegrained(a, b, c, matrix_M, matrix_N, matrix_P,
+                                   core_id, num_cores);
       break;
     case 4:
-      mat_mul_unrolled_parallel_finegrained(a, b, c, M, N, P, core_id,
-                                            num_cores);
+      mat_mul_unrolled_parallel_finegrained(a, b, c, matrix_M, matrix_N,
+                                            matrix_P, core_id, num_cores);
       break;
     }
     mempool_stop_benchmark();
@@ -144,7 +153,8 @@ int main() {
     // Check result
     if (core_id == 0) {
       // printf("Duration: %d\n", cycles);
-      int error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+      int error =
+          verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
       if (error != 0) {
         printf("Error code %d\n", error);
         printf("c[%d]=%d\n", error, c[error]);
@@ -154,7 +164,7 @@ int main() {
 #endif
     } else {
       // Wait for the approx amount it takes core 0 to verify the result
-      mempool_wait(M * P * 12);
+      mempool_wait(matrix_M * matrix_P * 12);
     }
   }
 
@@ -163,7 +173,7 @@ int main() {
 
 #ifdef VERBOSE
   if (core_id == 0) {
-    print_matrix(c, M, P);
+    print_matrix(c, matrix_M, matrix_P);
   }
   mempool_barrier(num_cores);
 #endif
diff --git a/software/apps/baremetal/tests/main.c b/software/apps/baremetal/tests/main.c
index d835d28fb..622b35e34 100644
--- a/software/apps/baremetal/tests/main.c
+++ b/software/apps/baremetal/tests/main.c
@@ -7,7 +7,6 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
@@ -25,6 +24,8 @@
 #define matrix_P (NUM_CORES)
 #endif
 
+#include "baremetal/mempool_matmul_i32p.h"
+
 int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
 int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
diff --git a/software/apps/matmul_i32_conflict_opt/main.c b/software/apps/matmul_i32_conflict_opt/main.c
deleted file mode 100644
index c3d61b64b..000000000
--- a/software/apps/matmul_i32_conflict_opt/main.c
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Yichao Zhang,  ETH Zurich
-// Author: Samuel Riedel, ETH Zurich
-
-#include <stdint.h>
-#include <string.h>
-
-#include "encoding.h"
-#include "printf.h"
-#include "runtime.h"
-#include "synchronization.h"
-
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 128
-#define matrix_N 64
-#define matrix_P 128
-
-// Define Benchmark Flag
-#define SERIAL_MODE (0)
-#define PARALLEL_MODE (1)
-#define CONCURRENT_MODE (0)
-#define NUM_PARALLEL_CORES (1024)
-
-// Define kernel include
-#include "kernel/mat_mul.h"
-#include "kernel/mat_mul_conflict_opt.h"
-
-// Define memory distributing
-int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1")));
-int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1")));
-#if (CONCURRENT_MODE == 1)
-int32_t matrix_d[matrix_M * matrix_N] __attribute__((section(".l1")));
-int32_t matrix_e[matrix_N * matrix_P] __attribute__((section(".l1")));
-int32_t matrix_f[matrix_M * matrix_P] __attribute__((section(".l1")));
-#endif
-int volatile error __attribute__((section(".l2")));
-
-// Function init_matrix
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  // How many rows/columns to split the matrix into
-  uint32_t const split = 4;
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
-
-// Function verify_matrix
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac,
-                  int32_t ba, int32_t bb, int32_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin =
-          (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n;
-      int32_t qua =
-          ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) /
-          2;
-      int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
-  }
-  return 0;
-}
-
-// Function test_matrix_multiplication
-int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int32_t const A_a = 1;
-  int32_t const A_b = 2;
-  int32_t const A_c = -32;
-  int32_t const B_a = 1;
-  int32_t const B_b = 1;
-  int32_t const B_c = 16;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-#if (CONCURRENT_MODE == 1)
-  init_matrix(D, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(E, N, P, B_a, B_b, B_c, core_id, num_cores);
-#endif
-  mempool_barrier(num_cores);
-
-// Serial Benchmark
-#if (SERIAL_MODE == 1)
-  if (core_id == 0) {
-    printf("Serial Calculation Start\n");
-    mempool_start_benchmark();
-    mat_mul_unrolled_4x4_serial(A, B, C, M, N, P);
-    mempool_stop_benchmark();
-    printf("Calculation Finish\n");
-  }
-#endif
-
-// Parallel Benchmark
-#if (PARALLEL_MODE == 1)
-  if (core_id == 0) {
-    printf("Parallel Calculation Start\n");
-  }
-  mempool_barrier(num_cores);
-
-  if (core_id < NUM_PARALLEL_CORES) {
-    mempool_start_benchmark();
-    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id,
-                                                   NUM_PARALLEL_CORES);
-    mempool_start_benchmark();
-    mempool_log_partial_barrier(2, core_id, NUM_PARALLEL_CORES);
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-#endif
-
-// Concurrent Benchmark
-#if (CONCURRENT_MODE == 1)
-  if (core_id == 0) {
-    printf("Concurrent Calculation Start\n");
-  }
-  mempool_barrier(num_cores);
-
-  if (core_id < 512) {
-    mempool_start_benchmark();
-    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id,
-                                                   512);
-    mempool_start_benchmark();
-    mempool_log_partial_barrier(2, core_id, 512);
-    mempool_stop_benchmark();
-  }
-  if (core_id >= 512) {
-    uint32_t core_id_new = core_id - 512;
-    mempool_start_benchmark();
-    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(D, E, F, M, N, P,
-                                                   core_id_new, 512);
-    mempool_start_benchmark();
-    mempool_log_partial_barrier(2, core_id, 512);
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-#endif
-
-  // Verify results
-  if (core_id == 0) {
-    printf("Start Verify Results\n");
-  }
-  mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
-
-// Main function block
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
-  mempool_barrier(num_cores);
-
-  return error;
-}
diff --git a/software/apps/omp/omp_parallel_for_benchmark/main.c b/software/apps/omp/omp_parallel_for_benchmark/main.c
index bc4de3d07..fe3f02b9f 100644
--- a/software/apps/omp/omp_parallel_for_benchmark/main.c
+++ b/software/apps/omp/omp_parallel_for_benchmark/main.c
@@ -5,8 +5,6 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
-#include "baremetal/mempool_matmul_i32s.h"
 #include "encoding.h"
 #include "libgomp.h"
 #include "omp/mempool_matmul_i32.h"
@@ -16,19 +14,25 @@
 
 // Define Matrix dimensions:
 // C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define M 32
-#define N 32
-#define P 32
+#define matrix_M 32
+#define matrix_N 32
+#define matrix_P 32
+
+#include "baremetal/mempool_matmul_i32p.h"
+#include "baremetal/mempool_matmul_i32s.h"
+
 // Specify how the matrices A and B should be initialized
 // The entries will follow this format:
 // a(i,j) = A_a*i + A_b*j + A_c
 // b(i,j) = B_a*i + B_b*j + B_c
 // The result will be the following matrix
-// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * N
-//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * (N*(N-1))/2
-//        + (A_b*B_a) * (N*(N-1)*(2*N-1))/6
-// Note: To keep the code simpler, we use indices that go from 0 to N-1 instead
-// of 1 to N as the mathematicians do. Hence, for A, i=[0,M-1] j=[0,M-1]
+// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * matrix_N
+//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) *
+//        (matrix_N*(matrix_N-1))/2
+//        + (A_b*B_a) * (matrix_N*(matrix_N-1)*(2*matrix_N-1))/6
+// Note: To keep the code simpler, we use indices that go from 0 to matrix_N-1
+// instead of 1 to matrix_N as the mathematicians do. Hence, for A,
+// i=[0,matrix_M-1] j=[0,matrix_N-1]
 #define A_a 1
 #define A_b 1
 #define A_c -32
@@ -37,9 +41,9 @@
 #define B_c 16
 
 int32_t volatile init __attribute__((section(".l2"))) = 0;
-int32_t a[M * N] __attribute__((section(".l1")));
-int32_t b[N * P] __attribute__((section(".l1")));
-int32_t c[M * P] __attribute__((section(".l1")));
+int32_t a[matrix_M * matrix_N] __attribute__((section(".l1")));
+int32_t b[matrix_N * matrix_P] __attribute__((section(".l1")));
+int32_t c[matrix_M * matrix_P] __attribute__((section(".l1")));
 
 // Initialize the matrices in parallel
 void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
@@ -60,10 +64,13 @@ int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
   // Parallelize over rows
   for (int32_t i = 0; i < (int32_t)num_rows; ++i) {
     for (int32_t j = 0; j < (int32_t)num_columns; ++j) {
-      int32_t lin = (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * N;
-      int32_t qua =
-          ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * (N * (N - 1))) / 2;
-      int32_t cub = ((ab * ba) * (N * (N - 1) * (2 * N - 1))) / 6;
+      int32_t lin =
+          (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * matrix_N;
+      int32_t qua = ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) *
+                     (matrix_N * (matrix_N - 1))) /
+                    2;
+      int32_t cub =
+          ((ab * ba) * (matrix_N * (matrix_N - 1) * (2 * matrix_N - 1))) / 6;
       int32_t golden = lin + qua + cub;
       if (matrix[i * (int32_t)num_columns + j] != golden) {
         return (i + j) == 0 ? -1 : i * (int32_t)num_columns + j;
@@ -95,14 +102,14 @@ int main() {
   mempool_barrier_init(core_id);
 
   // Initialize Matrices
-  init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores);
+  init_matrix(a, matrix_M, matrix_N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(b, matrix_N, matrix_P, B_a, B_b, B_c, core_id, num_cores);
 
   mempool_barrier(num_cores);
 
   cycles = mempool_get_timer();
   mempool_start_benchmark();
-  mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores);
+  mat_mul_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id, num_cores);
   mempool_stop_benchmark();
   cycles = mempool_get_timer() - cycles;
   mempool_barrier(num_cores);
@@ -110,19 +117,20 @@ int main() {
   // Check result
   if (core_id == 0) {
     printf("Manual Parallel Duration: %d\n", cycles);
-    error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+    error = verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
     if (error != 0) {
       printf("Error code %d\n", error);
       printf("c[%d]=%d\n", error, c[error]);
     }
   } else {
-    mempool_wait(M * P * 12);
+    mempool_wait(matrix_M * matrix_P * 12);
   }
   mempool_barrier(num_cores);
 
   cycles = mempool_get_timer();
   mempool_start_benchmark();
-  mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores);
+  mat_mul_unrolled_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                            num_cores);
   mempool_stop_benchmark();
   cycles = mempool_get_timer() - cycles;
   mempool_barrier(num_cores);
@@ -130,13 +138,13 @@ int main() {
   // Check result
   if (core_id == 0) {
     printf("Manual unrolled Parallel Duration: %d\n", cycles);
-    error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+    error = verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
     if (error != 0) {
       printf("Error code %d\n", error);
       printf("c[%d]=%d\n", error, c[error]);
     }
   } else {
-    mempool_wait(M * P * 12);
+    mempool_wait(matrix_M * matrix_P * 12);
   }
   mempool_barrier(num_cores);
 
@@ -146,11 +154,11 @@ int main() {
 
     cycles = mempool_get_timer();
     mempool_start_benchmark();
-    mat_mul_sequential(a, b, c, M, N, P);
+    mat_mul_sequential(a, b, c, matrix_M, matrix_N, matrix_P);
     mempool_stop_benchmark();
     cycles = mempool_get_timer() - cycles;
     printf("Sequqntial Duration: %d\n", cycles);
-    error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+    error = verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
     if (error != 0) {
       printf("Error code %d\n", error);
       printf("c[%d]=%d\n", error, c[error]);
@@ -160,11 +168,11 @@ int main() {
 
     cycles = mempool_get_timer();
     mempool_start_benchmark();
-    mat_mul_parallel_omp(a, b, c, M, N, P);
+    mat_mul_parallel_omp(a, b, c, matrix_M, matrix_N, matrix_P);
     mempool_stop_benchmark();
     cycles = mempool_get_timer() - cycles;
     printf("OpenMP Parallel Duration: %d\n", cycles);
-    error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+    error = verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
     if (error != 0) {
       printf("Error code %d\n", error);
       printf("c[%d]=%d\n", error, c[error]);
@@ -172,11 +180,11 @@ int main() {
 
     cycles = mempool_get_timer();
     mempool_start_benchmark();
-    mat_mul_unrolled_parallel_omp(a, b, c, M, N, P);
+    mat_mul_unrolled_parallel_omp(a, b, c, matrix_M, matrix_N, matrix_P);
     mempool_stop_benchmark();
     cycles = mempool_get_timer() - cycles;
     printf("OpenMP Unrolled Parallel Duration: %d\n", cycles);
-    error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+    error = verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
     if (error != 0) {
       printf("Error code %d\n", error);
       printf("c[%d]=%d\n", error, c[error]);
diff --git a/software/kernels/baremetal/mat_mul_conflict_opt.h b/software/kernels/baremetal/mat_mul_conflict_opt.h
deleted file mode 100644
index 108f074a8..000000000
--- a/software/kernels/baremetal/mat_mul_conflict_opt.h
+++ /dev/null
@@ -1,828 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Yichao Zhang, ETH Zurich
-// Author: Samuel Riedel, ETH Zurich
-
-/* This library implements the matrix multiplication in multiple different ways.
- * The functions all follow the following format:
- *
- * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
- * C = AB
- */
-
-/* For parallel computation, general kernels support the power of 2 of matrix
- * dimension; The max size for M and P should be M=P=<4096; The min size, it
- * need to make sure "c_end - c_start >=4"; For mempool, the min matrix size is
- * M=P=64; For terapool, the min matrix size is M=P=128;
- */
-
-void mat_mul_unrolled_4x2_serial(int32_t const *__restrict__ A,
-                                 int32_t const *__restrict__ B,
-                                 int32_t *__restrict__ C, uint32_t M,
-                                 uint32_t N, uint32_t P) {
-  // Parallelize by assigning each core one row
-  for (uint32_t i = 0; i < M; i += 2) {
-    for (uint32_t j = 0; j < P; j += 4) {
-      int32_t c00 = 0;
-      int32_t c01 = 0;
-      int32_t c02 = 0;
-      int32_t c03 = 0;
-      int32_t c10 = 0;
-      int32_t c11 = 0;
-      int32_t c12 = 0;
-      int32_t c13 = 0;
-      for (uint32_t k = 0; k < N; k += 2) {
-        // Explicitly load the values first to help with scheduling
-        int32_t val_a00 = A[(i + 0) * N + k + 0];
-        int32_t val_a01 = A[(i + 0) * N + k + 1];
-        int32_t val_a10 = A[(i + 1) * N + k + 0];
-        int32_t val_a11 = A[(i + 1) * N + k + 1];
-
-        int32_t val_b00 = B[(k + 0) * P + j + 0];
-        int32_t val_b01 = B[(k + 0) * P + j + 1];
-        int32_t val_b02 = B[(k + 0) * P + j + 2];
-        int32_t val_b03 = B[(k + 0) * P + j + 3];
-
-        int32_t val_b10 = B[(k + 1) * P + j + 0];
-        int32_t val_b11 = B[(k + 1) * P + j + 1];
-        int32_t val_b12 = B[(k + 1) * P + j + 2];
-        int32_t val_b13 = B[(k + 1) * P + j + 3];
-
-        c00 += val_a00 * val_b00;
-        c00 += val_a01 * val_b10;
-        c01 += val_a00 * val_b01;
-        c01 += val_a01 * val_b11;
-        c02 += val_a00 * val_b02;
-        c02 += val_a01 * val_b12;
-        c03 += val_a00 * val_b03;
-        c03 += val_a01 * val_b13;
-
-        c10 += val_a10 * val_b00;
-        c10 += val_a11 * val_b10;
-        c11 += val_a10 * val_b01;
-        c11 += val_a11 * val_b11;
-        c12 += val_a10 * val_b02;
-        c12 += val_a11 * val_b12;
-        c13 += val_a10 * val_b03;
-        c13 += val_a11 * val_b13;
-      }
-      C[(i + 0) * P + j + 0] = c00;
-      C[(i + 0) * P + j + 1] = c01;
-      C[(i + 0) * P + j + 2] = c02;
-      C[(i + 0) * P + j + 3] = c03;
-      C[(i + 1) * P + j + 0] = c10;
-      C[(i + 1) * P + j + 1] = c11;
-      C[(i + 1) * P + j + 2] = c12;
-      C[(i + 1) * P + j + 3] = c13;
-    }
-  }
-}
-
-void mat_mul_unrolled_4x2_parallel(int32_t const *__restrict__ A,
-                                   int32_t const *__restrict__ B,
-                                   int32_t *__restrict__ C, uint32_t M,
-                                   uint32_t N, uint32_t P, uint32_t id,
-                                   uint32_t numThreads) {
-  // Parallelize by assigning each core one row
-  uint32_t const c =
-      numThreads / (M / 2); // How many columns to split the matrix into, best
-                            // should be numThreads/(M/2);
-  uint32_t const c_start = (P / c) * (id % c);
-  uint32_t const c_end = (P / c) * ((id % c) + 1);
-  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
-    for (uint32_t j = c_start; j < c_end; j += 4) {
-      int32_t c00 = 0;
-      int32_t c01 = 0;
-      int32_t c02 = 0;
-      int32_t c03 = 0;
-      int32_t c10 = 0;
-      int32_t c11 = 0;
-      int32_t c12 = 0;
-      int32_t c13 = 0;
-      for (uint32_t k = 0; k < N; k += 2) {
-        // Explicitly load the values first to help with scheduling
-        int32_t val_a00 = A[(i + 0) * N + k + 0];
-        int32_t val_a01 = A[(i + 0) * N + k + 1];
-        int32_t val_a10 = A[(i + 1) * N + k + 0];
-        int32_t val_a11 = A[(i + 1) * N + k + 1];
-
-        int32_t val_b00 = B[(k + 0) * P + j + 0];
-        int32_t val_b01 = B[(k + 0) * P + j + 1];
-        int32_t val_b02 = B[(k + 0) * P + j + 2];
-        int32_t val_b03 = B[(k + 0) * P + j + 3];
-
-        int32_t val_b10 = B[(k + 1) * P + j + 0];
-        int32_t val_b11 = B[(k + 1) * P + j + 1];
-        int32_t val_b12 = B[(k + 1) * P + j + 2];
-        int32_t val_b13 = B[(k + 1) * P + j + 3];
-
-        c00 += val_a00 * val_b00;
-        c00 += val_a01 * val_b10;
-        c01 += val_a00 * val_b01;
-        c01 += val_a01 * val_b11;
-        c02 += val_a00 * val_b02;
-        c02 += val_a01 * val_b12;
-        c03 += val_a00 * val_b03;
-        c03 += val_a01 * val_b13;
-
-        c10 += val_a10 * val_b00;
-        c10 += val_a11 * val_b10;
-        c11 += val_a10 * val_b01;
-        c11 += val_a11 * val_b11;
-        c12 += val_a10 * val_b02;
-        c12 += val_a11 * val_b12;
-        c13 += val_a10 * val_b03;
-        c13 += val_a11 * val_b13;
-      }
-      C[(i + 0) * P + j + 0] = c00;
-      C[(i + 0) * P + j + 1] = c01;
-      C[(i + 0) * P + j + 2] = c02;
-      C[(i + 0) * P + j + 3] = c03;
-      C[(i + 1) * P + j + 0] = c10;
-      C[(i + 1) * P + j + 1] = c11;
-      C[(i + 1) * P + j + 2] = c12;
-      C[(i + 1) * P + j + 3] = c13;
-    }
-  }
-}
-
-void mat_mul_unrolled_4x4_serial(int32_t const *__restrict__ A,
-                                 int32_t const *__restrict__ B,
-                                 int32_t *__restrict__ C, uint32_t M,
-                                 uint32_t N, uint32_t P) {
-  // Parallelize by assigning each core one row
-  for (uint32_t i = 0; i < M; i += 4) {
-    for (uint32_t j = 0; j < P; j += 4) {
-      // Initialize 4x4 output tile
-      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
-      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
-      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
-      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
-      for (uint32_t k = 0; k < N; k += 1) {
-        // Explicitly load the values first to help with scheduling
-        int32_t b0 = B[k * P + j + 0];
-        int32_t b1 = B[k * P + j + 1];
-        int32_t b2 = B[k * P + j + 2];
-        int32_t b3 = B[k * P + j + 3];
-        // A could be local with scrambling
-        int32_t a0 = A[(i + 0) * N + k];
-        int32_t a1 = A[(i + 1) * N + k];
-        int32_t a2 = A[(i + 2) * N + k];
-        int32_t a3 = A[(i + 3) * N + k];
-        // Compute
-        c00 += a0 * b0;
-        c01 += a0 * b1;
-        c02 += a0 * b2;
-        c03 += a0 * b3;
-        c10 += a1 * b0;
-        c11 += a1 * b1;
-        c12 += a1 * b2;
-        c13 += a1 * b3;
-        c20 += a2 * b0;
-        c21 += a2 * b1;
-        c22 += a2 * b2;
-        c23 += a2 * b3;
-        c30 += a3 * b0;
-        c31 += a3 * b1;
-        c32 += a3 * b2;
-        c33 += a3 * b3;
-      }
-      // Store
-      C[(i + 0) * P + j + 0] = c00;
-      C[(i + 0) * P + j + 1] = c01;
-      C[(i + 0) * P + j + 2] = c02;
-      C[(i + 0) * P + j + 3] = c03;
-      C[(i + 1) * P + j + 0] = c10;
-      C[(i + 1) * P + j + 1] = c11;
-      C[(i + 1) * P + j + 2] = c12;
-      C[(i + 1) * P + j + 3] = c13;
-      C[(i + 2) * P + j + 0] = c20;
-      C[(i + 2) * P + j + 1] = c21;
-      C[(i + 2) * P + j + 2] = c22;
-      C[(i + 2) * P + j + 3] = c23;
-      C[(i + 3) * P + j + 0] = c30;
-      C[(i + 3) * P + j + 1] = c31;
-      C[(i + 3) * P + j + 2] = c32;
-      C[(i + 3) * P + j + 3] = c33;
-    }
-  }
-}
-
-void mat_mul_unrolled_4x4_parallel(int32_t const *__restrict__ A,
-                                   int32_t const *__restrict__ B,
-                                   int32_t *__restrict__ C, uint32_t M,
-                                   uint32_t N, uint32_t P, uint32_t id,
-                                   uint32_t numThreads) {
-  // Parallelize by assigning each core one row
-  uint32_t const c =
-      numThreads / (M / 4); // How many columns to split the matrix into
-  uint32_t const c_start = (P / c) * (id % c);
-  uint32_t const c_end = (P / c) * ((id % c) + 1);
-  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
-    for (uint32_t j = c_start; j < c_end; j += 4) {
-      // Initialize 4x4 output tile
-      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
-      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
-      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
-      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
-      for (uint32_t k = 0; k < N; k += 1) {
-        // Explicitly load the values first to help with scheduling
-        int32_t b0 = B[k * P + j + 0];
-        int32_t b1 = B[k * P + j + 1];
-        int32_t b2 = B[k * P + j + 2];
-        int32_t b3 = B[k * P + j + 3];
-        // A could be local with scrambling
-        int32_t a0 = A[(i + 0) * N + k];
-        int32_t a1 = A[(i + 1) * N + k];
-        int32_t a2 = A[(i + 2) * N + k];
-        int32_t a3 = A[(i + 3) * N + k];
-        // Compute
-        c00 += a0 * b0;
-        c01 += a0 * b1;
-        c02 += a0 * b2;
-        c03 += a0 * b3;
-        c10 += a1 * b0;
-        c11 += a1 * b1;
-        c12 += a1 * b2;
-        c13 += a1 * b3;
-        c20 += a2 * b0;
-        c21 += a2 * b1;
-        c22 += a2 * b2;
-        c23 += a2 * b3;
-        c30 += a3 * b0;
-        c31 += a3 * b1;
-        c32 += a3 * b2;
-        c33 += a3 * b3;
-      }
-      // Store
-      C[(i + 0) * P + j + 0] = c00;
-      C[(i + 0) * P + j + 1] = c01;
-      C[(i + 0) * P + j + 2] = c02;
-      C[(i + 0) * P + j + 3] = c03;
-      C[(i + 1) * P + j + 0] = c10;
-      C[(i + 1) * P + j + 1] = c11;
-      C[(i + 1) * P + j + 2] = c12;
-      C[(i + 1) * P + j + 3] = c13;
-      C[(i + 2) * P + j + 0] = c20;
-      C[(i + 2) * P + j + 1] = c21;
-      C[(i + 2) * P + j + 2] = c22;
-      C[(i + 2) * P + j + 3] = c23;
-      C[(i + 3) * P + j + 0] = c30;
-      C[(i + 3) * P + j + 1] = c31;
-      C[(i + 3) * P + j + 2] = c32;
-      C[(i + 3) * P + j + 3] = c33;
-    }
-  }
-}
-
-void mat_mul_unrolled_4x4_conflict_opt_parallel(int32_t const *__restrict__ A,
-                                                int32_t const *__restrict__ B,
-                                                int32_t *__restrict__ C,
-                                                uint32_t M, uint32_t N,
-                                                uint32_t P, uint32_t id,
-                                                uint32_t numThreads) {
-
-  /////////////////////////////
-  //      Configuration      //
-  /////////////////////////////
-  // Parallelize by assigning each core one row
-  // How many cores per window
-  uint32_t c = numThreads / (M / 4);
-  if (numThreads * 4 < M) {
-    c = 1;
-  }
-  uint32_t const c_start = (P / c) * (id % c);
-  uint32_t const c_end = (P / c) * ((id % c) + 1);
-
-  // For avoiding group conflict by same tile
-  // Each cores in the same tile should access to different groups
-  uint32_t group_bank_nums = 512;              // MemPool = 256
-  uint32_t tile_core_nums = 8;                 // MemPool = 4
-  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
-  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
-  // Window size limit, min jump lines is 4 for MatrixA
-  if (jump_lines_A < 4) {
-    jump_lines_A = 4;
-  }
-
-  /////////////////////////////
-  //      LOOP   OFFSET      //
-  /////////////////////////////
-  // Outer Loop Control, for group access port conflict
-  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
-  // Inner Loop Incremental Control, for group access port conflict
-  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
-  // Inner Loop Control
-  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
-  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
-  // Middle Loop Control, window jump for avoiding bank conflict
-  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
-  uint32_t j_offset = (2 * (id / c)) / conflict_row;
-
-  /////////////////////////////
-  //      LOOP  CONTROL      //
-  /////////////////////////////
-  // Inner Round-Robin
-  if (k_offset >= N) {
-    k_offset = k_offset - N * (k_offset / N);
-  }
-  // Middle Round-Robin
-  uint32_t window_in_P = (P / c) / 4;
-  if (j_offset >= window_in_P) {
-    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
-  }
-  // Outer Loop Control
-  uint32_t outer_loop_counter = 0;
-  uint32_t outer_loop_time = M / (4 * numThreads);
-  if (outer_loop_time < 1) {
-    outer_loop_time = 1;
-  }
-  uint32_t M_partition = M / outer_loop_time;
-
-  /////////////////////////////
-  //      *LOOP  START*      //
-  /////////////////////////////
-  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
-       i_ori += 4 * (numThreads / c)) {
-    outer_loop_counter += 1;
-    uint32_t i = i_ori + i_offset;
-    // Round-Robin control, if offset lines > M, back to the first window
-    if (i >= M_partition * outer_loop_counter) {
-      i = i - M_partition * (i / (M_partition * outer_loop_counter));
-    }
-    // Backup counter for mid-loop
-    uint32_t j_offset_counter = c_start + j_offset * 4;
-    uint32_t P_counter = c_end;
-
-  Mid_loop:
-    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
-      // Initialize 4x4 output tile
-      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
-      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
-      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
-      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
-
-      // Backup the variables for restore and later use
-      uint32_t k_offset_counter = k_offset;
-      uint32_t N_counter = N;
-
-    Inner_Loop:
-      for (uint32_t k = k_offset_counter; k < N_counter; k += 1) {
-        // Explicitly load the values first to help with scheduling
-        int32_t b0 = B[k * P + j + 0];
-        int32_t b1 = B[k * P + j + 1];
-        int32_t b2 = B[k * P + j + 2];
-        int32_t b3 = B[k * P + j + 3];
-        // A could be local with scrambling
-        int32_t a0 = A[(i + 0) * N + k];
-        int32_t a1 = A[(i + 1) * N + k];
-        int32_t a2 = A[(i + 2) * N + k];
-        int32_t a3 = A[(i + 3) * N + k];
-        // Compute
-        c00 += a0 * b0;
-        c01 += a0 * b1;
-        c02 += a0 * b2;
-        c03 += a0 * b3;
-        c10 += a1 * b0;
-        c11 += a1 * b1;
-        c12 += a1 * b2;
-        c13 += a1 * b3;
-        c20 += a2 * b0;
-        c21 += a2 * b1;
-        c22 += a2 * b2;
-        c23 += a2 * b3;
-        c30 += a3 * b0;
-        c31 += a3 * b1;
-        c32 += a3 * b2;
-        c33 += a3 * b3;
-      }
-
-      // Pseudo-jump code to avoid complie inner-loop twice
-      // Complie twice will have scheduling issue due to register file limit.
-      if (k_offset_counter > 0) {
-        N_counter = k_offset;
-        k_offset_counter = 0;
-        goto Inner_Loop;
-      }
-
-      // Store
-      C[(i + 0) * P + j + 0] = c00;
-      C[(i + 0) * P + j + 1] = c01;
-      C[(i + 0) * P + j + 2] = c02;
-      C[(i + 0) * P + j + 3] = c03;
-      C[(i + 1) * P + j + 0] = c10;
-      C[(i + 1) * P + j + 1] = c11;
-      C[(i + 1) * P + j + 2] = c12;
-      C[(i + 1) * P + j + 3] = c13;
-      C[(i + 2) * P + j + 0] = c20;
-      C[(i + 2) * P + j + 1] = c21;
-      C[(i + 2) * P + j + 2] = c22;
-      C[(i + 2) * P + j + 3] = c23;
-      C[(i + 3) * P + j + 0] = c30;
-      C[(i + 3) * P + j + 1] = c31;
-      C[(i + 3) * P + j + 2] = c32;
-      C[(i + 3) * P + j + 3] = c33;
-    }
-
-    if (j_offset_counter != c_start) {
-      P_counter = j_offset_counter;
-      j_offset_counter = c_start;
-      goto Mid_loop;
-    }
-  }
-}
-
-/*******************************/
-/* ASM CODE KERNEL START BELOW */
-/*******************************/
-
-// Define immediate values that used in asm code.
-#define N3 (matrix_M - 3) * 4
-#define N31 (-3 * matrix_N + 1) * 4
-#define P3 (matrix_P - 3) * 4
-#define P31 (-3 * matrix_N + 1) * 4
-
-void mat_mul_unrolled_4x4_parallel_asm(int32_t const *__restrict__ A,
-                                       int32_t const *__restrict__ B,
-                                       int32_t *__restrict__ C, uint32_t M,
-                                       uint32_t N, uint32_t P, uint32_t id,
-                                       uint32_t numThreads) {
-  // Parallelize by assigning each tile one row
-  uint32_t c = numThreads / (M / 4);
-  if (numThreads * 4 < M) {
-    c = 1;
-  }
-  // numThreads / (M / 4); // How many columns to split the matrix into
-  uint32_t const c_start = (P / c) * (id % c);
-  uint32_t const c_end = (P / c) * ((id % c) + 1);
-  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
-    for (uint32_t j = c_start; j < c_end; j += 4) {
-      // Address registers
-      int32_t const *addr_a = &A[i * N];
-      int32_t const *addr_b = &B[j];
-      int32_t const *end_b = &B[N * P + j];
-      int32_t const *addr_c = &C[i * P + j];
-      int32_t const N3_1_r = (-3 * (int32_t)N + 1) * 4;
-      int32_t const P_3_r = ((int32_t)P - 3) * 4;
-
-      register int32_t k asm("x1") = (int32_t)end_b;
-      //      x12 x13 x14 x15
-      //
-      // x3   x16 x17 x18 x19
-      // x4   x20 x21 x22 x23
-      // x10  x24 x25 x26 x27
-      // x11  x28 x29 x30 x31
-      //
-      //
-      __asm__ volatile(
-          ".balign 16 \n\t"
-          // Outer loop: Initialize and preload. Execute this loop P times
-          // TODO arrange
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-          // Initial computation + prefetching
-          "mul x16,  x3, x12 \n\t"
-          "mul x17,  x3, x13 \n\t"
-          "mul x18,  x3, x14 \n\t"
-          "mul x19,  x3, x15 \n\t"
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "mul x20,  x4, x12 \n\t"
-          "mul x21,  x4, x13 \n\t"
-          "mul x22,  x4, x14 \n\t"
-          "mul x23,  x4, x15 \n\t"
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "mul x24, x10, x12 \n\t"
-          "mul x25, x10, x13 \n\t"
-          "mul x26, x10, x14 \n\t"
-          "mul x27, x10, x15 \n\t"
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "mul x28, x11, x12 \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "mul x29, x11, x13 \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "mul x30, x11, x14 \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "mul x31, x11, x15 \n\t"
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t"  // Increment by P-3
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-          // Inner loop: Do this loop N times
-          "1: \n\t"
-          "p.mac x16,  x3, x12 \n\t"
-          "p.mac x17,  x3, x13 \n\t"
-          "p.mac x20,  x4, x12 \n\t"
-          "p.mac x21,  x4, x13 \n\t"
-          "p.mac x18,  x3, x14 \n\t"
-          "p.mac x22,  x4, x14 \n\t"
-          "p.mac x19,  x3, x15 \n\t"
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "p.mac x23,  x4, x15 \n\t"
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "p.mac x24, x10, x12 \n\t"
-          "p.mac x28, x11, x12 \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "p.mac x25, x10, x13 \n\t"
-          "p.mac x29, x11, x13 \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "p.mac x26, x10, x14 \n\t"
-          "p.mac x30, x11, x14 \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "p.mac x27, x10, x15 \n\t"
-          "p.mac x31, x11, x15 \n\t"
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-          "bne %[addr_b], x1, 1b \n\t"
-          // Loop done store
-          "p.mac x16,  x3, x12 \n\t"
-          "p.mac x17,  x3, x13 \n\t"
-          "p.mac x18,  x3, x14 \n\t"
-          "p.sw x16, 4(%[addr_c]!) \n\t"
-          "p.mac x19,  x3, x15 \n\t"
-          "p.sw x17, 4(%[addr_c]!) \n\t"
-          "p.mac x20,  x4, x12 \n\t"
-          "p.sw x18, 4(%[addr_c]!) \n\t"
-          "p.mac x21,  x4, x13 \n\t"
-          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x22,  x4, x14 \n\t"
-          "p.sw x20, 4(%[addr_c]!) \n\t"
-          "p.mac x23,  x4, x15 \n\t"
-          "p.sw x21, 4(%[addr_c]!) \n\t"
-          "p.mac x24, x10, x12 \n\t"
-          "p.sw x22, 4(%[addr_c]!) \n\t"
-          "p.mac x25, x10, x13 \n\t"
-          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x26, x10, x14 \n\t"
-          "p.sw x24, 4(%[addr_c]!) \n\t"
-          "p.mac x27, x10, x15 \n\t"
-          "p.sw x25, 4(%[addr_c]!) \n\t"
-          "p.mac x28, x11, x12 \n\t"
-          "p.sw x26, 4(%[addr_c]!) \n\t"
-          "p.mac x29, x11, x13 \n\t"
-          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x30, x11, x14 \n\t"
-          "p.sw x28, 4(%[addr_c]!) \n\t"
-          "p.mac x31, x11, x15 \n\t"
-          "p.sw x29, 4(%[addr_c]!) \n\t"
-          "p.sw x30, 4(%[addr_c]!) \n\t"
-          "p.sw x31, %[P_3](%[addr_c]!) \n\t"
-          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
-            [addr_c] "+&r"(addr_c) // Outputs
-          : [N3_1] "r"(N3_1_r), [P_3] "r"(P_3_r), [x1] "r"(k),
-            [N] "I"(matrix_N * 4) // Inputs
-          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
-            "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
-            "x27", "x28", "x29", "x30", "x31", "memory"); // Clobber
-    }
-  }
-}
-
-void mat_mul_unrolled_4x4_conflict_opt_parallel_asm(
-    int32_t const *__restrict__ A, int32_t const *__restrict__ B,
-    int32_t *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, uint32_t id,
-    uint32_t numThreads) {
-
-  /////////////////////////////
-  //      Configuration      //
-  /////////////////////////////
-  // Parallelize by assigning each core one row
-  // How many cores per window
-  uint32_t c = numThreads / (M / 4);
-  if (numThreads * 4 < M) {
-    c = 1;
-  }
-  uint32_t const c_start = (P / c) * (id % c);
-  uint32_t const c_end = (P / c) * ((id % c) + 1);
-
-  // For avoiding group conflict by same tile
-  // Each cores in the same tile should access to different groups
-  uint32_t group_bank_nums = 512;              // MemPool = 256
-  uint32_t tile_core_nums = 8;                 // MemPool = 4
-  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
-  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
-  // Window size limit, min jump lines is 4 for MatrixA
-  if (jump_lines_A < 4) {
-    jump_lines_A = 4;
-  }
-
-  /////////////////////////////
-  //      LOOP   OFFSET      //
-  /////////////////////////////
-  // Outer Loop Control, for group access port conflict
-  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
-  // Inner Loop Incremental Control, for group access port conflict
-  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
-  // Inner Loop Control
-  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
-  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
-  // Middle Loop Control, window jump for avoiding bank conflict
-  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
-  uint32_t j_offset = (2 * (id / c)) / conflict_row;
-
-  /////////////////////////////
-  //      LOOP  CONTROL      //
-  /////////////////////////////
-  // Inner Round-Robin
-  if (k_offset >= N) {
-    k_offset = k_offset - N * (k_offset / N);
-  }
-  // Middle Round-Robin
-  uint32_t window_in_P = (P / c) / 4;
-  if (j_offset >= window_in_P) {
-    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
-  }
-  // Outer Loop Control
-  uint32_t outer_loop_counter = 0;
-  uint32_t outer_loop_time = M / (4 * numThreads);
-  if (outer_loop_time < 1) {
-    outer_loop_time = 1;
-  }
-  uint32_t M_partition = M / outer_loop_time;
-
-  /////////////////////////////
-  //      *LOOP  START*      //
-  /////////////////////////////
-  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
-       i_ori += 4 * (numThreads / c)) {
-    outer_loop_counter += 1;
-    uint32_t i = i_ori + i_offset;
-    // Round-Robin control, if offset lines > M, back to the first window
-    if (i >= M_partition * outer_loop_counter) {
-      i = i - M_partition * (i / (M_partition * outer_loop_counter));
-    }
-    // Backup counter for mid-loop
-    uint32_t j_offset_counter = c_start + j_offset * 4;
-    uint32_t P_counter = c_end;
-
-  Mid_loop:
-    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
-      // Address registers
-      int32_t const *addr_a_ori = &A[i * N];
-      int32_t const *addr_b_ori = &B[j];
-      int32_t const *addr_a = &A[i * N + k_offset];
-      int32_t const *addr_b = &B[k_offset * P + j];
-      int32_t const *end_b = &B[N * P + j];
-      int32_t const *addr_c = &C[i * P + j];
-      register int32_t k asm("x1") = (int32_t)end_b;
-
-      __asm__ volatile(
-          ".balign 16 \n\t"
-          // Outer loop: Initialize and preload. Execute this loop P times
-          // TODO arrange
-          "add sp, sp, -8 \n\t"
-          "sw %[addr_b], 0(sp) \n\t"
-          "sw %[addr_a_ori], 4(sp) \n\t"
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-
-          // If reach endpoint, swap address
-          "bne %[addr_b], x1, init_comp \n\t"
-          "lw x1, 0(sp) \n\t"
-          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
-          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
-          "sw %[addr_b], 0(sp) \n\t"
-
-          // Initial computation + prefetching
-          "init_comp: \n\t"
-          "mul x16,  x3, x12 \n\t"
-          "mul x17,  x3, x13 \n\t"
-          "mul x18,  x3, x14 \n\t"
-          "mul x19,  x3, x15 \n\t"
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "mul x20,  x4, x12 \n\t"
-          "mul x21,  x4, x13 \n\t"
-          "mul x22,  x4, x14 \n\t"
-          "mul x23,  x4, x15 \n\t"
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "mul x24, x10, x12 \n\t"
-          "mul x25, x10, x13 \n\t"
-          "mul x26, x10, x14 \n\t"
-          "mul x27, x10, x15 \n\t"
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "mul x28, x11, x12 \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "mul x29, x11, x13 \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "mul x30, x11, x14 \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "mul %[addr_a_ori], x11, x15 \n\t"   // Use addr_a_ori instead of x31
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t"  // Increment by P-3
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-
-          // If reach endpoint, swap address
-          "bne %[addr_b], x1, inner_loop \n\t"
-          "sw %[addr_a_ori], 8(sp) \n\t" // backup x31
-          "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori
-          "lw x1, 0(sp) \n\t"
-          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
-          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
-          "sw %[addr_b], 0(sp) \n\t"
-          "lw %[addr_a_ori], 8(sp) \n\t" // load back x31
-
-          // Inner loop: Do this loop N times
-          "inner_loop: \n\t"
-          "1: \n\t"
-          "p.mac x16,  x3, x12 \n\t"
-          "p.mac x17,  x3, x13 \n\t"
-          "p.mac x20,  x4, x12 \n\t"
-          "p.mac x21,  x4, x13 \n\t"
-          "p.mac x18,  x3, x14 \n\t"
-          "p.mac x22,  x4, x14 \n\t"
-          "p.mac x19,  x3, x15 \n\t"
-          "p.lw  x3, %[N](%[addr_a]!) \n\t"
-          "p.mac x23,  x4, x15 \n\t"
-          "p.lw  x4, %[N](%[addr_a]!) \n\t"
-          "p.mac x24, x10, x12 \n\t"
-          "p.mac x28, x11, x12 \n\t"
-          "p.lw x12, 4(%[addr_b]!) \n\t"
-          "p.mac x25, x10, x13 \n\t"
-          "p.mac x29, x11, x13 \n\t"
-          "p.lw x13, 4(%[addr_b]!) \n\t"
-          "p.mac x26, x10, x14 \n\t"
-          "p.mac x30, x11, x14 \n\t"
-          "p.lw x14, 4(%[addr_b]!) \n\t"
-          "p.mac x27, x10, x15 \n\t"
-          "p.mac %[addr_a_ori], x11, x15 \n\t"
-          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
-          "p.lw x10, %[N](%[addr_a]!) \n\t"
-          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
-          "bne %[addr_b], x1, 1b \n\t"
-
-          // Case1: Loop done if k_offset = 0
-          // Case2: Loop done when 2nd time to here
-          // Case3: If reach endpoint, swap address
-          "lw %[addr_b], 0(sp) \n\t"
-          "beq %[addr_b_ori], %[addr_b], store \n\t"
-          "sw %[addr_a_ori], 8(sp) \n\t" // backup x31
-          "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori
-          "addi x1, %[addr_b], 0 \n\t"
-          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
-          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
-          "sw %[addr_b], 0(sp) \n\t"
-          "lw %[addr_a_ori], 8(sp) \n\t" // load back x31
-          "j 1b \n\t"
-
-          // Loop done store
-          "store: \n\t"
-          "p.mac x16,  x3, x12 \n\t"
-          "p.mac x17,  x3, x13 \n\t"
-          "p.mac x18,  x3, x14 \n\t"
-          "p.sw x16, 4(%[addr_c]!) \n\t"
-          "p.mac x19,  x3, x15 \n\t"
-          "p.sw x17, 4(%[addr_c]!) \n\t"
-          "p.mac x20,  x4, x12 \n\t"
-          "p.sw x18, 4(%[addr_c]!) \n\t"
-          "p.mac x21,  x4, x13 \n\t"
-          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x22,  x4, x14 \n\t"
-          "p.sw x20, 4(%[addr_c]!) \n\t"
-          "p.mac x23,  x4, x15 \n\t"
-          "p.sw x21, 4(%[addr_c]!) \n\t"
-          "p.mac x24, x10, x12 \n\t"
-          "p.sw x22, 4(%[addr_c]!) \n\t"
-          "p.mac x25, x10, x13 \n\t"
-          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x26, x10, x14 \n\t"
-          "p.sw x24, 4(%[addr_c]!) \n\t"
-          "p.mac x27, x10, x15 \n\t"
-          "p.sw x25, 4(%[addr_c]!) \n\t"
-          "p.mac x28, x11, x12 \n\t"
-          "p.sw x26, 4(%[addr_c]!) \n\t"
-          "p.mac x29, x11, x13 \n\t"
-          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
-          "p.mac x30, x11, x14 \n\t"
-          "p.sw x28, 4(%[addr_c]!) \n\t"
-          "p.mac %[addr_a_ori], x11, x15 \n\t"
-          "p.sw x29, 4(%[addr_c]!) \n\t"
-          "p.sw x30, 4(%[addr_c]!) \n\t"
-          "p.sw %[addr_a_ori], %[P_3](%[addr_c]!) \n\t"
-          "add sp, sp, 8 \n\t"
-          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
-            [addr_c] "+&r"(addr_c), [addr_a_ori] "+&r"(addr_a_ori),
-            [addr_b_ori] "+&r"(addr_b_ori) // Outputs
-          : [N3_1] "r"(N31), [P_3] "I"(P3), [x1] "r"(k),
-            [N] "I"(matrix_N * 4) // Inputs
-          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
-            "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
-            "x27", "x28", "x29", "x30", "memory"); // Clobber
-    }
-    if (j_offset_counter != c_start) {
-      P_counter = j_offset_counter;
-      j_offset_counter = c_start;
-      goto Mid_loop;
-    }
-  }
-}
diff --git a/software/kernels/baremetal/mempool_matmul_i32p.h b/software/kernels/baremetal/mempool_matmul_i32p.h
index ac6a6647a..9eb86f32c 100644
--- a/software/kernels/baremetal/mempool_matmul_i32p.h
+++ b/software/kernels/baremetal/mempool_matmul_i32p.h
@@ -414,3 +414,620 @@ void matmul_unrolled_2x2_parallel_i32_xpulpv2(int32_t const *__restrict__ A,
   }
 }
 #endif
+
+void mat_mul_unrolled_4x4_parallel(int32_t const *__restrict__ A,
+                                   int32_t const *__restrict__ B,
+                                   int32_t *__restrict__ C, uint32_t M,
+                                   uint32_t N, uint32_t P, uint32_t id,
+                                   uint32_t numThreads) {
+  // C[MxP] = A[MxN] * B[NxP] in 4x4 tiles; c cores share a band of 4 rows.
+  // Clamp c to 1 when row bands outnumber cores, so P / c never divides by 0.
+  uint32_t const c = (numThreads * 4 < M) ? 1 : numThreads / (M / 4);
+  uint32_t const c_start = (P / c) * (id % c);     // first column (inclusive)
+  uint32_t const c_end = (P / c) * ((id % c) + 1); // last column (exclusive)
+  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 4) {
+      // Initialize 4x4 output tile
+      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
+      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
+      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
+      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
+      for (uint32_t k = 0; k < N; k += 1) {
+        // Explicitly load the values first to help with scheduling
+        int32_t b0 = B[k * P + j + 0];
+        int32_t b1 = B[k * P + j + 1];
+        int32_t b2 = B[k * P + j + 2];
+        int32_t b3 = B[k * P + j + 3];
+        // A could be local with scrambling
+        int32_t a0 = A[(i + 0) * N + k];
+        int32_t a1 = A[(i + 1) * N + k];
+        int32_t a2 = A[(i + 2) * N + k];
+        int32_t a3 = A[(i + 3) * N + k];
+        // Compute
+        c00 += a0 * b0;
+        c01 += a0 * b1;
+        c02 += a0 * b2;
+        c03 += a0 * b3;
+        c10 += a1 * b0;
+        c11 += a1 * b1;
+        c12 += a1 * b2;
+        c13 += a1 * b3;
+        c20 += a2 * b0;
+        c21 += a2 * b1;
+        c22 += a2 * b2;
+        c23 += a2 * b3;
+        c30 += a3 * b0;
+        c31 += a3 * b1;
+        c32 += a3 * b2;
+        c33 += a3 * b3;
+      }
+      // Store
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+      C[(i + 2) * P + j + 0] = c20;
+      C[(i + 2) * P + j + 1] = c21;
+      C[(i + 2) * P + j + 2] = c22;
+      C[(i + 2) * P + j + 3] = c23;
+      C[(i + 3) * P + j + 0] = c30;
+      C[(i + 3) * P + j + 1] = c31;
+      C[(i + 3) * P + j + 2] = c32;
+      C[(i + 3) * P + j + 3] = c33;
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_conflict_opt_parallel(int32_t const *__restrict__ A,
+                                                int32_t const *__restrict__ B,
+                                                int32_t *__restrict__ C,
+                                                uint32_t M, uint32_t N,
+                                                uint32_t P, uint32_t id,
+                                                uint32_t numThreads) {
+  // 4x4-tiled C = A * B where each core starts at its own i/j/k offset
+  /////////////////////////////
+  //      Configuration      //
+  /////////////////////////////
+  // Each core computes 4x4 tiles of C from one band of 4 rows at a time
+  // c: number of cores sharing a band (1 if bands outnumber cores)
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+
+  // Avoid group conflicts between cores of the same tile:
+  // cores in the same tile should access different groups
+  uint32_t group_bank_nums = 512;              // MemPool = 256
+  uint32_t tile_core_nums = 8;                 // MemPool = 4
+  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
+  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
+  // Window size limit: the minimum jump for matrix A is 4 lines
+  if (jump_lines_A < 4) {
+    jump_lines_A = 4;
+  }
+
+  /////////////////////////////
+  //      LOOP   OFFSET      //
+  /////////////////////////////
+  // Outer loop: row offset per core, to avoid group access-port conflicts
+  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
+  // Inner loop: extra k offset per core, to avoid group access-port conflicts
+  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
+  // Inner Loop Control
+  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
+  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
+  // Middle loop: window jump to avoid bank conflicts
+  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
+  uint32_t j_offset = (2 * (id / c)) / conflict_row;
+
+  /////////////////////////////
+  //      LOOP  CONTROL      //
+  /////////////////////////////
+  // Inner round-robin: wrap k_offset into [0, N)
+  if (k_offset >= N) {
+    k_offset = k_offset - N * (k_offset / N);
+  }
+  // Middle round-robin: wrap j_offset into [0, window_in_P)
+  uint32_t window_in_P = (P / c) / 4;
+  if (j_offset >= window_in_P) {
+    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
+  }
+  // Outer Loop Control
+  uint32_t outer_loop_counter = 0;
+  uint32_t outer_loop_time = M / (4 * numThreads);
+  if (outer_loop_time < 1) {
+    outer_loop_time = 1;
+  }
+  uint32_t M_partition = M / outer_loop_time;
+
+  /////////////////////////////
+  //      *LOOP  START*      //
+  /////////////////////////////
+  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
+       i_ori += 4 * (numThreads / c)) {
+    outer_loop_counter += 1;
+    uint32_t i = i_ori + i_offset;
+    // Round-robin: wrap i back when it passes this iteration's partition end
+    if (i >= M_partition * outer_loop_counter) {
+      i = i - M_partition * (i / (M_partition * outer_loop_counter));
+    }
+    // Mid-loop bounds: the first pass runs [c_start + 4 * j_offset, c_end)
+    uint32_t j_offset_counter = c_start + j_offset * 4;
+    uint32_t P_counter = c_end;
+
+  Mid_loop:
+    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
+      // Initialize 4x4 output tile
+      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
+      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
+      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
+      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
+
+      // Inner-loop bounds: the first pass runs [k_offset, N)
+      uint32_t k_offset_counter = k_offset;
+      uint32_t N_counter = N;
+
+    Inner_Loop:
+      for (uint32_t k = k_offset_counter; k < N_counter; k += 1) {
+        // Explicitly load the values first to help with scheduling
+        int32_t b0 = B[k * P + j + 0];
+        int32_t b1 = B[k * P + j + 1];
+        int32_t b2 = B[k * P + j + 2];
+        int32_t b3 = B[k * P + j + 3];
+        // A could be local with scrambling
+        int32_t a0 = A[(i + 0) * N + k];
+        int32_t a1 = A[(i + 1) * N + k];
+        int32_t a2 = A[(i + 2) * N + k];
+        int32_t a3 = A[(i + 3) * N + k];
+        // Compute
+        c00 += a0 * b0;
+        c01 += a0 * b1;
+        c02 += a0 * b2;
+        c03 += a0 * b3;
+        c10 += a1 * b0;
+        c11 += a1 * b1;
+        c12 += a1 * b2;
+        c13 += a1 * b3;
+        c20 += a2 * b0;
+        c21 += a2 * b1;
+        c22 += a2 * b2;
+        c23 += a2 * b3;
+        c30 += a3 * b0;
+        c31 += a3 * b1;
+        c32 += a3 * b2;
+        c33 += a3 * b3;
+      }
+
+      // Jump back for a second pass over [0, k_offset) instead of duplicating
+      // the inner loop; two copies schedule badly given the register file limit.
+      if (k_offset_counter > 0) {
+        N_counter = k_offset;
+        k_offset_counter = 0;
+        goto Inner_Loop;
+      }
+
+      // Store
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+      C[(i + 2) * P + j + 0] = c20;
+      C[(i + 2) * P + j + 1] = c21;
+      C[(i + 2) * P + j + 2] = c22;
+      C[(i + 2) * P + j + 3] = c23;
+      C[(i + 3) * P + j + 0] = c30;
+      C[(i + 3) * P + j + 1] = c31;
+      C[(i + 3) * P + j + 2] = c32;
+      C[(i + 3) * P + j + 3] = c33;
+    }
+    // Second pass covers the skipped columns [c_start, c_start + 4 * j_offset)
+    if (j_offset_counter != c_start) {
+      P_counter = j_offset_counter;
+      j_offset_counter = c_start;
+      goto Mid_loop;
+    }
+  }
+}
+
+/*******************************/
+/* ASM CODE KERNEL START BELOW */
+/*******************************/
+
+// Immediate values used in the asm code; parenthesized so they expand safely.
+#define N3 ((matrix_M - 3) * 4) // NOTE(review): uses matrix_M, not N - confirm
+#define N31 ((-3 * matrix_N + 1) * 4)
+#define P3 ((matrix_P - 3) * 4)
+#define P31 ((-3 * matrix_N + 1) * 4) // NOTE(review): same as N31 - confirm
+
+void mat_mul_unrolled_4x4_parallel_asm(int32_t const *__restrict__ A,
+                                       int32_t const *__restrict__ B,
+                                       int32_t *__restrict__ C, uint32_t M,
+                                       uint32_t N, uint32_t P, uint32_t id,
+                                       uint32_t numThreads) {
+  // 4x4-tiled C = A * B; 'id' selects this core's rows and column range.
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  // c = number of column slices; each core handles P / c columns.
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 4) {
+      // Start addresses and byte strides handed to the asm below
+      int32_t const *addr_a = &A[i * N];
+      int32_t const *addr_b = &B[j];
+      int32_t const *end_b = &B[N * P + j];
+      int32_t const *addr_c = &C[i * P + j];
+      int32_t const N3_1_r = (-3 * (int32_t)N + 1) * 4; // A: -3 rows, +1 col
+      int32_t const P_3_r = ((int32_t)P - 3) * 4; // B/C: +1 row, -3 cols
+
+      register int32_t k asm("x1") = (int32_t)end_b;
+      // Register use (B values on top, A values on the left, sums inside):
+      //      x12 x13 x14 x15
+      // x3   x16 x17 x18 x19
+      // x4   x20 x21 x22 x23
+      // x10  x24 x25 x26 x27
+      // x11  x28 x29 x30 x31
+      // x1 holds end_b; the inner loop exits once addr_b reaches it.
+      // NOTE(review): [N] comes from matrix_N, so N == matrix_N is assumed.
+      __asm__ volatile(
+          ".balign 16 \n\t"
+          // Prologue (once per 4x4 tile): preload 4 A values and 4 B values
+          // TODO arrange
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          // First products use mul (this initializes the 16 sums) + prefetch
+          "mul x16,  x3, x12 \n\t"
+          "mul x17,  x3, x13 \n\t"
+          "mul x18,  x3, x14 \n\t"
+          "mul x19,  x3, x15 \n\t"
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "mul x20,  x4, x12 \n\t"
+          "mul x21,  x4, x13 \n\t"
+          "mul x22,  x4, x14 \n\t"
+          "mul x23,  x4, x15 \n\t"
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "mul x24, x10, x12 \n\t"
+          "mul x25, x10, x13 \n\t"
+          "mul x26, x10, x14 \n\t"
+          "mul x27, x10, x15 \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "mul x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "mul x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "mul x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "mul x31, x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t"  // Increment by P-3
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          // Inner loop: accumulate and prefetch until addr_b equals x1 (end_b)
+          "1: \n\t"
+          "p.mac x16,  x3, x12 \n\t"
+          "p.mac x17,  x3, x13 \n\t"
+          "p.mac x20,  x4, x12 \n\t"
+          "p.mac x21,  x4, x13 \n\t"
+          "p.mac x18,  x3, x14 \n\t"
+          "p.mac x22,  x4, x14 \n\t"
+          "p.mac x19,  x3, x15 \n\t"
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "p.mac x23,  x4, x15 \n\t"
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.mac x31, x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          "bne %[addr_b], x1, 1b \n\t"
+          // Epilogue: last accumulation interleaved with the 16 stores to C
+          "p.mac x16,  x3, x12 \n\t"
+          "p.mac x17,  x3, x13 \n\t"
+          "p.mac x18,  x3, x14 \n\t"
+          "p.sw x16, 4(%[addr_c]!) \n\t"
+          "p.mac x19,  x3, x15 \n\t"
+          "p.sw x17, 4(%[addr_c]!) \n\t"
+          "p.mac x20,  x4, x12 \n\t"
+          "p.sw x18, 4(%[addr_c]!) \n\t"
+          "p.mac x21,  x4, x13 \n\t"
+          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x22,  x4, x14 \n\t"
+          "p.sw x20, 4(%[addr_c]!) \n\t"
+          "p.mac x23,  x4, x15 \n\t"
+          "p.sw x21, 4(%[addr_c]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.sw x22, 4(%[addr_c]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.sw x24, 4(%[addr_c]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.sw x25, 4(%[addr_c]!) \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.sw x26, 4(%[addr_c]!) \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.sw x28, 4(%[addr_c]!) \n\t"
+          "p.mac x31, x11, x15 \n\t"
+          "p.sw x29, 4(%[addr_c]!) \n\t"
+          "p.sw x30, 4(%[addr_c]!) \n\t"
+          "p.sw x31, %[P_3](%[addr_c]!) \n\t"
+          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
+            [addr_c] "+&r"(addr_c) // Outputs
+          : [N3_1] "r"(N3_1_r), [P_3] "r"(P_3_r), [x1] "r"(k),
+            [N] "I"(matrix_N * 4) // Inputs
+          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
+            "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
+            "x27", "x28", "x29", "x30", "x31", "memory"); // Clobber
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_conflict_opt_parallel_asm(
+    int32_t const *__restrict__ A, int32_t const *__restrict__ B,
+    int32_t *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, uint32_t id,
+    uint32_t numThreads) {
+  // 4x4-tiled C = A * B; every core starts at its own shifted i/j/k position.
+  /////////////////////////////
+  //      Configuration      //
+  /////////////////////////////
+  // Base partition before offsets: c cores share each band of four rows,
+  // and each of them gets P / c columns (c_start .. c_end).
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+
+  // Avoid group conflicts between cores of the same tile:
+  // each core of a tile should access a different group.
+  uint32_t group_bank_nums = 512;              // MemPool = 256
+  uint32_t tile_core_nums = 8;                 // MemPool = 4
+  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
+  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
+  // Window size limit: the minimum jump for matrix A is 4 lines
+  if (jump_lines_A < 4) {
+    jump_lines_A = 4;
+  }
+
+  /////////////////////////////
+  //      LOOP   OFFSET      //
+  /////////////////////////////
+  // Outer Loop Control, for group access port conflict
+  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
+  // Inner Loop Incremental Control, for group access port conflict
+  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
+  // Inner Loop Control
+  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
+  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
+  // Middle Loop Control, window jump for avoiding bank conflict
+  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
+  uint32_t j_offset = (2 * (id / c)) / conflict_row;
+
+  /////////////////////////////
+  //      LOOP  CONTROL      //
+  /////////////////////////////
+  // Inner Round-Robin: wrap k_offset into [0, N)
+  if (k_offset >= N) {
+    k_offset = k_offset - N * (k_offset / N);
+  }
+  // Middle Round-Robin: wrap j_offset into [0, window_in_P)
+  uint32_t window_in_P = (P / c) / 4;
+  if (j_offset >= window_in_P) {
+    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
+  }
+  // Outer Loop Control
+  uint32_t outer_loop_counter = 0;
+  uint32_t outer_loop_time = M / (4 * numThreads);
+  if (outer_loop_time < 1) {
+    outer_loop_time = 1;
+  }
+  uint32_t M_partition = M / outer_loop_time;
+
+  /////////////////////////////
+  //      *LOOP  START*      //
+  /////////////////////////////
+  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
+       i_ori += 4 * (numThreads / c)) {
+    outer_loop_counter += 1;
+    uint32_t i = i_ori + i_offset;
+    // Round-Robin control, if offset lines > M, back to the first window
+    if (i >= M_partition * outer_loop_counter) {
+      i = i - M_partition * (i / (M_partition * outer_loop_counter));
+    }
+    // Mid loop runs [start, c_end) first, then wraps to [c_start, start)
+    uint32_t j_offset_counter = c_start + j_offset * 4;
+    uint32_t P_counter = c_end;
+
+  Mid_loop:
+    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
+      // Address registers: *_ori point at k = 0, addr_a/addr_b at k = k_offset
+      int32_t const *addr_a_ori = &A[i * N];
+      int32_t const *addr_b_ori = &B[j];
+      int32_t const *addr_a = &A[i * N + k_offset];
+      int32_t const *addr_b = &B[k_offset * P + j];
+      int32_t const *end_b = &B[N * P + j];
+      int32_t const *addr_c = &C[i * P + j];
+      register int32_t k asm("x1") = (int32_t)end_b;
+      // x1 is the current stop address; the asm rewrites it on wrap-around.
+      __asm__ volatile(
+          ".balign 16 \n\t"
+          // Prologue: reserve scratch slots, then preload the first operands
+          // TODO arrange
+          "addi sp, sp, -16 \n\t"        // slots 0(sp), 4(sp), 8(sp) are used
+          "sw %[addr_b], 0(sp) \n\t"     // 0(sp): address where addr_b started
+          "sw %[addr_a_ori], 4(sp) \n\t" // 4(sp): pointer to A[i * N]
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+
+          // If the end of B is reached, wrap both pointers back to k = 0
+          "bne %[addr_b], x1, 2f \n\t"
+          "lw x1, 0(sp) \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+
+          // Initial computation + prefetching
+          "2: \n\t"
+          "mul x16,  x3, x12 \n\t"
+          "mul x17,  x3, x13 \n\t"
+          "mul x18,  x3, x14 \n\t"
+          "mul x19,  x3, x15 \n\t"
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "mul x20,  x4, x12 \n\t"
+          "mul x21,  x4, x13 \n\t"
+          "mul x22,  x4, x14 \n\t"
+          "mul x23,  x4, x15 \n\t"
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "mul x24, x10, x12 \n\t"
+          "mul x25, x10, x13 \n\t"
+          "mul x26, x10, x14 \n\t"
+          "mul x27, x10, x15 \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "mul x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "mul x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "mul x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "mul %[addr_a_ori], x11, x15 \n\t"   // Use addr_a_ori instead of x31
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t"  // Increment by P-3
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+
+          // If the end of B is reached, wrap both pointers back to k = 0
+          "bne %[addr_b], x1, 1f \n\t"
+          "sw %[addr_a_ori], 8(sp) \n\t" // backup x31
+          "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori
+          "lw x1, 0(sp) \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+          "lw %[addr_a_ori], 8(sp) \n\t" // load back x31
+
+          // Inner loop: accumulate and prefetch until addr_b equals x1
+          // (numeric local labels stay unique if this asm is emitted twice)
+          "1: \n\t"
+          "p.mac x16,  x3, x12 \n\t"
+          "p.mac x17,  x3, x13 \n\t"
+          "p.mac x20,  x4, x12 \n\t"
+          "p.mac x21,  x4, x13 \n\t"
+          "p.mac x18,  x3, x14 \n\t"
+          "p.mac x22,  x4, x14 \n\t"
+          "p.mac x19,  x3, x15 \n\t"
+          "p.lw  x3, %[N](%[addr_a]!) \n\t"
+          "p.mac x23,  x4, x15 \n\t"
+          "p.lw  x4, %[N](%[addr_a]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.mac %[addr_a_ori], x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          "bne %[addr_b], x1, 1b \n\t"
+
+          // Case1: Loop done if k_offset = 0
+          // Case2: Loop done when reaching here for the 2nd time
+          // Case3: End of B reached, wrap to k = 0 and stop at the start row
+          "lw %[addr_b], 0(sp) \n\t"
+          "beq %[addr_b_ori], %[addr_b], 3f \n\t"
+          "sw %[addr_a_ori], 8(sp) \n\t" // backup x31
+          "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori
+          "addi x1, %[addr_b], 0 \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+          "lw %[addr_a_ori], 8(sp) \n\t" // load back x31
+          "j 1b \n\t"
+
+          // Loop done: last accumulation interleaved with the 16 stores to C
+          "3: \n\t"
+          "p.mac x16,  x3, x12 \n\t"
+          "p.mac x17,  x3, x13 \n\t"
+          "p.mac x18,  x3, x14 \n\t"
+          "p.sw x16, 4(%[addr_c]!) \n\t"
+          "p.mac x19,  x3, x15 \n\t"
+          "p.sw x17, 4(%[addr_c]!) \n\t"
+          "p.mac x20,  x4, x12 \n\t"
+          "p.sw x18, 4(%[addr_c]!) \n\t"
+          "p.mac x21,  x4, x13 \n\t"
+          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x22,  x4, x14 \n\t"
+          "p.sw x20, 4(%[addr_c]!) \n\t"
+          "p.mac x23,  x4, x15 \n\t"
+          "p.sw x21, 4(%[addr_c]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.sw x22, 4(%[addr_c]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.sw x24, 4(%[addr_c]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.sw x25, 4(%[addr_c]!) \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.sw x26, 4(%[addr_c]!) \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.sw x28, 4(%[addr_c]!) \n\t"
+          "p.mac %[addr_a_ori], x11, x15 \n\t"
+          "p.sw x29, 4(%[addr_c]!) \n\t"
+          "p.sw x30, 4(%[addr_c]!) \n\t"
+          "p.sw %[addr_a_ori], %[P_3](%[addr_c]!) \n\t"
+          "addi sp, sp, 16 \n\t" // release the scratch slots
+          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
+            [addr_c] "+&r"(addr_c), [addr_a_ori] "+&r"(addr_a_ori),
+            [addr_b_ori] "+&r"(addr_b_ori), [x1] "+&r"(k) // Outputs (x1 is rewritten)
+          : [N3_1] "r"(N31), [P_3] "I"(P3),
+            [N] "I"(matrix_N * 4) // Inputs (compile-time matrix_N / matrix_P)
+          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
+            "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
+            "x27", "x28", "x29", "x30", "memory"); // Clobber
+    }
+    if (j_offset_counter != c_start) {
+      P_counter = j_offset_counter;
+      j_offset_counter = c_start;
+      goto Mid_loop;
+    }
+  }
+}