[software] Move the port-conflict optimized matmul to matmul_i32p

pulp-platform · Dec 10, 2024 · 5bee548 · 5bee548
1 parent 3ea70e0
commit 5bee548
Show file tree

Hide file tree

Showing 7 changed files with 699 additions and 1,098 deletions.
diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c
@@ -13,9 +13,10 @@
 #include "runtime.h"
 #include "synchronization.h"
 
+#include "data_matmul_i32.h"
+
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_matmul_i32p.h"
-#include "data_matmul_i32.h"
 
 int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));

diff --git a/software/apps/baremetal/matrix_mul/main.c b/software/apps/baremetal/matrix_mul/main.c
@@ -7,27 +7,28 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
 // Define Matrix dimensions:
 // C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
-#define N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
-#define P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
+#define matrix_P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
 // Specify how the matrices A and B should be initialized
 // The entries will follow this format:
 // a(i,j) = A_a*i + A_b*j + A_c
 // b(i,j) = B_a*i + B_b*j + B_c
 // The result will be the following matrix
-// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * N
-//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * (N*(N-1))/2
-//        + (A_b*B_a) * (N*(N-1)*(2*N-1))/6
-// Note: To keep the code simpler, we use indices that go from 0 to N-1 instead
-// of 1 to N as the mathematicians do. Hence, for A, i=[0,M-1] j=[0,M-1]
+// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * matrix_N
+//        + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) *
+//        (matrix_N*(matrix_N-1))/2
+//        + (A_b*B_a) * (matrix_N*(matrix_N-1)*(2*matrix_N-1))/6
+// Note: To keep the code simpler, we use zero-based indices that go from 0 to
+// matrix_N-1 instead of 1 to matrix_N as the mathematicians do. Hence, for A,
+// i=[0,matrix_M-1] and j=[0,matrix_N-1]
 #define A_a 1
 #define A_b 1
 #define A_c -32
@@ -37,10 +38,11 @@
 // Enable verbose printing
 // #define VERBOSE
 
+#include "baremetal/mempool_matmul_i32p.h"
 int32_t volatile init __attribute__((section(".l2"))) = 0;
-int32_t a[M * N] __attribute__((section(".l1")));
-int32_t b[N * P] __attribute__((section(".l1")));
-int32_t c[M * P] __attribute__((section(".l1")));
+int32_t a[matrix_M * matrix_N] __attribute__((section(".l1")));
+int32_t b[matrix_N * matrix_P] __attribute__((section(".l1")));
+int32_t c[matrix_M * matrix_P] __attribute__((section(".l1")));
 
 // Initialize the matrices in parallel
 void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
@@ -61,10 +63,13 @@ int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
   // Check every entry sequentially, row by row
   for (int32_t i = 0; i < (int32_t)num_rows; ++i) {
     for (int32_t j = 0; j < (int32_t)num_columns; ++j) {
-      int32_t lin = (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * N;
-      int32_t qua =
-          ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * (N * (N - 1))) / 2;
-      int32_t cub = ((ab * ba) * (N * (N - 1) * (2 * N - 1))) / 6;
+      int32_t lin =
+          (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * matrix_N;
+      int32_t qua = ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) *
+                     (matrix_N * (matrix_N - 1))) /
+                    2;
+      int32_t cub =
+          ((ab * ba) * (matrix_N * (matrix_N - 1) * (2 * matrix_N - 1))) / 6;
       int32_t golden = lin + qua + cub;
       if (matrix[i * (int32_t)num_columns + j] != golden) {
         return (i + j) == 0 ? -1 : i * (int32_t)num_columns + j;
@@ -100,14 +105,14 @@ int main() {
   // #endif
 
   // Initialize Matrices
-  init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores);
+  init_matrix(a, matrix_M, matrix_N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(b, matrix_N, matrix_P, B_a, B_b, B_c, core_id, num_cores);
 
 #ifdef VERBOSE
   mempool_barrier(num_cores);
   if (core_id == 0) {
-    print_matrix(a, M, N);
-    print_matrix(b, N, P);
+    print_matrix(a, matrix_M, matrix_N);
+    print_matrix(b, matrix_N, matrix_P);
   }
 #endif
 
@@ -121,20 +126,24 @@ int main() {
     mempool_start_benchmark();
     switch (i) {
     case 0:
-      mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                       num_cores);
       break;
     case 1:
-      mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_unrolled_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                                num_cores);
       break;
     case 2:
-      mat_mul_asm_parallel(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_asm_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
+                           num_cores);
       break;
     case 3:
-      mat_mul_parallel_finegrained(a, b, c, M, N, P, core_id, num_cores);
+      mat_mul_parallel_finegrained(a, b, c, matrix_M, matrix_N, matrix_P,
+                                   core_id, num_cores);
       break;
     case 4:
-      mat_mul_unrolled_parallel_finegrained(a, b, c, M, N, P, core_id,
-                                            num_cores);
+      mat_mul_unrolled_parallel_finegrained(a, b, c, matrix_M, matrix_N,
+                                            matrix_P, core_id, num_cores);
       break;
     }
     mempool_stop_benchmark();
@@ -144,7 +153,8 @@ int main() {
     // Check result
     if (core_id == 0) {
       // printf("Duration: %d\n", cycles);
-      int error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
+      int error =
+          verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
       if (error != 0) {
         printf("Error code %d\n", error);
         printf("c[%d]=%d\n", error, c[error]);
@@ -154,7 +164,7 @@ int main() {
 #endif
     } else {
       // Wait for the approx amount it takes core 0 to verify the result
-      mempool_wait(M * P * 12);
+      mempool_wait(matrix_M * matrix_P * 12);
     }
   }
 
@@ -163,7 +173,7 @@ int main() {
 
 #ifdef VERBOSE
   if (core_id == 0) {
-    print_matrix(c, M, P);
+    print_matrix(c, matrix_M, matrix_P);
   }
   mempool_barrier(num_cores);
 #endif

diff --git a/software/apps/baremetal/tests/main.c b/software/apps/baremetal/tests/main.c
@@ -7,7 +7,6 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
@@ -25,6 +24,8 @@
 #define matrix_P (NUM_CORES)
 #endif
 
+#include "baremetal/mempool_matmul_i32p.h"
+
 int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
 int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
 int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));

diff --git a/software/apps/matmul_i32_conflict_opt/main.c b/software/apps/matmul_i32_conflict_opt/main.c