Skip to content

Commit

Permalink
[software] Move the port-conflict optimized matmul to matmul_i32p
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Dec 10, 2024
1 parent 3ea70e0 commit 5bee548
Show file tree
Hide file tree
Showing 7 changed files with 699 additions and 1,098 deletions.
3 changes: 2 additions & 1 deletion software/apps/baremetal/matmul_i32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
#include "runtime.h"
#include "synchronization.h"

#include "data_matmul_i32.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_matmul_i32p.h"
#include "data_matmul_i32.h"

int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
Expand Down
68 changes: 39 additions & 29 deletions software/apps/baremetal/matrix_mul/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,28 @@
#include <stdint.h>
#include <string.h>

#include "baremetal/mempool_matmul_i32p.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"

// Define Matrix dimensions:
// C = AB with A=[MxN], B=[NxP], C=[MxP]
#define M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
#define N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
#define P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
#define matrix_M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
#define matrix_N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
#define matrix_P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2))
// Specify how the matrices A and B should be initialized
// The entries will follow this format:
// a(i,j) = A_a*i + A_b*j + A_c
// b(i,j) = B_a*i + B_b*j + B_c
// The result will be the following matrix
// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * N
// + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * (N*(N-1))/2
// + (A_b*B_a) * (N*(N-1)*(2*N-1))/6
// Note: To keep the code simpler, we use indices that go from 0 to N-1 instead
// of 1 to N as the mathematicians do. Hence, for A, i=[0,M-1] j=[0,M-1]
// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * matrix_N
// + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) *
// (matrix_N*(matrix_N-1))/2
// + (A_b*B_a) * (matrix_N*(matrix_N-1)*(2*matrix_N-1))/6
// Note: To keep the code simpler, we use indices that go from 0 to matrix_N-1
// instead of 1 to matrix_N as the mathematicians do. Hence, for A,
// i=[0,matrix_M-1] j=[0,matrix_N-1]
#define A_a 1
#define A_b 1
#define A_c -32
Expand All @@ -37,10 +38,11 @@
// Enable verbose printing
// #define VERBOSE

#include "baremetal/mempool_matmul_i32p.h"
int32_t volatile init __attribute__((section(".l2"))) = 0;
int32_t a[M * N] __attribute__((section(".l1")));
int32_t b[N * P] __attribute__((section(".l1")));
int32_t c[M * P] __attribute__((section(".l1")));
int32_t a[matrix_M * matrix_N] __attribute__((section(".l1")));
int32_t b[matrix_N * matrix_P] __attribute__((section(".l1")));
int32_t c[matrix_M * matrix_P] __attribute__((section(".l1")));

// Initialize the matrices in parallel
void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
Expand All @@ -61,10 +63,13 @@ int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
// Parallelize over rows
for (int32_t i = 0; i < (int32_t)num_rows; ++i) {
for (int32_t j = 0; j < (int32_t)num_columns; ++j) {
int32_t lin = (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * N;
int32_t qua =
((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * (N * (N - 1))) / 2;
int32_t cub = ((ab * ba) * (N * (N - 1) * (2 * N - 1))) / 6;
int32_t lin =
(aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * matrix_N;
int32_t qua = ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) *
(matrix_N * (matrix_N - 1))) /
2;
int32_t cub =
((ab * ba) * (matrix_N * (matrix_N - 1) * (2 * matrix_N - 1))) / 6;
int32_t golden = lin + qua + cub;
if (matrix[i * (int32_t)num_columns + j] != golden) {
return (i + j) == 0 ? -1 : i * (int32_t)num_columns + j;
Expand Down Expand Up @@ -100,14 +105,14 @@ int main() {
// #endif

// Initialize Matrices
init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores);
init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores);
init_matrix(a, matrix_M, matrix_N, A_a, A_b, A_c, core_id, num_cores);
init_matrix(b, matrix_N, matrix_P, B_a, B_b, B_c, core_id, num_cores);

#ifdef VERBOSE
mempool_barrier(num_cores);
if (core_id == 0) {
print_matrix(a, M, N);
print_matrix(b, N, P);
print_matrix(a, matrix_M, matrix_N);
print_matrix(b, matrix_N, matrix_P);
}
#endif

Expand All @@ -121,20 +126,24 @@ int main() {
mempool_start_benchmark();
switch (i) {
case 0:
mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores);
mat_mul_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
num_cores);
break;
case 1:
mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores);
mat_mul_unrolled_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
num_cores);
break;
case 2:
mat_mul_asm_parallel(a, b, c, M, N, P, core_id, num_cores);
mat_mul_asm_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id,
num_cores);
break;
case 3:
mat_mul_parallel_finegrained(a, b, c, M, N, P, core_id, num_cores);
mat_mul_parallel_finegrained(a, b, c, matrix_M, matrix_N, matrix_P,
core_id, num_cores);
break;
case 4:
mat_mul_unrolled_parallel_finegrained(a, b, c, M, N, P, core_id,
num_cores);
mat_mul_unrolled_parallel_finegrained(a, b, c, matrix_M, matrix_N,
matrix_P, core_id, num_cores);
break;
}
mempool_stop_benchmark();
Expand All @@ -144,7 +153,8 @@ int main() {
// Check result
if (core_id == 0) {
// printf("Duration: %d\n", cycles);
int error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c);
int error =
verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c);
if (error != 0) {
printf("Error code %d\n", error);
printf("c[%d]=%d\n", error, c[error]);
Expand All @@ -154,7 +164,7 @@ int main() {
#endif
} else {
// Wait for the approx amount it takes core 0 to verify the result
mempool_wait(M * P * 12);
mempool_wait(matrix_M * matrix_P * 12);
}
}

Expand All @@ -163,7 +173,7 @@ int main() {

#ifdef VERBOSE
if (core_id == 0) {
print_matrix(c, M, P);
print_matrix(c, matrix_M, matrix_P);
}
mempool_barrier(num_cores);
#endif
Expand Down
3 changes: 2 additions & 1 deletion software/apps/baremetal/tests/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <stdint.h>
#include <string.h>

#include "baremetal/mempool_matmul_i32p.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
Expand All @@ -25,6 +24,8 @@
#define matrix_P (NUM_CORES)
#endif

#include "baremetal/mempool_matmul_i32p.h"

int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
Expand Down
208 changes: 0 additions & 208 deletions software/apps/matmul_i32_conflict_opt/main.c

This file was deleted.

Loading

0 comments on commit 5bee548

Please sign in to comment.