diff --git a/software/apps/matmul_i32_conflict_opt/main.c b/software/apps/matmul_i32_conflict_opt/main.c
new file mode 100644
index 000000000..c3d61b64b
--- /dev/null
+++ b/software/apps/matmul_i32_conflict_opt/main.c
@@ -0,0 +1,208 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Yichao Zhang, ETH Zurich
+// Author: Samuel Riedel, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+// Define Matrix dimensions:
+// C = AB with A=[MxN], B=[NxP], C=[MxP]
+#define matrix_M 128
+#define matrix_N 64
+#define matrix_P 128
+
+// Define benchmark mode flags
+#define SERIAL_MODE (0)
+#define PARALLEL_MODE (1)
+#define CONCURRENT_MODE (0)
+#define NUM_PARALLEL_CORES (1024)
+
+// Kernel includes
+#include "kernel/mat_mul.h"
+#include "kernel/mat_mul_conflict_opt.h"
+
+// Allocate matrices in the L1 memory
+int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1")));
+int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1")));
+int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1")));
+#if (CONCURRENT_MODE == 1)
+int32_t matrix_d[matrix_M * matrix_N] __attribute__((section(".l1")));
+int32_t matrix_e[matrix_N * matrix_P] __attribute__((section(".l1")));
+int32_t matrix_f[matrix_M * matrix_P] __attribute__((section(".l1")));
+#endif
+int volatile error __attribute__((section(".l2")));
+
+// Function init_matrix
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  // How many rows/columns to split the matrix into
+  uint32_t const split = 4;
+  if (num_columns > num_rows) {
+    // Parallelize over columns
+    uint32_t const c_start = (num_rows / split) * (core_id % split);
+    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
+    for (uint32_t j = (core_id / split); j < num_columns;
+         j += (num_cores / split)) {
+      for (uint32_t i = c_start; i < c_end; ++i) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  } else {
+    // Parallelize over rows
+    uint32_t const c_start = (num_columns / split) * (core_id % split);
+    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
+    for (uint32_t i = (core_id / split); i < num_rows;
+         i += (num_cores / split)) {
+      for (uint32_t j = c_start; j < c_end; ++j) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  }
+}
+
+// Function verify_matrix
+int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                  uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac,
+                  int32_t ba, int32_t bb, int32_t bc, uint32_t core_id,
+                  uint32_t num_cores) {
+  // Convert to signed
+  int32_t n = (int32_t)inner_dim;
+  // Parallelize over rows
+  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      int32_t ii = (int32_t)i;
+      int32_t jj = (int32_t)j;
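+      // The golden value below is the closed form of
+      //   C[i][j] = sum_{k=0}^{n-1} (aa*i + ab*k + ac) * (ba*k + bb*j + bc),
+      // since init_matrix fills A[i][k] = aa*i + ab*k + ac and
+      // B[k][j] = ba*k + bb*j + bc. Expanding with sum(k) = n(n-1)/2 and
+      // sum(k^2) = n(n-1)(2n-1)/6 gives the terms that are constant (lin),
+      // linear (qua), and quadratic (cub) in k: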
+      int32_t lin =
+          (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n;
+      int32_t qua =
+          ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) /
+          2;
+      int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
+      int32_t golden = lin + qua + cub;
+      if (matrix[i * num_columns + j] != golden) {
+        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
+      }
+      matrix[i * num_columns + j] = 0;
+    }
+  }
+  return 0;
+}
+
+// Function test_matrix_multiplication
+int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B,
+                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
+                               uint32_t P, uint32_t core_id,
+                               uint32_t num_cores) {
+  int32_t const A_a = 1;
+  int32_t const A_b = 2;
+  int32_t const A_c = -32;
+  int32_t const B_a = 1;
+  int32_t const B_b = 1;
+  int32_t const B_c = 16;
+
+  // Initialize Matrices
+  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
+#if (CONCURRENT_MODE == 1)
+  init_matrix(matrix_d, M, N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(matrix_e, N, P, B_a, B_b, B_c, core_id, num_cores);
+#endif
+  mempool_barrier(num_cores);
+
+// Serial Benchmark
+#if (SERIAL_MODE == 1)
+  if (core_id == 0) {
+    printf("Serial Calculation Start\n");
+    mempool_start_benchmark();
+    mat_mul_unrolled_4x4_serial(A, B, C, M, N, P);
+    mempool_stop_benchmark();
+    printf("Calculation Finish\n");
+  }
+#endif
+
+// Parallel Benchmark
+#if (PARALLEL_MODE == 1)
+  if (core_id == 0) {
+    printf("Parallel Calculation Start\n");
+  }
+  mempool_barrier(num_cores);
+
+  if (core_id < NUM_PARALLEL_CORES) {
+    mempool_start_benchmark();
+    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id,
+                                                   NUM_PARALLEL_CORES);
+    mempool_start_benchmark();
+    mempool_log_partial_barrier(2, core_id, NUM_PARALLEL_CORES);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+// Concurrent Benchmark: two independent 512-core multiplications
+#if (CONCURRENT_MODE == 1)
+  if (core_id == 0) {
+    printf("Concurrent Calculation Start\n");
+  }
+  mempool_barrier(num_cores);
+
+  if (core_id < 512) {
+    mempool_start_benchmark();
+    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id,
+                                                   512);
+    mempool_start_benchmark();
+    mempool_log_partial_barrier(2, core_id, 512);
+    mempool_stop_benchmark();
+  }
+  if (core_id >= 512) {
+    uint32_t core_id_new = core_id - 512;
+    mempool_start_benchmark();
+    mat_mul_unrolled_4x4_conflict_opt_parallel_asm(matrix_d, matrix_e, matrix_f,
+                                                   M, N, P, core_id_new, 512);
+    mempool_start_benchmark();
+    mempool_log_partial_barrier(2, core_id, 512);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  // Verify results
+  if (core_id == 0) {
+    printf("Start Verify Results\n");
+  }
+  mempool_barrier(num_cores);
+  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
+                    num_cores)) {
+    error = 1;
+    return -1;
+  }
+  return 0;
+}
+
+// Main function
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+  if (core_id == 0) {
+    error = 0;
+  }
+
+  // Test the matrix multiplication
+  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
+                             matrix_P, core_id, num_cores);
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+
+  return error;
+}
diff --git a/software/kernels/baremetal/mat_mul_conflict_opt.h b/software/kernels/baremetal/mat_mul_conflict_opt.h
new file mode 100644
index 000000000..108f074a8
--- /dev/null
+++ b/software/kernels/baremetal/mat_mul_conflict_opt.h
@@ -0,0 +1,828 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Yichao Zhang, ETH Zurich
+// Author: Samuel Riedel, ETH Zurich
+
+/* This library implements matrix multiplication in multiple different ways.
+ * The functions all follow this format:
+ *
+ * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
+ * C = AB
+ */
+
+/* For parallel computation, the general kernels support power-of-two matrix
+ * dimensions. The maximum size for M and P is 4096. The minimum size must
+ * satisfy "c_end - c_start >= 4"; for MemPool this means a minimum matrix
+ * size of M = P = 64, and for TeraPool a minimum of M = P = 128.
+ */
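+
+/* Worked example of the size constraint (assuming the 256-core MemPool and
+ * 1024-core TeraPool configurations): the 4x4 kernels below use
+ * c = numThreads / (M / 4) cores per row window, so each core covers
+ * c_end - c_start = P / c columns.
+ * MemPool,  numThreads = 256,  M = P = 64:  c = 256 / 16 = 16,  P / c = 4.
+ * TeraPool, numThreads = 1024, M = P = 128: c = 1024 / 32 = 32, P / c = 4.
+ * Both hit the minimum of 4 columns per core required by the 4x4 tiling.
+ */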
+void mat_mul_unrolled_4x2_serial(int32_t const *__restrict__ A,
+                                 int32_t const *__restrict__ B,
+                                 int32_t *__restrict__ C, uint32_t M,
+                                 uint32_t N, uint32_t P) {
+  // Compute the output in 2x4 tiles (two rows, four columns at a time)
+  for (uint32_t i = 0; i < M; i += 2) {
+    for (uint32_t j = 0; j < P; j += 4) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c02 = 0;
+      int32_t c03 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+      int32_t c12 = 0;
+      int32_t c13 = 0;
+      for (uint32_t k = 0; k < N; k += 2) {
+        // Explicitly load the values first to help with scheduling
+        int32_t val_a00 = A[(i + 0) * N + k + 0];
+        int32_t val_a01 = A[(i + 0) * N + k + 1];
+        int32_t val_a10 = A[(i + 1) * N + k + 0];
+        int32_t val_a11 = A[(i + 1) * N + k + 1];
+
+        int32_t val_b00 = B[(k + 0) * P + j + 0];
+        int32_t val_b01 = B[(k + 0) * P + j + 1];
+        int32_t val_b02 = B[(k + 0) * P + j + 2];
+        int32_t val_b03 = B[(k + 0) * P + j + 3];
+
+        int32_t val_b10 = B[(k + 1) * P + j + 0];
+        int32_t val_b11 = B[(k + 1) * P + j + 1];
+        int32_t val_b12 = B[(k + 1) * P + j + 2];
+        int32_t val_b13 = B[(k + 1) * P + j + 3];
+
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c02 += val_a00 * val_b02;
+        c02 += val_a01 * val_b12;
+        c03 += val_a00 * val_b03;
+        c03 += val_a01 * val_b13;
+
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+        c12 += val_a10 * val_b02;
+        c12 += val_a11 * val_b12;
+        c13 += val_a10 * val_b03;
+        c13 += val_a11 * val_b13;
+      }
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+    }
+  }
+}
+
+void mat_mul_unrolled_4x2_parallel(int32_t const *__restrict__ A,
+                                   int32_t const *__restrict__ B,
+                                   int32_t *__restrict__ C, uint32_t M,
+                                   uint32_t N, uint32_t P, uint32_t id,
+                                   uint32_t numThreads) {
+  // Parallelize over 2x4 output tiles
+  // How many column windows to split the matrix into; ideally this is
+  // numThreads / (M / 2)
+  uint32_t const c = numThreads / (M / 2);
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
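+  // Example mapping, assuming the defaults from the app (M = P = 128,
+  // numThreads = 1024): c = 1024 / 64 = 16 column windows of P / 16 = 8
+  // columns each. Core id = 17 then computes rows 2..3 (id / 16 = 1) and
+  // columns 8..15 (id % 16 = 1).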
+  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 4) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c02 = 0;
+      int32_t c03 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+      int32_t c12 = 0;
+      int32_t c13 = 0;
+      for (uint32_t k = 0; k < N; k += 2) {
+        // Explicitly load the values first to help with scheduling
+        int32_t val_a00 = A[(i + 0) * N + k + 0];
+        int32_t val_a01 = A[(i + 0) * N + k + 1];
+        int32_t val_a10 = A[(i + 1) * N + k + 0];
+        int32_t val_a11 = A[(i + 1) * N + k + 1];
+
+        int32_t val_b00 = B[(k + 0) * P + j + 0];
+        int32_t val_b01 = B[(k + 0) * P + j + 1];
+        int32_t val_b02 = B[(k + 0) * P + j + 2];
+        int32_t val_b03 = B[(k + 0) * P + j + 3];
+
+        int32_t val_b10 = B[(k + 1) * P + j + 0];
+        int32_t val_b11 = B[(k + 1) * P + j + 1];
+        int32_t val_b12 = B[(k + 1) * P + j + 2];
+        int32_t val_b13 = B[(k + 1) * P + j + 3];
+
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c02 += val_a00 * val_b02;
+        c02 += val_a01 * val_b12;
+        c03 += val_a00 * val_b03;
+        c03 += val_a01 * val_b13;
+
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+        c12 += val_a10 * val_b02;
+        c12 += val_a11 * val_b12;
+        c13 += val_a10 * val_b03;
+        c13 += val_a11 * val_b13;
+      }
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_serial(int32_t const *__restrict__ A,
+                                 int32_t const *__restrict__ B,
+                                 int32_t *__restrict__ C, uint32_t M,
+                                 uint32_t N, uint32_t P) {
+  // Compute the output in 4x4 tiles
+  for (uint32_t i = 0; i < M; i += 4) {
+    for (uint32_t j = 0; j < P; j += 4) {
+      // Initialize 4x4 output tile
+      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
+      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
+      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
+      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
+      for (uint32_t k = 0; k < N; k += 1) {
+        // Explicitly load the values first to help with scheduling
+        int32_t b0 = B[k * P + j + 0];
+        int32_t b1 = B[k * P + j + 1];
+        int32_t b2 = B[k * P + j + 2];
+        int32_t b3 = B[k * P + j + 3];
+        // A could be local with scrambling
+        int32_t a0 = A[(i + 0) * N + k];
+        int32_t a1 = A[(i + 1) * N + k];
+        int32_t a2 = A[(i + 2) * N + k];
+        int32_t a3 = A[(i + 3) * N + k];
+        // Compute
+        c00 += a0 * b0;
+        c01 += a0 * b1;
+        c02 += a0 * b2;
+        c03 += a0 * b3;
+        c10 += a1 * b0;
+        c11 += a1 * b1;
+        c12 += a1 * b2;
+        c13 += a1 * b3;
+        c20 += a2 * b0;
+        c21 += a2 * b1;
+        c22 += a2 * b2;
+        c23 += a2 * b3;
+        c30 += a3 * b0;
+        c31 += a3 * b1;
+        c32 += a3 * b2;
+        c33 += a3 * b3;
+      }
+      // Store
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+      C[(i + 2) * P + j + 0] = c20;
+      C[(i + 2) * P + j + 1] = c21;
+      C[(i + 2) * P + j + 2] = c22;
+      C[(i + 2) * P + j + 3] = c23;
+      C[(i + 3) * P + j + 0] = c30;
+      C[(i + 3) * P + j + 1] = c31;
+      C[(i + 3) * P + j + 2] = c32;
+      C[(i + 3) * P + j + 3] = c33;
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_parallel(int32_t const *__restrict__ A,
+                                   int32_t const *__restrict__ B,
+                                   int32_t *__restrict__ C, uint32_t M,
+                                   uint32_t N, uint32_t P, uint32_t id,
+                                   uint32_t numThreads) {
+  // Parallelize over 4x4 output tiles
+  // How many column windows to split the matrix into
+  uint32_t const c = numThreads / (M / 4);
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 4) {
+      // Initialize 4x4 output tile
+      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
+      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
+      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
+      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
+      for (uint32_t k = 0; k < N; k += 1) {
+        // Explicitly load the values first to help with scheduling
+        int32_t b0 = B[k * P + j + 0];
+        int32_t b1 = B[k * P + j + 1];
+        int32_t b2 = B[k * P + j + 2];
+        int32_t b3 = B[k * P + j + 3];
+        // A could be local with scrambling
+        int32_t a0 = A[(i + 0) * N + k];
+        int32_t a1 = A[(i + 1) * N + k];
+        int32_t a2 = A[(i + 2) * N + k];
+        int32_t a3 = A[(i + 3) * N + k];
+        // Compute
+        c00 += a0 * b0;
+        c01 += a0 * b1;
+        c02 += a0 * b2;
+        c03 += a0 * b3;
+        c10 += a1 * b0;
+        c11 += a1 * b1;
+        c12 += a1 * b2;
+        c13 += a1 * b3;
+        c20 += a2 * b0;
+        c21 += a2 * b1;
+        c22 += a2 * b2;
+        c23 += a2 * b3;
+        c30 += a3 * b0;
+        c31 += a3 * b1;
+        c32 += a3 * b2;
+        c33 += a3 * b3;
+      }
+      // Store
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+      C[(i + 2) * P + j + 0] = c20;
+      C[(i + 2) * P + j + 1] = c21;
+      C[(i + 2) * P + j + 2] = c22;
+      C[(i + 2) * P + j + 3] = c23;
+      C[(i + 3) * P + j + 0] = c30;
+      C[(i + 3) * P + j + 1] = c31;
+      C[(i + 3) * P + j + 2] = c32;
+      C[(i + 3) * P + j + 3] = c33;
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_conflict_opt_parallel(int32_t const *__restrict__ A,
+                                                int32_t const *__restrict__ B,
+                                                int32_t *__restrict__ C,
+                                                uint32_t M, uint32_t N,
+                                                uint32_t P, uint32_t id,
+                                                uint32_t numThreads) {
+
+  /////////////////////////////
+  //      Configuration      //
+  /////////////////////////////
+  // Parallelize over 4x4 output tiles
+  // How many cores per window
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+
+  // To avoid group conflicts within a tile, each core of the same tile
+  // should access a different group
+  uint32_t group_bank_nums = 512; // MemPool = 256
+  uint32_t tile_core_nums = 8;    // MemPool = 4
+  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
+  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
+  // Window size limit: the minimum jump for matrix A is 4 lines
+  if (jump_lines_A < 4) {
+    jump_lines_A = 4;
+  }
+
+  /////////////////////////////
+  //       LOOP OFFSET       //
+  /////////////////////////////
+  // Outer loop control, for group access port conflicts
+  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
+  // Inner loop incremental control, for group access port conflicts
+  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
+  // Inner loop control
+  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
+  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
+  // Middle loop control, window jump for avoiding bank conflicts
+  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
+  uint32_t j_offset = (2 * (id / c)) / conflict_row;
+
+  /////////////////////////////
+  //       LOOP CONTROL      //
+  /////////////////////////////
+  // Inner round-robin
+  if (k_offset >= N) {
+    k_offset = k_offset - N * (k_offset / N);
+  }
+  // Middle round-robin
+  uint32_t window_in_P = (P / c) / 4;
+  if (j_offset >= window_in_P) {
+    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
+  }
+  // Outer loop control
+  uint32_t outer_loop_counter = 0;
+  uint32_t outer_loop_time = M / (4 * numThreads);
+  if (outer_loop_time < 1) {
+    outer_loop_time = 1;
+  }
+  uint32_t M_partition = M / outer_loop_time;
+
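+  // Worked example, assuming the TeraPool parameters above and the app
+  // defaults (M = P = 128, N = 64, numThreads = 1024): c = 1024 / 32 = 32,
+  // jump_lines_A = 512 / 64 = 8, jump_lines_B = 512 / 128 = 4. Core id = 9
+  // (window position 9, tile position 1) gets i_offset = 8 * 1 = 8,
+  // k_offset = 9 + 0 + 4 * 1 = 13, and j_offset = 0 (conflict_row = 32), so
+  // it starts at row i = 0 + 8 and k = 13, staggered away from its tile
+  // neighbours.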
+  /////////////////////////////
+  //       *LOOP START*      //
+  /////////////////////////////
+  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
+       i_ori += 4 * (numThreads / c)) {
+    outer_loop_counter += 1;
+    uint32_t i = i_ori + i_offset;
+    // Round-robin control: if the offset goes past the partition, wrap back
+    // to the first window
+    if (i >= M_partition * outer_loop_counter) {
+      i = i - M_partition * (i / (M_partition * outer_loop_counter));
+    }
+    // Backup counter for mid-loop
+    uint32_t j_offset_counter = c_start + j_offset * 4;
+    uint32_t P_counter = c_end;
+
+  Mid_loop:
+    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
+      // Initialize 4x4 output tile
+      int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0;
+      int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0;
+      int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0;
+      int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0;
+
+      // Backup the variables for restore and later use
+      uint32_t k_offset_counter = k_offset;
+      uint32_t N_counter = N;
+
+    Inner_Loop:
+      for (uint32_t k = k_offset_counter; k < N_counter; k += 1) {
+        // Explicitly load the values first to help with scheduling
+        int32_t b0 = B[k * P + j + 0];
+        int32_t b1 = B[k * P + j + 1];
+        int32_t b2 = B[k * P + j + 2];
+        int32_t b3 = B[k * P + j + 3];
+        // A could be local with scrambling
+        int32_t a0 = A[(i + 0) * N + k];
+        int32_t a1 = A[(i + 1) * N + k];
+        int32_t a2 = A[(i + 2) * N + k];
+        int32_t a3 = A[(i + 3) * N + k];
+        // Compute
+        c00 += a0 * b0;
+        c01 += a0 * b1;
+        c02 += a0 * b2;
+        c03 += a0 * b3;
+        c10 += a1 * b0;
+        c11 += a1 * b1;
+        c12 += a1 * b2;
+        c13 += a1 * b3;
+        c20 += a2 * b0;
+        c21 += a2 * b1;
+        c22 += a2 * b2;
+        c23 += a2 * b3;
+        c30 += a3 * b0;
+        c31 += a3 * b1;
+        c32 += a3 * b2;
+        c33 += a3 * b3;
+      }
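+
+      // The pass above covered k = k_offset .. N-1; the pseudo-jump below
+      // re-enters the same loop once for k = 0 .. k_offset-1, completing the
+      // dot product. E.g., with k_offset = 13 and N = 64, the first pass
+      // runs k = 13..63 and the second pass k = 0..12.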
+      // Pseudo-jump to avoid compiling the inner loop twice; duplicating it
+      // would cause scheduling issues due to register-file pressure.
+      if (k_offset_counter > 0) {
+        N_counter = k_offset;
+        k_offset_counter = 0;
+        goto Inner_Loop;
+      }
+
+      // Store
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 0) * P + j + 2] = c02;
+      C[(i + 0) * P + j + 3] = c03;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+      C[(i + 1) * P + j + 2] = c12;
+      C[(i + 1) * P + j + 3] = c13;
+      C[(i + 2) * P + j + 0] = c20;
+      C[(i + 2) * P + j + 1] = c21;
+      C[(i + 2) * P + j + 2] = c22;
+      C[(i + 2) * P + j + 3] = c23;
+      C[(i + 3) * P + j + 0] = c30;
+      C[(i + 3) * P + j + 1] = c31;
+      C[(i + 3) * P + j + 2] = c32;
+      C[(i + 3) * P + j + 3] = c33;
+    }
+
+    if (j_offset_counter != c_start) {
+      P_counter = j_offset_counter;
+      j_offset_counter = c_start;
+      goto Mid_loop;
+    }
+  }
+}
+
+/*******************************/
+/* ASM CODE KERNEL START BELOW */
+/*******************************/
+
+// Define immediate values used in the asm code below.
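+// Note: these macros are built from the application-level matrix_M/N/P
+// defines rather than the kernels' run-time M/N/P arguments, since operands
+// such as [N] are passed through "I" (immediate) asm constraints and must be
+// compile-time constants. This couples the kernels to the app's dimensions.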
+#define N3 ((matrix_N - 3) * 4)
+#define N31 ((-3 * matrix_N + 1) * 4)
+#define P3 ((matrix_P - 3) * 4)
+#define P31 ((-3 * matrix_P + 1) * 4)
+
+void mat_mul_unrolled_4x4_parallel_asm(int32_t const *__restrict__ A,
+                                       int32_t const *__restrict__ B,
+                                       int32_t *__restrict__ C, uint32_t M,
+                                       uint32_t N, uint32_t P, uint32_t id,
+                                       uint32_t numThreads) {
+  // Parallelize by assigning each tile one row
+  // How many column windows to split the matrix into
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 4) {
+      // Address registers
+      int32_t const *addr_a = &A[i * N];
+      int32_t const *addr_b = &B[j];
+      int32_t const *end_b = &B[N * P + j];
+      int32_t const *addr_c = &C[i * P + j];
+      int32_t const N3_1_r = (-3 * (int32_t)N + 1) * 4;
+      int32_t const P_3_r = ((int32_t)P - 3) * 4;
+
+      register int32_t k asm("x1") = (int32_t)end_b;
+      // Register allocation:
+      //        x12  x13  x14  x15   <- row of B
+      //
+      //  x3    x16  x17  x18  x19
+      //  x4    x20  x21  x22  x23   <- column of A feeds the
+      //  x10   x24  x25  x26  x27      4x4 accumulator tile
+      //  x11   x28  x29  x30  x31
+      //
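+      // Structure of the asm below (a software pipeline): a prologue
+      // preloads the first column of A and row of B and issues the first 16
+      // multiplies; the steady-state loop executes 16 p.mac and 8 p.lw per
+      // k iteration; the epilogue performs the last 16 MACs interleaved with
+      // the 16 stores of the 4x4 tile.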
+      __asm__ volatile(
+          ".balign 16 \n\t"
+          // Outer loop: Initialize and preload. Execute this loop P times
+          // TODO arrange
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          // Initial computation + prefetching
+          "mul x16, x3, x12 \n\t"
+          "mul x17, x3, x13 \n\t"
+          "mul x18, x3, x14 \n\t"
+          "mul x19, x3, x15 \n\t"
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "mul x20, x4, x12 \n\t"
+          "mul x21, x4, x13 \n\t"
+          "mul x22, x4, x14 \n\t"
+          "mul x23, x4, x15 \n\t"
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "mul x24, x10, x12 \n\t"
+          "mul x25, x10, x13 \n\t"
+          "mul x26, x10, x14 \n\t"
+          "mul x27, x10, x15 \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "mul x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "mul x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "mul x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "mul x31, x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          // Inner loop: Do this loop N times
+          "1: \n\t"
+          "p.mac x16, x3, x12 \n\t"
+          "p.mac x17, x3, x13 \n\t"
+          "p.mac x20, x4, x12 \n\t"
+          "p.mac x21, x4, x13 \n\t"
+          "p.mac x18, x3, x14 \n\t"
+          "p.mac x22, x4, x14 \n\t"
+          "p.mac x19, x3, x15 \n\t"
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "p.mac x23, x4, x15 \n\t"
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.mac x31, x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          "bne %[addr_b], x1, 1b \n\t"
+          // Loop done store
+          "p.mac x16, x3, x12 \n\t"
+          "p.mac x17, x3, x13 \n\t"
+          "p.mac x18, x3, x14 \n\t"
+          "p.sw x16, 4(%[addr_c]!) \n\t"
+          "p.mac x19, x3, x15 \n\t"
+          "p.sw x17, 4(%[addr_c]!) \n\t"
+          "p.mac x20, x4, x12 \n\t"
+          "p.sw x18, 4(%[addr_c]!) \n\t"
+          "p.mac x21, x4, x13 \n\t"
+          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x22, x4, x14 \n\t"
+          "p.sw x20, 4(%[addr_c]!) \n\t"
+          "p.mac x23, x4, x15 \n\t"
+          "p.sw x21, 4(%[addr_c]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.sw x22, 4(%[addr_c]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.sw x24, 4(%[addr_c]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.sw x25, 4(%[addr_c]!) \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.sw x26, 4(%[addr_c]!) \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.sw x28, 4(%[addr_c]!) \n\t"
+          "p.mac x31, x11, x15 \n\t"
+          "p.sw x29, 4(%[addr_c]!) \n\t"
+          "p.sw x30, 4(%[addr_c]!) \n\t"
+          "p.sw x31, %[P_3](%[addr_c]!) \n\t"
+          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
+            [addr_c] "+&r"(addr_c) // Outputs
+          : [N3_1] "r"(N3_1_r), [P_3] "r"(P_3_r), [x1] "r"(k),
+            [N] "I"(matrix_N * 4) // Inputs
+          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+            "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25",
+            "x26", "x27", "x28", "x29", "x30", "x31", "memory"); // Clobber
+    }
+  }
+}
+
+void mat_mul_unrolled_4x4_conflict_opt_parallel_asm(
+    int32_t const *__restrict__ A, int32_t const *__restrict__ B,
+    int32_t *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, uint32_t id,
+    uint32_t numThreads) {
+
+  /////////////////////////////
+  //      Configuration      //
+  /////////////////////////////
+  // Parallelize over 4x4 output tiles
+  // How many cores per window
+  uint32_t c = numThreads / (M / 4);
+  if (numThreads * 4 < M) {
+    c = 1;
+  }
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+
+  // To avoid group conflicts within a tile, each core of the same tile
+  // should access a different group
+  uint32_t group_bank_nums = 512; // MemPool = 256
+  uint32_t tile_core_nums = 8;    // MemPool = 4
+  uint32_t jump_lines_A = group_bank_nums / N; // Used for i control
+  uint32_t jump_lines_B = group_bank_nums / P; // Used for k control
+  // Window size limit: the minimum jump for matrix A is 4 lines
+  if (jump_lines_A < 4) {
+    jump_lines_A = 4;
+  }
+
+  /////////////////////////////
+  //       LOOP OFFSET       //
+  /////////////////////////////
+  // Outer loop control, for group access port conflicts
+  uint32_t i_offset = jump_lines_A * (id % tile_core_nums);
+  // Inner loop incremental control, for group access port conflicts
+  uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums);
+  // Inner loop control
+  // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB)
+  uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr;
+  // Middle loop control, window jump for avoiding bank conflicts
+  uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P;
+  uint32_t j_offset = (2 * (id / c)) / conflict_row;
+
+  /////////////////////////////
+  //       LOOP CONTROL      //
+  /////////////////////////////
+  // Inner round-robin
+  if (k_offset >= N) {
+    k_offset = k_offset - N * (k_offset / N);
+  }
+  // Middle round-robin
+  uint32_t window_in_P = (P / c) / 4;
+  if (j_offset >= window_in_P) {
+    j_offset = j_offset - window_in_P * (j_offset / window_in_P);
+  }
+  // Outer loop control
+  uint32_t outer_loop_counter = 0;
+  uint32_t outer_loop_time = M / (4 * numThreads);
+  if (outer_loop_time < 1) {
+    outer_loop_time = 1;
+  }
+  uint32_t M_partition = M / outer_loop_time;
+
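+  // The configuration, offsets, and loop control above are identical to
+  // mat_mul_unrolled_4x4_conflict_opt_parallel; see the worked example
+  // there.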
+  /////////////////////////////
+  //       *LOOP START*      //
+  /////////////////////////////
+  for (uint32_t i_ori = 4 * (id / c); i_ori < M;
+       i_ori += 4 * (numThreads / c)) {
+    outer_loop_counter += 1;
+    uint32_t i = i_ori + i_offset;
+    // Round-robin control: if the offset goes past the partition, wrap back
+    // to the first window
+    if (i >= M_partition * outer_loop_counter) {
+      i = i - M_partition * (i / (M_partition * outer_loop_counter));
+    }
+    // Backup counter for mid-loop
+    uint32_t j_offset_counter = c_start + j_offset * 4;
+    uint32_t P_counter = c_end;
+
+  Mid_loop:
+    for (uint32_t j = j_offset_counter; j < P_counter; j += 4) {
+      // Address registers
+      int32_t const *addr_a_ori = &A[i * N];
+      int32_t const *addr_b_ori = &B[j];
+      int32_t const *addr_a = &A[i * N + k_offset];
+      int32_t const *addr_b = &B[k_offset * P + j];
+      int32_t const *end_b = &B[N * P + j];
+      int32_t const *addr_c = &C[i * P + j];
+      register int32_t k asm("x1") = (int32_t)end_b;
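+
+      // The asm below mirrors the C version's two-pass k traversal: it runs
+      // from k = k_offset upwards, and when addr_b reaches end_b (kept in
+      // x1) it wraps addr_a/addr_b back to the row and column starts saved
+      // on the stack, continuing until the wrapped pass reaches the original
+      // starting point.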
+      __asm__ volatile(
+          ".balign 16 \n\t"
+          // Outer loop: Initialize and preload. Execute this loop P times
+          // TODO arrange
+          "add sp, sp, -16 \n\t" // Reserve an aligned frame for the three
+                                 // spill slots at 0(sp), 4(sp), and 8(sp)
+          "sw %[addr_b], 0(sp) \n\t"
+          "sw %[addr_a_ori], 4(sp) \n\t"
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+
+          // If the endpoint is reached, wrap the addresses
+          "bne %[addr_b], x1, init_comp \n\t"
+          "lw x1, 0(sp) \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+
+          // Initial computation + prefetching
+          "init_comp: \n\t"
+          "mul x16, x3, x12 \n\t"
+          "mul x17, x3, x13 \n\t"
+          "mul x18, x3, x14 \n\t"
+          "mul x19, x3, x15 \n\t"
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "mul x20, x4, x12 \n\t"
+          "mul x21, x4, x13 \n\t"
+          "mul x22, x4, x14 \n\t"
+          "mul x23, x4, x15 \n\t"
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "mul x24, x10, x12 \n\t"
+          "mul x25, x10, x13 \n\t"
+          "mul x26, x10, x14 \n\t"
+          "mul x27, x10, x15 \n\t"
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "mul x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "mul x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "mul x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "mul %[addr_a_ori], x11, x15 \n\t" // Use addr_a_ori instead of x31
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+
+          // If the endpoint is reached, wrap the addresses
+          "bne %[addr_b], x1, inner_loop \n\t"
+          "sw %[addr_a_ori], 8(sp) \n\t" // Back up x31 (held in addr_a_ori)
+          "lw %[addr_a_ori], 4(sp) \n\t" // Load back addr_a_ori
+          "lw x1, 0(sp) \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+          "lw %[addr_a_ori], 8(sp) \n\t" // Load back x31
+
+          // Inner loop: Do this loop N times
+          "inner_loop: \n\t"
+          "1: \n\t"
+          "p.mac x16, x3, x12 \n\t"
+          "p.mac x17, x3, x13 \n\t"
+          "p.mac x20, x4, x12 \n\t"
+          "p.mac x21, x4, x13 \n\t"
+          "p.mac x18, x3, x14 \n\t"
+          "p.mac x22, x4, x14 \n\t"
+          "p.mac x19, x3, x15 \n\t"
+          "p.lw x3, %[N](%[addr_a]!) \n\t"
+          "p.mac x23, x4, x15 \n\t"
+          "p.lw x4, %[N](%[addr_a]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.lw x12, 4(%[addr_b]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.lw x13, 4(%[addr_b]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.lw x14, 4(%[addr_b]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.mac %[addr_a_ori], x11, x15 \n\t"
+          "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3
+          "p.lw x10, %[N](%[addr_a]!) \n\t"
+          "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1
+          "bne %[addr_b], x1, 1b \n\t"
+
+          // Case 1: the loop is done if k_offset = 0
+          // Case 2: the loop is done when reaching here for the second time
+          // Case 3: otherwise the endpoint is reached; wrap the addresses
+          "lw %[addr_b], 0(sp) \n\t"
+          "beq %[addr_b_ori], %[addr_b], store \n\t"
+          "sw %[addr_a_ori], 8(sp) \n\t" // Back up x31
+          "lw %[addr_a_ori], 4(sp) \n\t" // Load back addr_a_ori
+          "addi x1, %[addr_b], 0 \n\t"
+          "addi %[addr_a], %[addr_a_ori], 0 \n\t"
+          "addi %[addr_b], %[addr_b_ori], 0 \n\t"
+          "sw %[addr_b], 0(sp) \n\t"
+          "lw %[addr_a_ori], 8(sp) \n\t" // Load back x31
+          "j 1b \n\t"
+
+          // Loop done: drain the pipeline and store
+          "store: \n\t"
+          "p.mac x16, x3, x12 \n\t"
+          "p.mac x17, x3, x13 \n\t"
+          "p.mac x18, x3, x14 \n\t"
+          "p.sw x16, 4(%[addr_c]!) \n\t"
+          "p.mac x19, x3, x15 \n\t"
+          "p.sw x17, 4(%[addr_c]!) \n\t"
+          "p.mac x20, x4, x12 \n\t"
+          "p.sw x18, 4(%[addr_c]!) \n\t"
+          "p.mac x21, x4, x13 \n\t"
+          "p.sw x19, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x22, x4, x14 \n\t"
+          "p.sw x20, 4(%[addr_c]!) \n\t"
+          "p.mac x23, x4, x15 \n\t"
+          "p.sw x21, 4(%[addr_c]!) \n\t"
+          "p.mac x24, x10, x12 \n\t"
+          "p.sw x22, 4(%[addr_c]!) \n\t"
+          "p.mac x25, x10, x13 \n\t"
+          "p.sw x23, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x26, x10, x14 \n\t"
+          "p.sw x24, 4(%[addr_c]!) \n\t"
+          "p.mac x27, x10, x15 \n\t"
+          "p.sw x25, 4(%[addr_c]!) \n\t"
+          "p.mac x28, x11, x12 \n\t"
+          "p.sw x26, 4(%[addr_c]!) \n\t"
+          "p.mac x29, x11, x13 \n\t"
+          "p.sw x27, %[P_3](%[addr_c]!) \n\t"
+          "p.mac x30, x11, x14 \n\t"
+          "p.sw x28, 4(%[addr_c]!) \n\t"
+          "p.mac %[addr_a_ori], x11, x15 \n\t"
+          "p.sw x29, 4(%[addr_c]!) \n\t"
+          "p.sw x30, 4(%[addr_c]!) \n\t"
+          "p.sw %[addr_a_ori], %[P_3](%[addr_c]!) \n\t"
+          "add sp, sp, 16 \n\t" // Release the spill frame
+          : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b),
+            [addr_c] "+&r"(addr_c), [addr_a_ori] "+&r"(addr_a_ori),
+            [addr_b_ori] "+&r"(addr_b_ori) // Outputs
+          : [N3_1] "r"(N31), [P_3] "I"(P3), [x1] "r"(k),
+            [N] "I"(matrix_N * 4) // Inputs
+          : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+            "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25",
+            "x26", "x27", "x28", "x29", "x30", "memory"); // Clobber
+    }
+    if (j_offset_counter != c_start) {
+      P_counter = j_offset_counter;
+      j_offset_counter = c_start;
+      goto Mid_loop;
+    }
+  }
+}