Skip to content

Commit

Permalink
[software] Fix floating point testing applications
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Dec 12, 2023
1 parent 2d1bb31 commit 206c1bf
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 224 deletions.
108 changes: 0 additions & 108 deletions software/apps/fp_test/main.c

This file was deleted.

80 changes: 20 additions & 60 deletions software/apps/matmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,98 +7,58 @@
#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "runtime.h"
#include "synchronization.h"

#include "data/data_matmul_f16.h"
#include "kernel/matmul_f16.h"
#include "kernel/mempool_checks.h"

// Selects the multi-core code path: main() tests this macro with
// `#if defined(PARALLEL)`.
#define PARALLEL

// Operand (matrix_a, matrix_b) and result (matrix_c) buffers of __fp16
// elements, sized from the matrix_M / matrix_N / matrix_P constants.
// NOTE(review): two attribute lines follow the matrix_a and matrix_b
// declarators (one using section ".l1", one using ".l1_prio"). This looks like
// the removed and the added variant of a diff shown together - confirm which
// one belongs in the real file.
__fp16 matrix_a[matrix_M * matrix_N]
__attribute__((aligned((matrix_M * matrix_N) / 2), section(".l1")));
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 matrix_b[matrix_N * matrix_P]
__attribute__((aligned((matrix_N * matrix_P) / 2), section(".l1")));
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 matrix_c[matrix_M * matrix_P]
__attribute__((aligned((matrix_M * matrix_P) / 2), section(".l1")));

// Status value placed in section ".l1"; main() writes 0 to it from core 0 and
// returns it.
int volatile error __attribute__((section(".l1")));

/**
 * Copy this core's share of a half-precision matrix from `input` to `matrix`.
 *
 * The num_rows * num_columns elements are dealt out round-robin: the calling
 * core handles element core_id, then every num_cores-th element after it.
 *
 * @param matrix      Destination buffer.
 * @param input       Source buffer.
 * @param num_rows    Number of matrix rows.
 * @param num_columns Number of matrix columns.
 * @param core_id     Index of the calling core (first element it copies).
 * @param num_cores   Stride between the elements copied by one core.
 */
void init_matrix(__fp16 *matrix, __fp16 *input, uint32_t num_rows,
                 uint32_t num_columns, uint32_t core_id, uint32_t num_cores) {
  const uint32_t num_elements = num_columns * num_rows;
  uint32_t idx = core_id;
  while (idx < num_elements) {
    matrix[idx] = input[idx];
    idx += num_cores;
  }
}

/**
 * Return the raw 16-bit pattern of a half-precision value for hex printing.
 * memcpy reads exactly the two bytes of the object, unlike a cast to a wider
 * integer pointer.
 */
static inline uint16_t fp16_bits(const __fp16 *value) {
  uint16_t bits;
  memcpy(&bits, value, sizeof bits);
  return bits;
}

/**
 * Compare a half-precision result matrix against its expected values.
 *
 * Core 0 walks all M * P elements, computes `Res[i] - Exp[i]` in half
 * precision, widens the difference to single precision and reports every
 * element whose difference lies outside +/-0.05 (the tolerance is rounded
 * through __fp16 first). All calling cores then meet at a barrier.
 *
 * @param Res       Computed matrix, M * P elements.
 * @param Exp       Expected matrix, M * P elements.
 * @param M         Number of rows.
 * @param P         Number of columns.
 * @param core_id   Index of the calling core; only core 0 does the comparison.
 * @param num_cores Number of cores passed to the barrier.
 * @return On core 0 the number of out-of-tolerance elements (0 means the
 *         matrices match); always 0 on every other core.
 */
int verify_result(__fp16 *__restrict__ Res, __fp16 *__restrict__ Exp,
                  uint32_t M, uint32_t P, uint32_t core_id,
                  uint32_t num_cores) {
  int mismatches = 0;

  if (core_id == 0) {
    const float tol = (__fp16)0.05f;
    for (uint32_t i = 0; i < M * P; i++) {
      __fp16 exp = Exp[i];
      __fp16 res = Res[i];
      // Both asm outputs are read-write operands, so give them defined values.
      __fp16 dif = (__fp16)0.0f;
      float dif_f32 = 0.0f;
      // fcvt.s.h widens half -> single. fcvt.h.s is the opposite (narrowing)
      // direction and would misinterpret the half-precision difference.
      asm volatile("fsub.h %[dif], %[res], %[exp];"
                   "fcvt.s.h %[dif_f32], %[dif];"
                   : [dif] "+&r"(dif), [dif_f32] "+&r"(dif_f32)
                   : [res] "r"(res), [exp] "r"(exp)
                   :);

      if ((dif_f32 > tol) || (dif_f32 < (-tol))) {
        printf("ERROR(%u): %x - %x - %x\n", (unsigned)i,
               (unsigned)fp16_bits(&dif), (unsigned)fp16_bits(&exp),
               (unsigned)fp16_bits(&res));
        mismatches++;
      }
    }
  }

  // Every caller joins the barrier. It used to sit inside the core-0 branch,
  // where only one of the num_cores participants could ever arrive.
  // NOTE(review): assumes mempool_barrier(num_cores) blocks until num_cores
  // cores have called it - confirm against the runtime.
  mempool_barrier(num_cores);
  return mismatches;
}
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));

// Test entry point for the half-precision matrix multiplication.
// NOTE(review): this listing comes from a rendered commit diff whose +/-
// markers were lost, so removed and added lines of main() appear interleaved
// (e.g. `#elif defined(SINGLE)` directly followed by `#if defined(SINGLE)`,
// and two `return` statements at the end). The comments below describe
// individual statements only; reconstruct the real file before building.
int main() {
// Identify the calling core and the number of cores taking part.
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
// Initialize barrier and synchronize
mempool_barrier_init(core_id);

// Initialize Matrices 1
if (core_id == 0) {
error = 0;
// Core 0 copies A and B into matrix_a / matrix_b; the byte count is the
// element count times sizeof(int16_t).
dma_memcpy_blocking(matrix_a, A, (matrix_M * matrix_N) * sizeof(int16_t));
dma_memcpy_blocking(matrix_b, B, (matrix_N * matrix_P) * sizeof(int16_t));
}
// Initialize Matrices
// Every core copies its round-robin share of A and B.
init_matrix(matrix_a, A, matrix_M, matrix_N, core_id, num_cores);
init_matrix(matrix_b, B, matrix_N, matrix_P, core_id, num_cores);
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);

#if defined(PARALLEL)
// Execute function to test.
mempool_start_benchmark();
// matmul_2x2_parallel_f16(matrix_a, matrix_b, matrix_c, matrix_M,
// matrix_N, matrix_P, core_id, num_cores);
matmul_4x2_parallel_f16vec(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P, core_id, num_cores);
// dump_id(core_id);
mempool_stop_benchmark();
// Wait at barrier before checking
mempool_barrier(num_cores);
#elif defined(SINGLE)
#if defined(SINGLE)
if (core_id == 0) {
// Execute function to test.
mempool_start_benchmark();
matmul_2x2_single_f16(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P);
// NOTE(review): this barrier is inside the core_id == 0 branch, so only core
// 0 reaches it. Confirm mempool_barrier(num_cores) does not wait for the
// remaining cores here, otherwise core 0 never leaves this call.
mempool_barrier(num_cores);
mempool_stop_benchmark();
}
// Wait at barrier before checking
#endif

#if defined(PARALLEL)
// Execute function to test.
mempool_start_benchmark();
matmul_4x2_parallel_f16vec(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P, core_id, num_cores);
// The barrier is placed before mempool_stop_benchmark(), so the wait is part
// of the benchmarked region.
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

// Compare matrix_c with the expected matrix C.
verify_result(matrix_c, C, matrix_M, matrix_P, core_id, num_cores);
// Arguments: result, expected values, element count, 0.1f and 0.
// NOTE(review): 0.1f is presumably a tolerance and 0 a verbosity flag -
// confirm in kernel/mempool_checks.h.
mempool_check_f16(matrix_c, C, matrix_M * matrix_P, 0.1f, 0);
mempool_barrier(num_cores);
return error;
return 0;
}
73 changes: 20 additions & 53 deletions software/apps/matmul_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,89 +7,56 @@
#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "runtime.h"
#include "synchronization.h"

#include "data/data_matmul_f32.h"
#include "kernel/matmul_f32.h"
#include "kernel/mempool_checks.h"

// Selects the multi-core code path: main() tests this macro with
// `#if defined(PARALLEL)`.
#define PARALLEL
// NOTE(review): ASM is not referenced anywhere in the visible part of this
// file; it may be consumed by an included header - confirm.
#define ASM

// Operand (matrix_a, matrix_b) and result (matrix_c) buffers of float
// elements, sized from matrix_M / matrix_N / matrix_P and placed in section
// ".l1".
float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1")));
float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1")));
float matrix_c[matrix_M * matrix_P] __attribute__((section(".l1")));

// Status value placed in section ".l1"; main() writes 0 to it from core 0 and
// returns it.
int volatile error __attribute__((section(".l1")));

/**
 * Copy this core's share of a single-precision matrix from `input` to
 * `matrix`.
 *
 * The num_rows * num_columns elements are dealt out round-robin: the calling
 * core handles element core_id, then every num_cores-th element after it.
 *
 * @param matrix      Destination buffer.
 * @param input       Source buffer.
 * @param num_rows    Number of matrix rows.
 * @param num_columns Number of matrix columns.
 * @param core_id     Index of the calling core (first element it copies).
 * @param num_cores   Stride between the elements copied by one core.
 */
void init_matrix(float *matrix, float *input, uint32_t num_rows,
                 uint32_t num_columns, uint32_t core_id, uint32_t num_cores) {
  const uint32_t num_elements = num_columns * num_rows;
  uint32_t idx = core_id;
  while (idx < num_elements) {
    matrix[idx] = input[idx];
    idx += num_cores;
  }
}

/**
 * Compare a single-precision result matrix against its expected values.
 *
 * Core 0 walks all M * P elements, computes `C[i] - Exp[i]` with fsub.s and
 * reports every element whose difference is not exactly zero, printing the
 * bit pattern of the difference. All calling cores then meet at a barrier.
 *
 * @param C         Computed matrix, M * P elements.
 * @param Exp       Expected matrix, M * P elements.
 * @param M         Number of rows.
 * @param P         Number of columns.
 * @param core_id   Index of the calling core; only core 0 does the comparison.
 * @param num_cores Number of cores passed to the barrier.
 * @return On core 0 the number of mismatching elements (0 means the matrices
 *         match); always 0 on every other core.
 */
int verify_result(float *__restrict__ C, float *__restrict__ Exp, uint32_t M,
                  uint32_t P, uint32_t core_id, uint32_t num_cores) {
  int mismatches = 0;

  if (core_id == 0) {
    for (uint32_t i = 0; i < M * P; i++) {
      // Named `diff` so it no longer shadows the file-scope `error` variable.
      float diff = 0.0f;
      float exp = Exp[i];
      float res = C[i];
      asm volatile("fsub.s %[diff], %[res], %[exp];"
                   : [diff] "+&r"(diff)
                   : [res] "r"(res), [exp] "r"(exp));
      if (diff != 0.0f) {
        // Copy the bits out instead of casting the float's address to an
        // integer pointer.
        uint32_t diff_bits;
        memcpy(&diff_bits, &diff, sizeof diff_bits);
        printf("ERROR!!! OUT[%u] = 0x%8x\n", (unsigned)i,
               (unsigned)diff_bits);
        mismatches++;
      }
    }
  }

  // Every caller joins the barrier. It used to sit inside the core-0 branch,
  // where only one of the num_cores participants could ever arrive.
  // NOTE(review): assumes mempool_barrier(num_cores) blocks until num_cores
  // cores have called it - confirm against the runtime.
  mempool_barrier(num_cores);
  return mismatches;
}
// Operand (matrix_a, matrix_b) and result (matrix_c) buffers of float
// elements, sized from matrix_M / matrix_N / matrix_P and placed in section
// ".l1_prio".
// NOTE(review): the same three names are also declared earlier in this
// listing with section ".l1"; that looks like the removed and the added
// variant of a diff shown together - confirm which one belongs in the file.
float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
float matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));

// Test entry point for the single-precision matrix multiplication.
// NOTE(review): this listing comes from a rendered commit diff whose +/-
// markers were lost, so removed and added lines of main() appear interleaved
// (e.g. `#elif defined(SINGLE)` directly followed by `#if defined(SINGLE)`,
// and `return error;` followed by further statements). The comments below
// describe individual statements only; reconstruct the real file before
// building.
int main() {

// Identify the calling core and the number of cores taking part.
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
// Initialize barrier and synchronize
mempool_barrier_init(core_id);

// Initialize Matrices
if (core_id == 0) {
error = 0;
// Core 0 copies A and B into matrix_a / matrix_b; the byte count is the
// element count times sizeof(int32_t).
dma_memcpy_blocking(matrix_a, A, matrix_M * matrix_N * sizeof(int32_t));
dma_memcpy_blocking(matrix_b, B, matrix_N * matrix_P * sizeof(int32_t));
}

// Initialize Matrices
// Every core copies its round-robin share of A and B.
init_matrix(matrix_a, A, matrix_M, matrix_N, core_id, num_cores);
init_matrix(matrix_b, B, matrix_N, matrix_P, core_id, num_cores);
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);

#if defined(PARALLEL)
// Execute function to test.
mempool_start_benchmark();
// matmul_2x2_parallel_f32(matrix_a, matrix_b, matrix_c, matrix_M,
// matrix_N, matrix_P, core_id, num_cores);
matmul_4x4_parallel_f32(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P, core_id, num_cores);
mempool_stop_benchmark();
// Wait at barrier before checking
mempool_barrier(num_cores);
#elif defined(SINGLE)
#if defined(SINGLE)
// Only core 0 runs and times the single-core kernel.
if (core_id == 0) {
// Execute function to test.
mempool_start_benchmark();
matmul_2x2_single_f32(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P);
mempool_stop_benchmark();
}
// Wait at barrier before checking
mempool_barrier(num_cores);
#endif

// Compare matrix_c with the expected matrix C.
verify_result(matrix_c, C, matrix_M, matrix_P, core_id, num_cores);
#if defined(PARALLEL)
// Execute function to test.
mempool_start_benchmark();
matmul_2x2_parallel_f32(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
matrix_P, core_id, num_cores);
// The barrier is placed before mempool_stop_benchmark(), so the wait is part
// of the benchmarked region.
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

return error;
// Arguments: result, expected values, element count, 0.01f and 0.
// NOTE(review): 0.01f is presumably a tolerance and 0 a verbosity flag -
// confirm in kernel/mempool_checks.h.
mempool_check_f32(matrix_c, C, matrix_M * matrix_P, 0.01f, 0);
mempool_barrier(num_cores);
return 0;
}
2 changes: 1 addition & 1 deletion software/runtime/data/data_matmul_f16.h.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
i = 0
out += '\n'
for a in array:
out += '{}f, '.format(a)
out += '(__fp16){}f, '.format(a)
i += 1
if i % 8 == 0:
out += '\n'
Expand Down
4 changes: 2 additions & 2 deletions software/runtime/kernel/matmul_f16.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,10 +272,10 @@ void matmul_4x2_parallel_f16vec(const __fp16 *__restrict__ pSrcA,
asm volatile(
"pv.extract.h %[TempH], %[bVecTemp0], 1;"
"pv.extract.h %[TempL], %[bVecTemp1], 1;"
"pv.pack %[bVec0], %[TempL], %[TempH];"
"pv.pack.h %[bVec0], %[TempL], %[TempH];"
"pv.extract.h %[TempH], %[bVecTemp0], 0;"
"pv.extract.h %[TempL], %[bVecTemp1], 0;"
"pv.pack %[bVec1], %[TempL], %[TempH];"
"pv.pack.h %[bVec1], %[TempL], %[TempH];"
"vfdotpex.s.h %[sum00], %[aVec0], %[bVec0];"
"vfdotpex.s.h %[sum01], %[aVec0], %[bVec1];"
"vfdotpex.s.h %[sum10], %[aVec1], %[bVec0];"
Expand Down
Loading

0 comments on commit 206c1bf

Please sign in to comment.