Skip to content

Commit

Permalink
[software] Change Data Generation
Browse files: browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Dec 6, 2024
1 parent 92c681c commit e10df9f
Show file tree
Hide file tree
Showing 49 changed files with 875 additions and 2,146 deletions.
20 changes: 10 additions & 10 deletions software/apps/baremetal/axpy_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_axpy_f16.h"
#include "baremetal/mempool_checks.h"
Expand All @@ -34,27 +34,27 @@ int main() {
time_init = 0;
time_end = 0;
if (core_id == 0) {
dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t));
dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t));
}
uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF;
uint32_t register volatile a = *(uint32_t *)&(l2_A)&0x0000FFFF;
mempool_barrier(num_cores);

// // SINGLE
// time_init = mempool_get_timer();
// axpy_f16s(A, l1_X, l1_Y, LEN);
// axpy_f16s(A, l1_X, l1_Y, array_N);
// time_end = mempool_get_timer();

// // PARALLEL
// time_init = mempool_get_timer();
// axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
// axpy_f16vecp_unrolled4(A, l1_X, l1_Y, array_N, num_cores);
// time_end = mempool_get_timer();

// PARALLEL, LOCAL ACCESSES
time_init = mempool_get_timer();
// axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
// axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
mempool_start_benchmark();
axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
mempool_stop_benchmark();
time_end = mempool_get_timer();

Expand All @@ -64,7 +64,7 @@ int main() {
uint32_t clock_cycles = (time_end - time_init);
printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
}
mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
mempool_check_f16(l1_Y, l2_Z, 100, 0.1f, 0);
mempool_barrier(num_cores);

return 0;
Expand Down
18 changes: 9 additions & 9 deletions software/apps/baremetal/axpy_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_axpy_f32.h"
#include "baremetal/mempool_checks.h"
Expand All @@ -34,25 +34,25 @@ int main() {
time_init = 0;
time_end = 0;
if (core_id == 0) {
dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
}
float register volatile a = A;
float register volatile a = l2_A;
mempool_barrier(num_cores);

// PARALLEL
time_init = mempool_get_timer();
// axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
// axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
// axpy_f32p(a, l1_X, l1_Y, array_N, num_cores);
// axpy_f32p_unrolled4(a, l1_X, l1_Y, array_N, num_cores);
axpy_f32p_local_unrolled4(a, l1_X, l1_Y, array_N);
time_end = mempool_get_timer();

// Check results
if (core_id == 0) {
uint32_t clock_cycles = (time_end - time_init);
printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
}
mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
mempool_check_f32(l1_Y, l2_Z, 100, 0.1f, 0);
mempool_barrier(num_cores);

return 0;
Expand Down
5 changes: 3 additions & 2 deletions software/apps/baremetal/axpy_i32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "runtime.h"
#include "synchronization.h"

#include "baremetal/mempool_axpy_i32p.h"
#include "baremetal/mempool_axpy_i32.h"
#include "baremetal/mempool_checks.h"
#include "data_axpy_i32.h"

Expand All @@ -38,11 +38,12 @@ int main() {
dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
error = 0;
}
register volatile int32_t a = l2_A;
mempool_barrier(num_cores);

// Benchmark
mempool_start_benchmark();
calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores);
calc_axpy_unloop_x4_localbank(l1_X, l1_Y, a, array_N, core_id, num_cores);
mempool_barrier(num_cores);
mempool_stop_benchmark();

Expand Down
32 changes: 16 additions & 16 deletions software/apps/baremetal/cfft_radix4_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,26 @@

/* CFFT data libraries */
#include "data_cfft_radix4_f16.h"
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))

/* CHOOSE ONE */
//#define PARALLEL // Parallel FFT not "memory-aware".
//#define FOLDED // Parallel FFT with "memory-aware" load/store.
#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
#define PARALLEL // Parallel FFT not "memory-aware".
// #define FOLDED // Parallel FFT with "memory-aware" load/store.
//#define SCHEDULED // Folded FFTs arranged in rows and cols.

// Bitreversal index from table.
#define BITREVERSETABLE
// Also the twiddles have "memory-aware" load/stores.
// #define FOLDED_TWIDDLES

// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 1
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 1
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#endif
// Also the twiddles have "memory-aware" load/stores.
#define FOLDED_TWIDDLES

#include "baremetal/mempool_cfft_q16_bitreversal.h"
#include "baremetal/mempool_checks.h"
Expand All @@ -47,9 +50,9 @@ __fp16 l1_pSrc[2 * N_CSAMPLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_pDst[2 * N_CSAMPLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
__fp16 l1_twiddleCoef_f16_src[2 * N_TWIDDLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
__fp16 l1_twiddleCoef_f16_dst[2 * N_TWIDDLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
Expand Down Expand Up @@ -80,7 +83,7 @@ int main() {
if (core_id == 0) {
dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
3 * (N_CSAMPLES / 4) * sizeof(int32_t));
N_TWIDDLES * sizeof(int32_t));
dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
printf("01: END INITIALIZATION\n");
Expand All @@ -97,6 +100,8 @@ int main() {
l2_pSrc, N_CSAMPLES * sizeof(int32_t));
}
}
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
N_TWIDDLES * sizeof(int32_t));
dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
}
Expand All @@ -114,13 +119,8 @@ int main() {
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
}
}
#else
if (core_id == 0) {
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
3 * (N_CSAMPLES / 4) * sizeof(int32_t));
}
#endif
mempool_barrier(num_cores);
#endif

if (core_id == 0) {
printf("01: END INITIALIZATION\n");
Expand All @@ -132,7 +132,7 @@ int main() {

#ifdef PARALLEL
mempool_start_benchmark();
mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src,
num_cores);
mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
l1_BitRevIndexTable, num_cores);
Expand Down Expand Up @@ -176,7 +176,7 @@ int main() {
printf("02: END COMPUTATION\n");
}

mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, (float)TOLERANCE, 0);
mempool_barrier(num_cores);
return 0;
}
19 changes: 10 additions & 9 deletions software/apps/baremetal/cholesky_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@
#include "baremetal/mempool_cholesky_f16s.h"

#define SINGLE
#define FOLDED (0)

__fp16 l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
__fp16 l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
__attribute__((section(".l1_prio")));
__fp16 l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
__fp16 l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
__attribute__((section(".l1_prio")));

int main() {
Expand All @@ -32,9 +33,9 @@ int main() {
/* Initialize matrices */
if (core_id == 0) {
dma_memcpy_blocking(l1_GIn, l2_GIn,
dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
dma_memcpy_blocking(l1_LOut, l2_LOut,
dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
}
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);
Expand All @@ -43,7 +44,7 @@ int main() {
/* Benchmark */
if (core_id == 0) {
mempool_start_benchmark();
mempool_cholesky_f16vecs(l1_GIn, l1_LOut, dim_N);
mempool_cholesky_f16vecs(l1_GIn, l1_LOut, matrix_N, FOLDED);
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand All @@ -52,15 +53,15 @@ int main() {
#ifdef PARALLEL
for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
mempool_start_benchmark();
__fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
__fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, dim_N);
__fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
__fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, matrix_N, FOLDED);
}
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

mempool_check_f16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 0.01f, 0);
mempool_check_f16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 0.01f, 0);
mempool_barrier(num_cores);
return 0;
}
18 changes: 9 additions & 9 deletions software/apps/baremetal/cholesky_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

#define SINGLE

int16_t l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
int16_t l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
__attribute__((section(".l1_prio")));
int16_t l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
int16_t l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
__attribute__((section(".l1_prio")));

int main() {
Expand All @@ -29,9 +29,9 @@ int main() {
/* Initialize matrices */
if (core_id == 0) {
dma_memcpy_blocking(l1_GIn, l2_GIn,
dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
dma_memcpy_blocking(l1_LOut, l2_LOut,
dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
}
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);
Expand All @@ -40,7 +40,7 @@ int main() {
/* Benchmark */
if (core_id == 0) {
mempool_start_benchmark();
mempool_cholesky_q16vecs(l1_GIn, l1_LOut, dim_N);
mempool_cholesky_q16vecs(l1_GIn, l1_LOut, matrix_N);
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand All @@ -49,15 +49,15 @@ int main() {
#ifdef PARALLEL
for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
mempool_start_benchmark();
__fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
__fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, dim_N);
__fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
__fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, matrix_N);
}
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

mempool_check_q16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 16, 0);
mempool_check_i16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 16, 0);
mempool_barrier(num_cores);
return 0;
}
15 changes: 7 additions & 8 deletions software/apps/baremetal/cmatmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
#include "synchronization.h"

#include "data_cmatmul_f16.h"
#define dim_M (matrix_M)
#define dim_N (matrix_N)
#define dim_P (matrix_P)

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cmatmul_f16.h"
#define PARALLEL_2x4
#define TEST
#define PARALLEL_4x4

#if defined(PARALLEL_4x4_COPIES_A)
__fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
Expand All @@ -43,8 +45,8 @@ int main() {

// Initialize Matrices
if (core_id == 0) {
dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t));
dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t));
dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t));
dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t));
}
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);
Expand Down Expand Up @@ -104,10 +106,7 @@ int main() {
mempool_stop_benchmark();
#endif

#if defined(TEST)
mempool_check_f16(matrix_c, C, 2 * dim_M * dim_P, 0.1f, 0);
mempool_check_f16(matrix_c, l2_C, 10, 0.1f, 0);
mempool_barrier(num_cores);
#endif

return 0;
}
Loading

0 comments on commit e10df9f

Please sign in to comment.