diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c index ff13cb879..46119009a 100644 --- a/software/apps/baremetal/axpy_f16/main.c +++ b/software/apps/baremetal/axpy_f16/main.c @@ -18,8 +18,8 @@ #define NUM_BANKS (NUM_CORES * BANKING_FACTOR) // Vectors for kernel computation -__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); #include "baremetal/mempool_axpy_f16.h" #include "baremetal/mempool_checks.h" @@ -34,27 +34,27 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t)); - dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t)); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t)); } - uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF; + uint32_t register volatile a = *(uint32_t *)&(l2_A)&0x0000FFFF; mempool_barrier(num_cores); // // SINGLE // time_init = mempool_get_timer(); - // axpy_f16s(A, l1_X, l1_Y, LEN); + // axpy_f16s(A, l1_X, l1_Y, array_N); // time_end = mempool_get_timer(); // // PARALLEL // time_init = mempool_get_timer(); - // axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores); + // axpy_f16vecp_unrolled4(A, l1_X, l1_Y, array_N, num_cores); // time_end = mempool_get_timer(); // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN); + // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N); mempool_start_benchmark(); - axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN); + axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N); mempool_stop_benchmark(); time_end = mempool_get_timer(); @@ -64,7 +64,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); } - mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0); + mempool_check_f16(l1_Y, l2_Z, 100, 0.1f, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c index 1b1bef859..da34c5fcd 100644 --- a/software/apps/baremetal/axpy_f32/main.c +++ b/software/apps/baremetal/axpy_f32/main.c @@ -18,8 +18,8 @@ #define NUM_BANKS (NUM_CORES * BANKING_FACTOR) // Vectors for kernel computation -float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); #include "baremetal/mempool_axpy_f32.h" #include "baremetal/mempool_checks.h" @@ -34,17 +34,17 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t)); - dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t)); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); } - float register volatile a = A; + float register volatile a = l2_A; mempool_barrier(num_cores); // PARALLEL time_init = mempool_get_timer(); - // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores); - // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores); - axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN); + // axpy_f32p(a, 
l1_X, l1_Y, array_N, num_cores); + // axpy_f32p_unrolled4(a, l1_X, l1_Y, array_N, num_cores); + axpy_f32p_local_unrolled4(a, l1_X, l1_Y, array_N); time_end = mempool_get_timer(); // Check results @@ -52,7 +52,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); } - mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0); + mempool_check_f32(l1_Y, l2_Z, 100, 0.1f, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c index c391ba040..e5590c124 100644 --- a/software/apps/baremetal/axpy_i32/main.c +++ b/software/apps/baremetal/axpy_i32/main.c @@ -16,7 +16,7 @@ #include "runtime.h" #include "synchronization.h" -#include "baremetal/mempool_axpy_i32p.h" +#include "baremetal/mempool_axpy_i32.h" #include "baremetal/mempool_checks.h" #include "data_axpy_i32.h" @@ -38,11 +38,12 @@ int main() { dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); error = 0; } + register volatile int32_t a = l2_A; mempool_barrier(num_cores); // Benchmark mempool_start_benchmark(); - calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores); + calc_axpy_unloop_x4_localbank(l1_X, l1_Y, a, array_N, core_id, num_cores); mempool_barrier(num_cores); mempool_stop_benchmark(); diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c index 30341f46d..b06ae3189 100644 --- a/software/apps/baremetal/cfft_radix4_f16/main.c +++ b/software/apps/baremetal/cfft_radix4_f16/main.c @@ -19,14 +19,19 @@ /* CFFT data libraries */ #include "data_cfft_radix4_f16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) /* CHOOSE ONE */ -//#define PARALLEL // Parallel FFT not "memory-aware". -//#define FOLDED // Parallel FFT with "memory-aware" load/store. -#define SCHEDULED // Folded FFTs arranged in rows and cols.''' +#define PARALLEL // Parallel FFT not "memory-aware". +// #define FOLDED // Parallel FFT with "memory-aware" load/store. +//#define SCHEDULED // Folded FFTs arranged in rows and cols.''' // Bitreversal index from table. #define BITREVERSETABLE +// Also the twiddles have "memory-aware" load/stores. +// #define FOLDED_TWIDDLES + // Independent FFTs scheduled on one row (default 1). #define N_FFTs_ROW 1 // Independent FFTs scheduled on columns (default 1). @@ -34,8 +39,6 @@ #if (N_FFTs_COL > MAX_COL) #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] #endif -// Also the twiddles have "memory-aware" load/stores. 
-#define FOLDED_TWIDDLES #include "baremetal/mempool_cfft_q16_bitreversal.h" #include "baremetal/mempool_checks.h" @@ -47,9 +50,9 @@ __fp16 l1_pSrc[2 * N_CSAMPLES] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); __fp16 l1_pDst[2 * N_CSAMPLES] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4] +__fp16 l1_twiddleCoef_f16_src[2 * N_TWIDDLES] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4] +__fp16 l1_twiddleCoef_f16_dst[2 * N_TWIDDLES] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); @@ -80,7 +83,7 @@ int main() { if (core_id == 0) { dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t)); dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, - 3 * (N_CSAMPLES / 4) * sizeof(int32_t)); + N_TWIDDLES * sizeof(int32_t)); dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); printf("01: END INITIALIZATION\n"); @@ -97,6 +100,8 @@ int main() { l2_pSrc, N_CSAMPLES * sizeof(int32_t)); } } + dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, + N_TWIDDLES * sizeof(int32_t)); dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int32_t)); } @@ -114,13 +119,8 @@ int main() { *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)]; } } -#else - if (core_id == 0) { - dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, - 3 * (N_CSAMPLES / 4) * sizeof(int32_t)); - } -#endif mempool_barrier(num_cores); +#endif if (core_id == 0) { printf("01: END INITIALIZATION\n"); @@ -132,7 +132,7 @@ int main() { #ifdef PARALLEL mempool_start_benchmark(); - mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1, + mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, num_cores); mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable, num_cores); @@ -176,7 +176,7 @@ int main() { printf("02: END COMPUTATION\n"); } - mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0); + mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, (float)TOLERANCE, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c index 908ca99fa..6d1c26ff2 100644 --- a/software/apps/baremetal/cholesky_f16/main.c +++ b/software/apps/baremetal/cholesky_f16/main.c @@ -18,10 +18,11 @@ #include "baremetal/mempool_cholesky_f16s.h" #define SINGLE +#define FOLDED (0) -__fp16 l1_GIn[2 * dim_N * dim_N * N_SAMPLES] +__fp16 l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES] __attribute__((section(".l1_prio"))); -__fp16 l1_LOut[2 * dim_N * dim_N * N_SAMPLES] +__fp16 l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES] __attribute__((section(".l1_prio"))); int main() { @@ -32,9 +33,9 @@ int main() { /* Initialize matrices */ if (core_id == 0) { dma_memcpy_blocking(l1_GIn, l2_GIn, - dim_N * dim_N * N_SAMPLES * sizeof(int32_t)); + matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t)); dma_memcpy_blocking(l1_LOut, l2_LOut, - dim_N * dim_N * N_SAMPLES * sizeof(int32_t)); + matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t)); } // Wait at barrier until everyone is ready mempool_barrier(num_cores); @@ -43,7 +44,7 @@ int main() { /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); - mempool_cholesky_f16vecs(l1_GIn, 
l1_LOut, dim_N); + mempool_cholesky_f16vecs(l1_GIn, l1_LOut, matrix_N, FOLDED); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -52,15 +53,15 @@ int main() { #ifdef PARALLEL for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) { mempool_start_benchmark(); - __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N; - __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N; - mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, dim_N); + __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N; + __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N; + mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, matrix_N, FOLDED); } mempool_barrier(num_cores); mempool_stop_benchmark(); #endif - mempool_check_f16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 0.01f, 0); + mempool_check_f16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 0.01f, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c index 3c382c500..b0168614c 100644 --- a/software/apps/baremetal/cholesky_q16/main.c +++ b/software/apps/baremetal/cholesky_q16/main.c @@ -16,9 +16,9 @@ #define SINGLE -int16_t l1_GIn[2 * dim_N * dim_N * N_SAMPLES] +int16_t l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES] __attribute__((section(".l1_prio"))); -int16_t l1_LOut[2 * dim_N * dim_N * N_SAMPLES] +int16_t l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES] __attribute__((section(".l1_prio"))); int main() { @@ -29,9 +29,9 @@ int main() { /* Initialize matrices */ if (core_id == 0) { dma_memcpy_blocking(l1_GIn, l2_GIn, - dim_N * dim_N * N_SAMPLES * sizeof(int32_t)); + matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t)); dma_memcpy_blocking(l1_LOut, l2_LOut, - dim_N * dim_N * N_SAMPLES * sizeof(int32_t)); + matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t)); } // Wait at barrier until everyone is ready mempool_barrier(num_cores); @@ -40,7 +40,7 @@ int main() { /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); - mempool_cholesky_q16vecs(l1_GIn, l1_LOut, dim_N); + mempool_cholesky_q16vecs(l1_GIn, l1_LOut, matrix_N); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -49,15 +49,15 @@ int main() { #ifdef PARALLEL for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) { mempool_start_benchmark(); - __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N; - __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N; - mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, dim_N); + __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N; + __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N; + mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, matrix_N); } mempool_barrier(num_cores); mempool_stop_benchmark(); #endif - mempool_check_q16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 16, 0); + mempool_check_i16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 16, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c index be80f8c8b..aa2ed55a6 100644 --- a/software/apps/baremetal/cmatmul_f16/main.c +++ b/software/apps/baremetal/cmatmul_f16/main.c @@ -13,11 +13,13 @@ #include "synchronization.h" #include "data_cmatmul_f16.h" +#define dim_M (matrix_M) +#define dim_N (matrix_N) +#define dim_P (matrix_P) #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cmatmul_f16.h" -#define PARALLEL_2x4 -#define TEST +#define PARALLEL_4x4 #if defined(PARALLEL_4x4_COPIES_A) __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)] @@ -43,8 +45,8 @@ int main() { // 
Initialize Matrices if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t)); - dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t)); + dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t)); + dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t)); } // Wait at barrier until everyone is ready mempool_barrier(num_cores); @@ -104,10 +106,7 @@ int main() { mempool_stop_benchmark(); #endif -#if defined(TEST) - mempool_check_f16(matrix_c, C, 2 * dim_M * dim_P, 0.1f, 0); + mempool_check_f16(matrix_c, l2_C, 10, 0.1f, 0); mempool_barrier(num_cores); -#endif - return 0; } diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c index f7a6bd31d..0dcffbfc7 100644 --- a/software/apps/baremetal/cmatmul_q16/main.c +++ b/software/apps/baremetal/cmatmul_q16/main.c @@ -17,6 +17,9 @@ #include "data_cmatmul_q16.h" #define PARALLEL +#define dim_M (matrix_M) +#define dim_N (matrix_N) +#define dim_P (matrix_P) int16_t matrix_a[2 * dim_M * dim_N] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); @@ -33,8 +36,8 @@ int main() { // Initialize Matrices if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t)); - dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t)); + dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t)); + dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t)); } // Wait at barrier until everyone is ready mempool_barrier(num_cores); @@ -42,7 +45,7 @@ int main() { #ifdef SINGLE if (core_id == 0) { mempool_start_benchmark(); - cmatmul_2x4_q16s(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P); + cmatmul_2x2_q16s(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -56,7 +59,7 @@ int main() { mempool_barrier(num_cores); #endif - mempool_check_q16(matrix_c, C, 2 * dim_M * dim_P, 16, 0); + mempool_check_i16(matrix_c, l2_C, 2 * dim_M * dim_P, 16, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c index c579c8151..2091f0336 100644 --- a/software/apps/baremetal/dotp_f16/main.c +++ b/software/apps/baremetal/dotp_f16/main.c @@ -19,8 +19,8 @@ #define BINARY_REDUCTION // Vectors for kernel computation -__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); uint32_t red_barrier[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); __fp16 sum[2 * NUM_BANKS] @@ -38,8 +38,8 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t)); - dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t)); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t)); } for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) { sum[k] = 0; @@ -49,19 +49,19 @@ int main() { // // SINGLE-CORE // time_init = mempool_get_timer(); - // dotp_f16s(l1_A, l1_B, sum, LEN); - // // dotp_f16s_unrolled4(l1_A, l1_B, sum, LEN); + // dotp_f16s(l1_X, l1_Y, sum, array_N); + // // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N); // time_end = mempool_get_timer(); // // PARALLEL // time_init = 
mempool_get_timer(); - // dotp_f16vecp_unrolled4(l1_A, l1_B, sum, LEN, num_cores); - // // dotp_f16p(l1_A, l1_B, sum, LEN, num_cores); + // dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores); + // // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores); // time_end = mempool_get_timer(); // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - dotp_f16vecp_local_unrolled4(l1_A, l1_B, sum, LEN); + dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N); time_end = mempool_get_timer(); // Check results @@ -70,7 +70,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); printf("Result ==> %x\n", *(uint32_t *)&sum[0]); - printf("Check ==> %x\n\n", *(uint32_t *)&l2_C); + printf("Check ==> %x\n\n", *(uint32_t *)&l2_Z); } mempool_barrier(num_cores); diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c index 731942eb7..3507795b1 100644 --- a/software/apps/baremetal/dotp_f32/main.c +++ b/software/apps/baremetal/dotp_f32/main.c @@ -20,8 +20,8 @@ #define BINARY_REDUCTION // Vectors for kernel computation -float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); uint32_t red_barrier[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); @@ -38,8 +38,8 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t)); - dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t)); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); } for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) { sum[k] = 0; @@ -49,17 +49,17 @@ int main() { // // SINGLE-CORE // time_init = mempool_get_timer(); - // dotp_f32s_unrolled4(l1_A, l1_B, sum, LEN); + // dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N); // time_end = mempool_get_timer(); // // PARALLEL // time_init = mempool_get_timer(); - // dotp_f32p(l1_A, l1_B, sum, LEN, num_cores); + // dotp_f32p(l1_A, l1_B, sum, array_N, num_cores); // time_end = mempool_get_timer(); // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - dotp_f32p_local_unrolled4(l1_A, l1_B, sum, LEN); + dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N); time_end = mempool_get_timer(); // Check results @@ -68,7 +68,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); printf("Result ==> %d\n", sum[0]); - printf("Check ==> %d\n\n", l2_C); + printf("Check ==> %d\n\n", l2_Z); } mempool_barrier(num_cores); diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c index ffd879c91..80309a1e0 100644 --- a/software/apps/baremetal/mimo_mmse_f16/main.c +++ b/software/apps/baremetal/mimo_mmse_f16/main.c @@ -19,8 +19,10 @@ #include "data_mimo_mmse_f16.h" #define ZF (0) // When asserted use zero-forcing -#define FOLD (0) // When asserted fold matrices in memory +#define FOLD (1) // When asserted fold matrices in memory #define NUM_BANKS (BANKING_FACTOR * NUM_CORES) +#define PARALLEL +#define VEC /********************************************************** 
********************************************************** @@ -37,6 +39,8 @@ #if FOLD #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS)) +#define NUM_COL (NUM_BANKS / N_TX) + __fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); __fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW] @@ -68,6 +72,7 @@ int main() { #ifndef BANSHEE uint32_t num_cores = mempool_get_core_count(); mempool_barrier_init(core_id); // Initialize barrier and synchronize + uint32_t time_init, time_end; #endif /* Initialize matrices */ @@ -97,6 +102,7 @@ int main() { /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); + time_init = mempool_get_timer(); for (uint32_t itr = 0; itr < N_ITR; itr++) { __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX); __fp16 *Ptry = l1_y + itr * (2 * N_RX); @@ -107,24 +113,25 @@ int main() { __fp16 *Ptry3 = y3 + itr * (2 * N_TX); __fp16 *Ptrx = l1_x + itr * (2 * N_TX); #ifdef VEC - mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0); #else - mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF); + mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX); mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0); #endif mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } + time_end = mempool_get_timer(); mempool_stop_benchmark(); } #endif #ifdef PARALLEL mempool_start_benchmark(); - uint32_t time_init = mempool_get_timer(); + time_init = mempool_get_timer(); // Parallel subcarrier loop for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { @@ -133,14 +140,14 @@ int main() { __fp16 *PtrS = l1_S + itr * (2 * N_TX); // Auxiliary vectors #if FOLD - __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + - (itr / NUM_ROW) * (2 * N_TX); - __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + - (itr / NUM_ROW) * (2 * N_TX); + __fp16 *PtrG = l1_G + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) + + (itr % NUM_COL) * (2 * N_TX); + __fp16 *PtrL = l1_L + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) + + (itr % NUM_COL) * (2 * N_TX); __fp16 *Ptry2 = - y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + y2 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX); __fp16 *Ptry3 = - y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX); + y3 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX); __fp16 *Ptrx = l1_x + itr * (2 * N_TX); #else __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX); @@ -163,7 +170,7 @@ int main() { mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_barrier(num_cores); - uint32_t time_end = mempool_get_timer(); + time_end = mempool_get_timer(); mempool_stop_benchmark(); #endif @@ -179,6 +186,7 @@ int main() { if (core_id == 0) { printf("Runtime: %d\n", time_end - time_init); } + mempool_check_f16(l1_x, l2_x, 2 * N_RX * N_TX, 0.01f, 0); mempool_barrier(num_cores); #endif diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c index 194c4c71c..d243754fc 100644 --- a/software/apps/baremetal/mimo_mmse_f32/main.c +++ b/software/apps/baremetal/mimo_mmse_f32/main.c @@ -22,6 +22,8 @@ #include "data_mimo_mmse_f32.h" #define SINGLE +#define ZF (0) +#define FOLD (0) float l1_H[2 * N_TX * N_RX * N_ITR] 
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); @@ -60,14 +62,14 @@ int main() { /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); - mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0); - mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0); + mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, ZF, FOLD); + mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX); #ifdef JACOBI mempool_jacobi_f32s(l1_G, y2, l1_x, N_TX, 0.005f, 20U); #else - mempool_cholesky_f32s(l1_G, l1_L, N_TX, 0); - mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, 0); - mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, 0); + mempool_cholesky_f32s(l1_G, l1_L, N_TX, FOLD); + mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, FOLD); + mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, FOLD); #endif mempool_stop_benchmark(); } @@ -75,7 +77,9 @@ int main() { #endif #if defined(PARALLEL) && defined(__XDIVSQRT) + // Each iteration is assigned to a processor + mempool_start_benchmark(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { @@ -83,7 +87,9 @@ int main() { float *PtrH = l1_H + itr * (2 * N_TX * N_RX); float *PtrS = l1_S + itr * (2 * N_TX); float *Ptry = l1_y + itr * (2 * N_RX); + // Intermediate results and outputs + #if FOLD __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) + (itr / NUM_ROW) * (2 * N_TX); @@ -102,7 +108,7 @@ int main() { float *Ptrx = l1_x + itr * (2 * N_TX); #endif - mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, FOLD); + mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD); mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX); mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD); mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); @@ -114,19 +120,24 @@ int main() { #if defined(PARALLEL_HERMITIAN) && defined(__XDIVSQRT) mempool_start_benchmark(); + // Each iteration is assigned to a pool of processors - // In a pool each PE gets a column of the H matrix, accumulating a row of the - // output matrix + // In a pool each PE gets a column of the H matrix, accumulating + // a row of the output matrix + uint32_t pool_id = core_id / N_TX; uint32_t num_pools = num_cores / N_TX; for (uint32_t itr = pool_id; itr < N_ITR; itr += num_pools) { float *PtrH = l1_H + itr * (2 * N_TX * N_RX); float *PtrG = l1_G + itr * (2 * N_TX * N_TX); float *PtrS = l1_S + itr * N_TX; - mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, 0, 0, core_id % N_TX, - N_TX); + mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD, + core_id % N_TX, N_TX); } mempool_stop_benchmark(); + + // Each iteration is assigned to a processor + mempool_start_benchmark(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { // Inputs @@ -138,10 +149,10 @@ int main() { float *Ptry2 = y2 + itr * (2 * N_TX); float *Ptry3 = y3 + itr * (2 * N_TX); float *Ptrx = l1_x + itr * (2 * N_TX); - mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); + mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX); mempool_cholesky_f32s(PtrG, PtrL, N_TX, 0); - mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, 0); - mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, 0); + mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } mempool_log_barrier(2, core_id); mempool_stop_benchmark(); diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c index 24fd9e44d..9bcb5e9db 100644 --- a/software/apps/baremetal/mimo_mmse_q16/main.c +++ 
b/software/apps/baremetal/mimo_mmse_q16/main.c @@ -28,7 +28,7 @@ int16_t l1_L[2 * N_TX * N_TX * N_ITR] __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), section(".l1_prio"))); -int16_t l1_Sigma[2 * N_TX * N_ITR] +int16_t l1_S[2 * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); int16_t l1_y[2 * N_RX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1"))); @@ -50,7 +50,7 @@ int main() { if (core_id == 0) { dma_memcpy_blocking(l1_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t)); dma_memcpy_blocking(l1_y, l2_y, N_RX * N_ITR * sizeof(int32_t)); - dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_TX * N_ITR * sizeof(int32_t)); + dma_memcpy_blocking(l1_S, l2_S, N_TX * N_ITR * sizeof(int32_t)); } mempool_barrier(num_cores); @@ -79,7 +79,7 @@ int main() { int16_t *PtrH = l1_H + itr * (2 * N_TX * N_RX); int16_t *Ptry = l1_y + itr * (2 * N_RX); - int16_t *PtrSigma = l1_Sigma + itr * (2 * N_TX); + int16_t *PtrSigma = l1_S + itr * (2 * N_TX); int16_t *PtrG = l1_G + itr * (2 * N_TX * N_TX); int16_t *PtrL = l1_L + itr * (2 * N_TX * N_TX); diff --git a/software/apps/baremetal/ofdm/main.c b/software/apps/baremetal/ofdm_f16/main.c similarity index 98% rename from software/apps/baremetal/ofdm/main.c rename to software/apps/baremetal/ofdm_f16/main.c index 210501cad..264768199 100644 --- a/software/apps/baremetal/ofdm/main.c +++ b/software/apps/baremetal/ofdm_f16/main.c @@ -17,7 +17,8 @@ #include "runtime.h" #include "synchronization.h" -#include "data_ofdm.h" +#include "data_ofdm_f16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) // CFFT Parameters #define SCHEDULED diff --git a/software/data/README.md b/software/data/README.md index 9fdab87cf..066280965 100644 --- a/software/data/README.md +++ b/software/data/README.md @@ -6,7 +6,7 @@ The application parameters are passed to the script with the `gendata_params.hjs An example entry follows: `matmul_f32` is the name of the MemPool application under test; the `type` refers to the numpy precision; the `defines` are application parameters, turned into C constant declarations in the form `#define matrix_M (16)`; the `arrays` encode the C-type and name of the input vectors for the application under test. -` +``` "matmul_f32": { "type": "float32", "defines": [ @@ -20,7 +20,7 @@ An example entry follows: `matmul_f32` is the name of the MemPool application under ("float", "l2_C") ] } -` +``` ## To test a new application: If a new application needs to be tested with data generated from a reference golden model: diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl deleted file mode 100644 index 4c6034baf..000000000 --- a/software/data/data_axpy_f16.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4f}, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LEN (${Len}) - -__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)}; diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl deleted file mode 100644 index f3fdc8b6a..000000000 --- a/software/data/data_axpy_f32.h.tpl +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - - -#define LEN (${Len}) - -float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)}; diff --git a/software/data/data_cfft_f16.h.tpl b/software/data/data_cfft_f16.h.tpl deleted file mode 100644 index d21829e88..000000000 --- a/software/data/data_cfft_f16.h.tpl +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Maximum number of independent FFT columns allowed -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) - -// Data arrays -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pSrc[${2 * Len}] = ${array_to_cstr(vector_inp)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pRes[${2 * Len}] = ${array_to_cstr(vector_res)}; - -// Twiddles -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_f16[${2 * Len}] = ${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_cfft_q16.h.tpl b/software/data/data_cfft_q16.h.tpl deleted file mode 100644 index fb1ba908a..000000000 --- a/software/data/data_cfft_q16.h.tpl +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. 
-// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_cfft_q16.py -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Maximum number of independent FFT columns allowed -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) -// Tolerance for correctness check -#define TOLERANCE (${tolerance}) - -// Data arrays -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pSrc[${2 * Len}] = ${array_to_cstr(vector_inp)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pRes[${2 * Len}] = ${array_to_cstr(vector_res)}; - -// Twiddles -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_chest_f16.h.tpl b/software/data/data_chest_f16.h.tpl deleted file mode 100644 index 25d9e420f..000000000 --- a/software/data/data_chest_f16.h.tpl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_chest_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.5}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${nb_tx}) -#define N_RX (${nb_rx}) -#define N_SAMPLES (${nb_samples}) - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotRX[${2*nb_rx*nb_samples}] = ${array_to_cstr(pilot_rx)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotTX[${2*nb_tx*nb_samples}] = ${array_to_cstr(pilot_tx)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_HEST[${2*nb_rx*nb_tx*nb_samples}] = ${array_to_cstr(Hest)}; diff --git a/software/data/data_cholesky_f16.h.tpl b/software/data/data_cholesky_f16.h.tpl deleted file mode 100644 index 32ad3e2fe..000000000 --- a/software/data/data_cholesky_f16.h.tpl +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:0.5f}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define dim_N (${n_matrix}) -#define N_SAMPLES (${n_samples}) -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(G)}; -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(L)}; diff --git a/software/data/data_cholesky_q16.h.tpl b/software/data/data_cholesky_q16.h.tpl deleted file mode 100644 index 0ba9cf5f0..000000000 --- a/software/data/data_cholesky_q16.h.tpl +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define dim_N (${n_matrix}) -#define N_SAMPLES (${n_samples}) -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(G)}; -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(L)}; diff --git a/software/data/data_cholesky_q32.h.tpl b/software/data/data_cholesky_q32.h.tpl deleted file mode 100644 index 0042f54c8..000000000 --- a/software/data/data_cholesky_q32.h.tpl +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int32_t) 0X{:08X}, '.format(a&0xffffffff) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N (${n_matrix}) - -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[${n_matrix * n_matrix}] = ${array_to_cstr(G)}; -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[${n_matrix * n_matrix}] = ${array_to_cstr(L)}; -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${n_matrix}] = ${array_to_cstr(y)}; diff --git a/software/data/data_cmatmul_f16.h.tpl b/software/data/data_cmatmul_f16.h.tpl deleted file mode 100644 index 15ed570a1..000000000 --- a/software/data/data_cmatmul_f16.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4f}, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define dim_M (${matrix_M}) -#define dim_N (${matrix_N}) -#define dim_P (${matrix_P}) - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${2 * matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${2 * matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${2 * matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_cmatmul_q16.h.tpl b/software/data/data_cmatmul_q16.h.tpl deleted file mode 100644 index b42c55f88..000000000 --- a/software/data/data_cmatmul_q16.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define dim_M (${matrix_M}) -#define dim_N (${matrix_N}) -#define dim_P (${matrix_P}) - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${2 * matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${2 * matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${2 * matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_dotp_f16.h.tpl b/software/data/data_dotp_f16.h.tpl deleted file mode 100644 index f7cacaed3..000000000 --- a/software/data/data_dotp_f16.h.tpl +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4f}, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LEN (${Len}) - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = (__fp16)${C}f; diff --git a/software/data/data_dotp_f32.h.tpl b/software/data/data_dotp_f32.h.tpl deleted file mode 100644 index 3af0fbe66..000000000 --- a/software/data/data_dotp_f32.h.tpl +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LEN (${Len}) - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}f; diff --git a/software/data/data_dotp_i32.h.tpl b/software/data/data_dotp_i32.h.tpl deleted file mode 100644 index d76d92a24..000000000 --- a/software/data/data_dotp_i32.h.tpl +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LEN (${Len}) - -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; - -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; - -int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}; diff --git a/software/data/data_mimo_mmse_f16.h.tpl b/software/data/data_mimo_mmse_f16.h.tpl deleted file mode 100644 index e6109b7f6..000000000 --- a/software/data/data_mimo_mmse_f16.h.tpl +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:0.5f}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${N_tx}) -#define N_RX (${N_rx}) -#define N_ITR (${N_itr}) - -// Inputs - -__fp16 l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)}; - -// Outputs - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)}; diff --git a/software/data/data_mimo_mmse_f32.h.tpl b/software/data/data_mimo_mmse_f32.h.tpl deleted file mode 100644 index c7bed1889..000000000 --- a/software/data/data_mimo_mmse_f32.h.tpl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${N_tx}) -#define N_RX (${N_rx}) -#define N_ITR (${N_itr}) - -// Inputs - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)}; - -// Outputs - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)}; diff --git a/software/data/data_mimo_mmse_f8.h.tpl b/software/data/data_mimo_mmse_f8.h.tpl deleted file mode 100644 index 780bcc041..000000000 --- a/software/data/data_mimo_mmse_f8.h.tpl +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp8)' + f'{hex(a.bits())}' +', ' - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_cstr16(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:0.5f}f, '.format(a) - i += 1 - if i % 5 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${N_tx}) -#define N_RX (${N_rx}) -#define N_ITR (${N_itr}) - -// Inputs - -__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)}; - -__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)}; - -__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr16(G)}; - -// Outputs - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr16(x)}; diff --git a/software/data/data_mimo_mmse_q16.h.tpl b/software/data/data_mimo_mmse_q16.h.tpl deleted file mode 100644 index ca2ed0193..000000000 --- a/software/data/data_mimo_mmse_q16.h.tpl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) {}, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${N_tx}) -#define N_RX (${N_rx}) -#define N_ITR (${N_itr}) - -// Inputs - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Sigma[${2 * N_tx * N_itr}] = ${array_to_cstr(N)}; - -// Outputs - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)}; diff --git a/software/data/data_ofdm.h.tpl b/software/data/data_ofdm.h.tpl deleted file mode 100644 index 06da2c045..000000000 --- a/software/data/data_ofdm.h.tpl +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:0.5}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_RX (${N_rx}) -#define N_BEAMS (${N_bs}) -#define N_SC (${N_sc}) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pFFT_Src[${2 * N_sc * N_rx}] = ${array_to_cstr(pFFT_src)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_f16[${2 * N_sc}] = ${array_to_cstr(pTw_coef)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pBF_Coef[${2 * N_bs * N_rx}] = ${array_to_cstr(pBF_coef)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pBF_Dst[${2 * N_bs * N_sc}] = ${array_to_cstr(pBF_dst)}; - -// Bitreversal -uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)}; diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py index 44749a4a0..c8fd8c3f8 100644 --- a/software/data/gendata_header.py +++ b/software/data/gendata_header.py @@ -14,6 +14,7 @@ import numpy import gendatalib as datalib +import pyflexfloat as ff header = """\ @@ -46,6 +47,10 @@ def format_type(typ, value): stringyfied_val = '({}) {:+.8f}'.format(typ, value) elif typ == '__fp16': stringyfied_val = '({}) {:+.4f}'.format(typ, value) + elif typ == '__fp8': + value = ff.FlexFloat("e5m2", value.astype(numpy.double)) + value = value.bits() + stringyfied_val = '({}) 0X{}'.format(typ, value) else: raise Exception("ERROR: Unsupported data type!!!") @@ -75,7 +80,7 @@ def print_array(arr, typ, name): output_string += "};\n\n" else: output_string += attr - output_string += (name + ' = ' + format_type(typ, arr)) + output_string += (name + ' = ' + format_type(typ, arr[0])) output_string += ";\n\n" return output_string @@ -125,6 +130,8 @@ def get_type(type_string): return numpy.float32 elif 
type_string == "float16": return numpy.float16 + elif type_string == "float8": + return numpy.float16 else: raise Exception("Input type is not valid") @@ -156,16 +163,32 @@ def get_type(type_string): # Define function mappings for each app_name function_map = { "axpy_i32": {"func": datalib.generate_iaxpy}, - "cfft_radix4_q16": {"func": datalib.generate_cfft_q16}, + "axpy_f16": {"func": datalib.generate_faxpy}, + "axpy_f32": {"func": datalib.generate_faxpy}, "cfft_radix2_q16": {"func": datalib.generate_cfft_q16}, + "cfft_radix4_f16": {"func": datalib.generate_fcfft}, + "cfft_radix4_q16": {"func": datalib.generate_cfft_q16}, + "chest_f16": {"func": datalib.generate_fchest}, "chest_q16": {"func": datalib.generate_qchest}, + "cholesky_f16": {"func": datalib.generate_fccholesky}, + "cholesky_q16": {"func": datalib.generate_qccholesky}, "cholesky_q32": {"func": datalib.generate_qcholesky}, + "cmatmul_f16": {"func": datalib.generate_fcmatmul}, + "cmatmul_q16": {"func": datalib.generate_qcmatmul}, + "dotp_f16": {"func": datalib.generate_fdotp}, + "dotp_f32": {"func": datalib.generate_fdotp}, "dotp_i32": {"func": datalib.generate_idotp}, "matmul_f16": {"func": datalib.generate_fmatmul}, + "matmul_f8": {"func": datalib.generate_fmatmul}, "matmul_f32": {"func": datalib.generate_fmatmul}, "matmul_i32": {"func": datalib.generate_imatmul}, "matmul_i16": {"func": datalib.generate_imatmul}, "matmul_i8": {"func": datalib.generate_imatmul}, + "mimo_mmse_q16": {"func": datalib.generate_qmmse}, + "mimo_mmse_f16": {"func": datalib.generate_fmmse}, + "mimo_mmse_f32": {"func": datalib.generate_fmmse}, + "mimo_mmse_f8": {"func": datalib.generate_fmmse}, + "ofdm_f16": {"func": datalib.generate_fofdm}, "fence": {"func": datalib.generate_iarray}, "memcpy": {"func": datalib.generate_iarray}, } diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson index 3a1de010e..e343a42be 100644 --- a/software/data/gendata_params.hjson +++ b/software/data/gendata_params.hjson @@ -9,32 +9,58 @@ "axpy_i32": { "type": "int32", "defines": [ - ("ALPHA", 6) ("array_N", 1024) ] "arrays": [ + ("int32_t", "l2_A") ("int32_t", "l2_X") ("int32_t", "l2_Y") ("int32_t", "l2_Z") ] }, - "dotp_i32": { - "type": "int32", + "axpy_f32": { + "type": "float32", "defines": [ ("array_N", 1024) ] "arrays": [ - ("int32_t", "l2_X") - ("int32_t", "l2_Y") - ("int32_t", "l2_Z") + ("float", "l2_A") + ("float", "l2_X") + ("float", "l2_Y") + ("float", "l2_Z") ] }, - "cfft_radix4_q16": { + "axpy_f16": { + "type": "float16", + "defines": [ + ("array_N", 1024) + ] + "arrays": [ + ("__fp16", "l2_A") + ("__fp16", "l2_X") + ("__fp16", "l2_Y") + ("__fp16", "l2_Z") + ] + }, + + "dotp_f16": { + "type": "float16", + "defines": [ + ("array_N", 1024) + ] + "arrays": [ + ("__fp16", "l2_X") + ("__fp16", "l2_Y") + ("__fp16", "l2_Z") + ] + }, + + "cfft_radix2_q16": { "type": "int16", "defines": [ - ("N_CSAMPLES", 64) + ("N_CSAMPLES", 256) ] "arrays": [ ("int16_t", "l2_pSrc") @@ -44,10 +70,23 @@ ] }, - "cfft_radix2_q16": { + "cfft_radix4_f16": { + "type": "float16", + "defines": [ + ("N_CSAMPLES", 64) + ] + "arrays": [ + ("__fp16", "l2_pSrc") + ("__fp16", "l2_pRes") + ("__fp16", "l2_twiddleCoef_f16") + ("int16_t", "l2_BitRevIndexTable") + ] + }, + + "cfft_radix4_q16": { "type": "int16", "defines": [ - ("N_CSAMPLES", 256) + ("N_CSAMPLES", 64) ] "arrays": [ ("int16_t", "l2_pSrc") @@ -57,6 +96,20 @@ ] }, + "chest_f16": { + "type": "float16", + "defines": [ + ("N_TX", 4) + ("N_RX", 4) + ("N_SAMPLES", 512) + ] + "arrays": [ + ("__fp16", "l2_PilotTX") + 
("__fp16", "l2_PilotRX") + ("__fp16", "l2_HEST") + ] + }, + "chest_q16": { "type": "int32", "defines": [ @@ -71,11 +124,37 @@ ] }, + "cholesky_q16": { + "type": "int16", + "defines": [ + ("matrix_N", 4) + ("FIXED_POINT", 8) + ("N_SAMPLES", 32) + ] + "arrays": [ + ("int16_t", "l2_GIn") + ("int16_t", "l2_LOut") + ] + }, + + "cholesky_f16": { + "type": "float16", + "defines": [ + ("matrix_N", 4) + ("N_SAMPLES", 1024) + ] + "arrays": [ + ("__fp16", "l2_GIn") + ("__fp16", "l2_LOut") + ] + }, + "cholesky_q32": { "type": "int32", "defines": [ ("matrix_N", 32) ("FIXED_POINT", 10) + ("N_SAMPLES", 1) ] "arrays": [ ("int32_t", "l2_A") @@ -84,6 +163,58 @@ ] }, + "cmatmul_f16": { + "type": "float16", + "defines": [ + ("matrix_M", 32) + ("matrix_N", 32) + ("matrix_P", 32) + ] + "arrays": [ + ("__fp16", "l2_A") + ("__fp16", "l2_B") + ("__fp16", "l2_C") + ] + }, + + "cmatmul_q16": { + "type": "int16", + "defines": [ + ("matrix_M", 32) + ("matrix_N", 32) + ("matrix_P", 32) + ] + "arrays": [ + ("int16_t", "l2_A") + ("int16_t", "l2_B") + ("int16_t", "l2_C") + ] + }, + + "dotp_f32": { + "type": "float32", + "defines": [ + ("array_N", 1024) + ] + "arrays": [ + ("float", "l2_X") + ("float", "l2_Y") + ("float", "l2_Z") + ] + }, + + "dotp_i32": { + "type": "int32", + "defines": [ + ("array_N", 1024) + ] + "arrays": [ + ("int32_t", "l2_X") + ("int32_t", "l2_Y") + ("int32_t", "l2_Z") + ] + }, + "matmul_f16": { "type": "float16", "defines": [ @@ -112,6 +243,20 @@ ] } + "matmul_i16": { + "type": "int16", + "defines": [ + ("matrix_M", 64) + ("matrix_N", 64) + ("matrix_P", 64) + ] + "arrays": [ + ("int16_t", "l2_A") + ("int16_t", "l2_B") + ("int32_t", "l2_C") + ] + } + "matmul_i32": { "type": "int32", "defines": [ @@ -126,34 +271,101 @@ ] } - "matmul_i16": { - "type": "int16", + "matmul_i8": { + "type": "int8", "defines": [ ("matrix_M", 64) ("matrix_N", 64) ("matrix_P", 64) ] "arrays": [ - ("int16_t", "l2_A") - ("int16_t", "l2_B") + ("int8_t", "l2_A") + ("int8_t", "l2_B") ("int32_t", "l2_C") ] } - "matmul_i8": { - "type": "int8", + "mimo_mmse_f16": { + "type": "float16", "defines": [ - ("matrix_M", 64) - ("matrix_N", 64) - ("matrix_P", 64) + ("N_TX", 4) + ("N_RX", 4) + ("N_ITR", 32) ] "arrays": [ - ("int8_t", "l2_A") - ("int8_t", "l2_B") - ("int32_t", "l2_C") + ("__fp16", "l2_H") + ("__fp16", "l2_G") + ("__fp16", "l2_y") + ("__fp16", "l2_S") + ("__fp16", "l2_x") + ] + } + + "mimo_mmse_f32": { + "type": "float32", + "defines": [ + ("N_TX", 4) + ("N_RX", 4) + ("N_ITR", 32) + ] + "arrays": [ + ("float", "l2_H") + ("float", "l2_G") + ("float", "l2_y") + ("float", "l2_S") + ("float", "l2_x") ] } + "mimo_mmse_f8": { + "type": "float8", + "defines": [ + ("N_TX", 4) + ("N_RX", 4) + ("N_ITR", 32) + ] + "arrays": [ + ("__fp8", "l2_H") + ("__fp16", "l2_G") + ("__fp8", "l2_y") + ("__fp8", "l2_S") + ("__fp16", "l2_x") + ] + } + + "mimo_mmse_q16": { + "type": "int16", + "defines": [ + ("N_TX", 4) + ("N_RX", 4) + ("N_ITR", 32) + ("FIXED_POINT", 8) + ] + "arrays": [ + ("int16_t", "l2_H") + ("int16_t", "l2_G") + ("int16_t", "l2_y") + ("int16_t", "l2_S") + ("int16_t", "l2_x") + ] + } + + "ofdm_f16": { + "type": "float16", + "defines": [ + ("N_SC", 4096) + ("N_RX", 64) + ("N_BEAMS", 32) + ] + "arrays": [ + ("__fp16", "l2_pFFT_Src") + ("__fp16", "l2_pBF_Coef") + ("__fp16", "l2_pBF_Dst") + ("__fp16", "l2_twiddleCoef_f16") + ("__fp16", "l2_BitRevIndexTable") + ] + }, + "fence": { "type": "int32", "defines": [ diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py index c017415bf..cf65e4504 100644 --- 
a/software/data/gendatalib.py +++ b/software/data/gendatalib.py @@ -14,7 +14,10 @@ import numpy as np import math import qmath +#import pyflexfloat as ff + from scipy import signal +from scipy.linalg import solve_triangular def select_maxval(my_type=np.int32): @@ -58,52 +61,278 @@ def generate_iarray(my_type=np.float32, defines={}): return A, defines -def generate_fmatmul(my_type=np.float32, defines={}): +############################################################################## + + +def generate_faxpy(my_type=np.float32, defines={}): + + # Create matrix + array_N = defines['array_N'] + A = np.random.rand(1) - 0.5 + X = (np.random.rand(array_N) - 0.5).astype(my_type) + Y = (np.random.rand(array_N) - 0.5).astype(my_type) + Z = (Y + X * A).astype(my_type) + + return [A, X, Y, Z], defines + + +def generate_fdotp(my_type=np.float32, defines={}): + + # Create matrix + array_N = defines['array_N'] + A = np.random.rand(1) - 0.5 + X = (np.random.rand(array_N) - 0.5).astype(my_type) + Y = (np.random.rand(array_N) - 0.5).astype(my_type) + Z = np.dot(X, Y).astype(my_type) + Z = np.array(Z).astype(my_type) + Z = np.resize(Z, 1) + + return [X, Y, Z], defines + + +def ftwiddleCoef(N, my_type=np.float32): + PI = np.pi + twiddleCoeff16 = np.zeros((int)(2 * 3 * N / 4), my_type) + for i in range(0, int(3 * N / 4)): + twiddleCoeff16_sin = np.sin(i * 2 * PI / N).astype(my_type) + twiddleCoeff16_cos = np.cos(i * 2 * PI / N).astype(my_type) + twiddleCoeff16[2 * i] = twiddleCoeff16_sin + twiddleCoeff16[2 * i + 1] = twiddleCoeff16_cos + return twiddleCoeff16 + + +def generate_fcfft(my_type=np.float32, defines={}): + + N_CSAMPLES = defines['N_CSAMPLES'] +# src = np.cos(np.linspace(0, N_CSAMPLES / 4, num=N_CSAMPLES)).astype(my_type) +# src = src + 1.j * np.sin(np.linspace(0, N_CSAMPLES / 4, num=N_CSAMPLES)).astype(my_type) + + src_r = np.random.normal(0, 5, N_CSAMPLES).astype(np.float16) + src_i = np.random.normal(0, 5, N_CSAMPLES).astype(np.float16) + src = src_r + 1.j * src_i + src = np.fft.ifft(src) + dst = np.fft.fft(src) + src = np.column_stack((src.imag, src.real)).astype(my_type).flatten() + dst = np.column_stack((dst.imag, dst.real)).astype(my_type).flatten() + + twiddles = ftwiddleCoef(N_CSAMPLES, my_type) + bitrever = qmath.bitreversal(N_CSAMPLES, 2) + + defines['LOG2'] = int(math.log2(N_CSAMPLES)) + defines['N_TWIDDLES'] = 3 * N_CSAMPLES // 4 + defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever) + defines['TOLERANCE'] = 0.1 * np.max(dst) + + return [src, dst, twiddles, bitrever], defines + + +def generate_fchest(my_type=np.float32, defines={}, division=False): + + nb_tx = defines['N_TX'] + nb_rx = defines['N_RX'] + nb_samples = defines['N_SAMPLES'] + + H = np.random.randn(nb_rx, nb_tx) + H = H + 1j * np.random.randn(nb_rx, nb_tx) + + vpilot_tx = [] + vpilot_rx = [] + vHest = [] + for k in range(nb_samples): + if (division): + # Compute data division + pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx)) + pilot_rx = np.dot(H, pilot_tx) + Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :] + else: + # Compute data multiplication + pilot_tx = np.exp(1j * np.random.randn(nb_tx)) + pilot_rx = np.dot(H, pilot_tx) + pilot_tx = np.reciprocal(pilot_tx) + Hest = pilot_rx[:, np.newaxis] * pilot_tx[np.newaxis, :] + Hest = Hest.flatten() + + # Interleaved real and imaginary parts + pilot_tx = np.column_stack((pilot_tx.imag, pilot_tx.real)) + pilot_rx = np.column_stack((pilot_rx.imag, pilot_rx.real)) + Hest = np.column_stack((Hest.imag, Hest.real)) + # Flatten arrays + pilot_tx = 
pilot_tx.astype(my_type).flatten() + pilot_rx = pilot_rx.astype(my_type).flatten() + Hest = Hest.astype(my_type).flatten() + # Output vectors + vpilot_tx.append(pilot_tx) + vpilot_rx.append(pilot_rx) + vHest.append(Hest) + + vpilot_rx = np.concatenate(vpilot_rx, axis=0) + vpilot_tx = np.concatenate(vpilot_tx, axis=0) + vHest = np.concatenate(vHest, axis=0) + + return [vpilot_tx, vpilot_rx, vHest], defines + + +def generate_fccholesky(my_type=np.float32, defines={}): + + n_matrix = defines['matrix_N'] + n_samples = defines['N_SAMPLES'] + + vector_G = [] + vector_L = [] + for k in range(n_samples): + # Create hermitian matrix + H = np.random.rand(n_matrix, n_matrix) + 1.j * \ + np.random.rand(n_matrix, n_matrix) + # Matrix to be inverted + # H_H = np.asmatrix(H).H + G = np.matmul(H, np.asmatrix(H).H) + # Cholesky decomposition + L = np.linalg.cholesky(G) + # Reshape + G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C') + L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C') + G = np.column_stack((G.real, G.imag)).astype(my_type).flatten() + L = np.column_stack((L.real, L.imag)).astype(my_type).flatten() + # Output vectors + vector_G.append(G) + vector_L.append(L) + + vector_G = np.concatenate(vector_G, axis=0) + vector_L = np.concatenate(vector_L, axis=0) + return [vector_G, vector_L], defines + + +def generate_fcmatmul(my_type=np.float32, defines={}): # Create matrix matrix_M = defines['matrix_M'] matrix_N = defines['matrix_N'] matrix_P = defines['matrix_P'] - A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type) - B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type) + A = np.random.rand(matrix_M, matrix_N) + 1j * \ + np.random.rand(matrix_M, matrix_N) + B = np.random.rand(matrix_N, matrix_P) + 1j * \ + np.random.rand(matrix_N, matrix_P) C = np.matmul(A, B) - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type) + A = np.reshape(A, (matrix_M * matrix_N), order='C') + B = np.reshape(B, (matrix_N * matrix_P), order='C') + C = np.reshape(C, (matrix_M * matrix_P), order='C') + + A = np.column_stack((A.imag, A.real)).astype(my_type).flatten() + B = np.column_stack((B.imag, B.real)).astype(my_type).flatten() + C = np.column_stack((C.imag, C.real)).astype(my_type).flatten() return [A, B, C], defines -def generate_imatmul(my_type=np.int32, defines={}): +def generate_fmatmul(my_type=np.float32, defines={}): # Create matrix matrix_M = defines['matrix_M'] matrix_N = defines['matrix_N'] matrix_P = defines['matrix_P'] - MAX = select_maxval(my_type) - A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) - B = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type) + B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type) C = np.matmul(A, B) A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type) B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32) + C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type) return [A, B, C], defines +def generate_fmmse(my_type=np.float16, defines={}): + + N_tx = defines['N_TX'] + N_rx = defines['N_RX'] + N_itr = defines['N_ITR'] + vH = np.zeros([N_itr, N_tx * 2 * N_rx], dtype=my_type) + vG = np.zeros([N_itr, N_tx * 2 * N_tx], dtype=my_type) + vy = np.zeros([N_itr, 2 * N_rx], 
dtype=my_type) + vN = np.zeros([N_itr, 2 * N_tx], dtype=my_type) + vx = np.zeros([N_itr, 2 * N_tx], dtype=my_type) + + for k in range(N_itr): + + # Create input vector + y = np.random.rand(N_rx).astype(my_type) + 1.j * \ + np.random.rand(N_rx).astype(my_type) + + # Create channel matrix + H = np.random.rand(N_rx, N_tx).astype(my_type) + 1.j * \ + np.random.rand(N_rx, N_tx).astype(my_type) + # Generate noise variance + N = np.random.rand(1).astype(my_type) + + # Matrix to be inverted in MMSE estimator + H_h = np.asmatrix(H).H + G = np.matmul(H_h, H) + N * np.eye(H.shape[1]) + N = N * np.ones(N_tx) + + # Cholesky decomposition + L = np.linalg.cholesky(G) + # Linear system solution + y1 = np.transpose(np.dot(H_h, y)) + y2 = solve_triangular(L, y1, lower=True) + x = solve_triangular(np.asmatrix(L).H, y2) + + H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C') + G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') + N = np.column_stack((N.real, N.imag)).astype(my_type).flatten() + H = np.column_stack((H.real, H.imag)).astype(my_type).flatten() + G = np.column_stack((G.real, G.imag)).astype(my_type).flatten() + x = np.column_stack((x.real, x.imag)).astype(my_type).flatten() + y = np.column_stack((y.real, y.imag)).astype(my_type).flatten() + + vH[k, :] = H + vG[k, :] = G + vy[k, :] = y + vN[k, :] = N + vx[k, :] = x + + vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(my_type) + vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type) + vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type) + vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type) + vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type) + + return [vH, vG, vy, vN, vx], defines + + +def generate_fofdm(my_type=np.float32, defines={}): + + N_sc = defines['N_SC'] + N_rx = defines['N_RX'] + N_bs = defines['N_BEAMS'] + + pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16) + pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16) + pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16) + twiddles = ftwiddleCoef(N_sc, my_type) + bitrever = qmath.bitreversal(N_sc, 2) + + defines['LOG2'] = int(math.log2(N_sc)) + defines['N_TWIDDLES'] = 3 * N_sc // 4 + defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever) + + return [pFFT_src, pBF_coef, pBF_dst, twiddles, bitrever], defines + + +############################################################################## + + def generate_iaxpy(my_type=np.int32, defines={}): # Create matrix - ALPHA = defines['ALPHA'] array_N = defines['array_N'] MAX = select_maxval(my_type) + A = np.random.randint(-MAX, MAX - 1, size=1, dtype=my_type) X = irandom(MAX=MAX, size=(array_N), my_type=my_type) Y = irandom(MAX=MAX, size=(array_N), my_type=my_type) - Z = (Y + X * ALPHA).astype(my_type) + Z = (Y + X * A).astype(my_type) - return [X, Y, Z], defines + return [A, X, Y, Z], defines def generate_idotp(my_type=np.int32, defines={}): @@ -113,7 +342,9 @@ def generate_idotp(my_type=np.int32, defines={}): MAX = select_maxval(my_type) X = irandom(MAX=MAX, size=(array_N), my_type=my_type) Y = irandom(MAX=MAX, size=(array_N), my_type=my_type) - Z = np.array((np.dot(X, Y))).astype(my_type) + Z = np.dot(X, Y) + Z = np.array(Z).astype(my_type) + Z = np.resize(Z, 1) return [X, Y, Z], defines @@ -136,6 +367,51 @@ def generate_iconv(my_type=np.int32, defines={}): return [X, K, Y], defines +def generate_imatmul(my_type=np.int32, defines={}): + + # Create matrix + matrix_M = defines['matrix_M'] + matrix_N = defines['matrix_N'] + matrix_P = defines['matrix_P'] + MAX = select_maxval(my_type) + 
A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + B = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + C = np.matmul(A, B) + + A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type) + B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type) + C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32) + + return [A, B, C], defines + + +############################################################################## + + +def generate_qcmatmul(my_type=np.int32, defines={}): + MAX = 2**15 + FIXED_POINT = 15 + + # Create matrix + matrix_M = defines['matrix_M'] + matrix_N = defines['matrix_N'] + matrix_P = defines['matrix_P'] + A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) + 1j * \ + np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) + B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) + 1j * \ + np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) + [Cr, Ci] = qmath.qcmatmul(A.real, A.imag, B.real, + B.imag, FIXED_POINT, my_type) + + A = np.reshape(A, (matrix_M * matrix_N), order='C') + B = np.reshape(B, (matrix_N * matrix_P), order='C') + A = np.column_stack((A.imag, A.real)).astype(my_type).flatten() + B = np.column_stack((B.imag, B.real)).astype(my_type).flatten() + C = np.column_stack((Ci, Cr)).astype(my_type).flatten() + + return [A, B, C], defines + + def generate_qchest(defines={}, fixed_point=15, my_type=np.int16): N_TX = defines['N_TX'] @@ -164,19 +440,137 @@ def generate_qchest(defines={}, fixed_point=15, my_type=np.int16): return [qvector_pilot_tx, qvector_pilot_rx, qvector_Hest], defines +def generate_qccholesky(defines={}, fixed_point=15, my_type=np.int32): + + matrix_N = defines['matrix_N'] + FIXED_POINT = defines['FIXED_POINT'] + N_SAMPLES = defines['N_SAMPLES'] + + vA = np.zeros([N_SAMPLES, 2 * matrix_N * matrix_N], dtype=my_type) + vL = np.zeros([N_SAMPLES, 2 * matrix_N * matrix_N], dtype=my_type) + vy = np.zeros([N_SAMPLES, 2 * matrix_N], dtype=my_type) + for k in range(N_SAMPLES): + + Ar = np.random.normal(0, 1, [matrix_N, matrix_N]).astype(np.float32) + Ai = np.random.normal(0, 1, [matrix_N, matrix_N]).astype(np.float32) + A = Ar + 1.j * Ai + G = np.matmul(A.conj().T, A) + MAX_A = max(np.abs(A.real).max(), np.abs(A.imag).max()) + MAX_G = max(np.abs(G.real).max(), np.abs(G.imag).max()) + MAX = max(MAX_A, MAX_G) + + Ar = np.round((Ar / MAX) * 2**FIXED_POINT).astype(int) + Ai = np.round((Ai / MAX) * 2**FIXED_POINT).astype(int) + Ar = Ar + np.eye(matrix_N, dtype=int) * 256 + Ai = Ai + np.eye(matrix_N, dtype=int) * 256 + + Ar, Ai = qmath.qcmatmul(Ar.T, -Ai.T, Ar, Ai, FIXED_POINT, my_type) + Lr, Li = qmath.qccholesky( + Ar, Ai, fixed_point=FIXED_POINT, mytype=my_type) + + A = np.column_stack((Ar, Ai)).astype(my_type).flatten() + L = np.column_stack((Lr, Li)).astype(my_type).flatten() + vA[k, :] = np.reshape(A, (2 * matrix_N * matrix_N), + order='C').astype(my_type) + vL[k, :] = np.reshape(L, (2 * matrix_N * matrix_N), + order='C').astype(my_type) + + vA = np.reshape(vA, (2 * matrix_N * matrix_N * N_SAMPLES)).astype(my_type) + vL = np.reshape(vL, (2 * matrix_N * matrix_N * N_SAMPLES)).astype(my_type) + return [vA, vL], defines + + def generate_qcholesky(defines={}, fixed_point=15, my_type=np.int32): matrix_N = defines['matrix_N'] FIXED_POINT = defines['FIXED_POINT'] + N_SAMPLES = defines['N_SAMPLES'] - A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type) - y = irandom(size=matrix_N, MAX=2**14, my_type=my_type) - A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type) 
- L = qmath.qcholesky(A, fixed_point=FIXED_POINT, mytype=my_type) + vA = np.zeros([N_SAMPLES, matrix_N * matrix_N], dtype=my_type) + vL = np.zeros([N_SAMPLES, matrix_N * matrix_N], dtype=my_type) + vy = np.zeros([N_SAMPLES, matrix_N], dtype=my_type) + for k in range(N_SAMPLES): + A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type) + y = irandom(size=matrix_N, MAX=2**14, my_type=my_type) + A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type) + L = qmath.qcholesky(A, FIXED_POINT, my_type) - A = np.reshape(A, (matrix_N * matrix_N), order='C').astype(my_type) - L = np.reshape(L, (matrix_N * matrix_N), order='C').astype(my_type) - return [A, L, y], defines + vA[k, :] = np.reshape(A, (matrix_N * matrix_N), + order='C').astype(my_type) + vL[k, :] = np.reshape(L, (matrix_N * matrix_N), + order='C').astype(my_type) + vy[k, :] = np.reshape(y, matrix_N, order='C').astype(my_type) + + vA = np.reshape(vA, (matrix_N * matrix_N * N_SAMPLES)).astype(my_type) + vL = np.reshape(vL, (matrix_N * matrix_N * N_SAMPLES)).astype(my_type) + vy = np.reshape(vy, (matrix_N * N_SAMPLES)).astype(my_type) + + return [vA, vL, vy], defines + + +def generate_qmmse(defines={}, fixed_point=15, my_type=np.int32): + + FIXED_POINT = defines['FIXED_POINT'] + N_tx = defines['N_TX'] + N_rx = defines['N_RX'] + N_itr = defines['N_ITR'] + + vN = np.zeros([N_itr, 2 * N_tx], dtype=np.int16) + vH = np.zeros([N_itr, 2 * N_tx * N_rx], dtype=np.int16) + vG = np.zeros([N_itr, 2 * N_tx * N_tx], dtype=np.int16) + vy = np.zeros([N_itr, 2 * N_rx], dtype=np.int16) + vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16) + + for k in range(N_itr): + + # Floating point inputs + rH = np.random.normal(0, 1, [N_rx, N_tx]).astype(np.float32) + iH = np.random.normal(0, 1, [N_rx, N_tx]).astype(np.float32) + rN = np.random.normal(0, 1, [N_rx]).astype(np.float32) + ry = np.random.normal(0, 1, [N_rx]).astype(np.float32) + iy = np.random.normal(0, 1, [N_rx]).astype(np.float32) + H = rH + 1j * iH + y = ry + 1j * iy + G = np.matmul(H.conj().T, H) + rN * np.eye(H.shape[1]) + y1 = np.dot(H.conj().T, y) + + # Rescale inputs + H_max = max(np.abs(H.real).max(), np.abs(H.imag).max()) + G_max = max(np.abs(G.real).max(), np.abs(G.imag).max()) + y_max = max(np.abs(y.real).max(), np.abs(y.imag).max()) + y1_max = max(np.abs(y1.real).max(), np.abs(y1.imag).max()) + N_max = np.abs(rN).max() + MAX = max(H_max, G_max, N_max, y_max) + SCALE_FACTOR = 2**FIXED_POINT + rH = np.round((H.real / MAX) * SCALE_FACTOR).astype(int) + iH = np.round((H.imag / MAX) * SCALE_FACTOR).astype(int) + ry = np.round((y.real / MAX) * SCALE_FACTOR).astype(int) + iy = np.round((y.imag / MAX) * SCALE_FACTOR).astype(int) + rN = np.round((rN / MAX) * SCALE_FACTOR).astype(int) + 1024 + + # Hermitian + rG, iG = qmath.qcmatmul(rH.T, -iH.T, rH, iH, FIXED_POINT, my_type) + ry1, iy1 = qmath.qcmvmul(rH.T, -iH.T, ry, iy, FIXED_POINT, my_type) + np.fill_diagonal(rG, rG.diagonal() + rN) + + # Solve linear system + rL, iL = qmath.qccholesky(rG, iG, FIXED_POINT, my_type) + ry2, iy2 = qmath.qinvertLt(rL, iL, ry1, iy1, FIXED_POINT, my_type) + rx, ix = qmath.qinvertUt(rL.T, -iL.T, ry2, iy2, FIXED_POINT, my_type) + + vN[k, :] = np.column_stack( + (rN, np.zeros(np.size(rN)))).astype(my_type).flatten() + vH[k, :] = np.column_stack((rH, iH)).astype(my_type).flatten() + vG[k, :] = np.column_stack((rG, iG)).astype(my_type).flatten() + vy[k, :] = np.column_stack((ry, iy)).astype(my_type).flatten() + vx[k, :] = np.column_stack((rx, ix)).astype(my_type).flatten() + + vN = np.reshape(vN, (2 * N_tx * 
N_itr)).astype(my_type) + vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type) + vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type) + vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type) + vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type) + return [vN, vH, vG, vy, vx], defines def generate_cfft_q16(defines={}, fixed_point=15, my_type=np.int16): diff --git a/software/data/generate_cfft.py b/software/data/generate_cfft.py deleted file mode 100755 index 2412c278d..000000000 --- a/software/data/generate_cfft.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the cfft kernel. -# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - -# Function to generate the expected result of the testcase. - - -def generate_cfft_q16(N): - # Q16: - # len=16: Q1.15 -> Q5.11 - # len=32: Q1.15 -> Q6.10 - # len=64: Q1.15 -> Q7.9 - # len=128: Q1.15 -> Q8.8 - # len=256: Q1.15 -> Q9.7 - # len=512: Q1.15 -> Q10.6 - # len=1024: Q1.15 -> Q11.5 - # len=2048: Q1.15 -> Q12.4 - # len=4096: Q1.15 -> Q13.3 - src = (np.random.randint(-2**(15), 2**(15) - 1, - 2 * N, dtype=np.int16)).astype(np.int16) - - bit_shift_dict_q16 = { - 16: 11, - 32: 10, - 64: 9, - 128: 8, - 256: 7, - 512: 6, - 1024: 5, - 2048: 4, - 4096: 3} - my_fixpoint = 15 - dst = np.zeros(2 * N, dtype=np.int16) - complex_src = np.zeros(N, dtype=np.csingle) - complex_dst = np.zeros(N, dtype=np.csingle) - for i in range(N): - shift = 2**(my_fixpoint) - complex_src[i] = (src[2 * i].astype(np.csingle) / shift) + \ - 1j * (src[2 * i + 1].astype(np.csingle) / shift) - complex_dst = np.fft.fft(complex_src) - for i in range(N): - shift = 2**(bit_shift_dict_q16[N]) - dst[2 * i] = (np.real(complex_dst[i]) * shift).astype(np.int16) - dst[2 * i + 1] = (np.imag(complex_dst[i]) * shift).astype(np.int16) - return src, dst - - -def generate_cfft_f16(N): - # src = np.random.rand(N).astype(np.float16) - # src = src + 1.j * np.random.rand(N).astype(np.float16) - src = np.cos(np.linspace(0, N / 4, num=N)).astype(np.float16) - src = src + 1.j * np.sin(np.linspace(0, N / 4, num=N)).astype(np.float16) - dst = np.fft.fft(src) - src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten() - dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten() - return src, dst - - -def generate_twiddleCoefq15(N): - PI = 3.14159265358979 - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 - - -def generate_twiddleCoeff16(N): - PI = np.pi - twiddleCoeff16 = np.zeros((int)(2 * 3 * N / 4), np.float16) - for i in range(0, int(3 * N / 4)): - twiddleCoeff16_sin = np.sin(i * 2 * PI / N).astype(np.float16) - twiddleCoeff16_cos = np.cos(i * 2 * PI / N).astype(np.float16) - twiddleCoeff16[2 * i] = twiddleCoeff16_sin - twiddleCoeff16[2 * i + 1] = twiddleCoeff16_cos - return twiddleCoeff16 - - -def generate_bitreversal(N, R): - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx 
// R - if (idx > 1): - logR2.append(int(M.log2(idx))) - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - return np.ndarray.flatten(np.array(tps)) - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - file = outdir / f"{kwargs['name']}.h" - print(tpl, outdir, kwargs['name']) - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=64, - help='Input dimension' - ) - - args = parser.parse_args() - - # Create inputs cfft_q16 - Len = args.dimension - src_cfft_q16, dst_cfft_q16 = generate_cfft_q16(Len) - twi_cfft_q16 = generate_twiddleCoefq15(Len) - brv_cfft_q16 = generate_bitreversal(Len, 2) - tolerance = { - 16: 16, - 32: 20, - 64: 24, - 128: 28, - 256: 32, - 512: 48, - 1024: 64, - 2048: 96, - 4096: 128} - - kwargs = {'name': 'data_cfft_radix4_q16', - 'vector_inp': src_cfft_q16, - 'vector_res': dst_cfft_q16, - 'vector_twi': twi_cfft_q16, - 'vector_bitrev': brv_cfft_q16, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': len(brv_cfft_q16), - 'tolerance': tolerance[int(Len)]} - gen_data_header_file( - args.outdir, - pathlib.Path(__file__).parent.absolute() / - "data_cfft_q16.h.tpl", - **kwargs) - - kwargs = {'name': 'data_cfft_radix2_q16', - 'vector_inp': src_cfft_q16, - 'vector_res': dst_cfft_q16, - 'vector_twi': twi_cfft_q16, - 'vector_bitrev': brv_cfft_q16, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': int(2 * len(brv_cfft_q16)), - 'tolerance': tolerance[int(Len)]} - gen_data_header_file( - args.outdir, - pathlib.Path(__file__).parent.absolute() / - "data_cfft_q16.h.tpl", - **kwargs) - - # Create inputs cfft_f16 - Len = args.dimension - src_cfft_f16, dst_cfft_f16 = generate_cfft_f16(Len) - twi_cfft_f16 = generate_twiddleCoeff16(Len) - - kwargs = {'name': 'data_cfft_radix4_f16', - 'vector_inp': src_cfft_f16, - 'vector_res': dst_cfft_f16, - 'vector_twi': twi_cfft_f16, - 'vector_bitrev': brv_cfft_q16, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': len(brv_cfft_q16)} - gen_data_header_file( - args.outdir, - pathlib.Path(__file__).parent.absolute() / - "data_cfft_f16.h.tpl", - **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_chest.py b/software/data/generate_chest.py deleted file mode 100755 index e11eb8b62..000000000 --- a/software/data/generate_chest.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the Channel estimation. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib - -from mako.template import Template - -################## -# write_result # -################## - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def q_sat(x): - if x > 2**15 - 1: - return x - 2**16 - elif x < -2**15: - return x + 2**16 - else: - return x - - -def generate_chest_f16(nb_tx, nb_rx, nb_samples): - H = np.random.randn(nb_rx, nb_tx) + 1j * np.random.randn(nb_rx, nb_tx) - vector_pilot_tx = [] - vector_pilot_rx = [] - vector_Hest = [] - for k in range(nb_samples): - - # Compute data division - # pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx)) - # pilot_rx = np.dot(H, pilot_tx) - # Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :] - - # Compute data multiplication - pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx)) - pilot_rx = np.dot(H, pilot_tx) - pilot_tx = np.reciprocal(pilot_tx) - Hest = pilot_rx[:, np.newaxis] * pilot_tx[np.newaxis, :] - - # Interleaved real and imaginary parts - pilot_tx = np.column_stack( - (pilot_tx.imag, pilot_tx.real)).astype( - np.float16).flatten() - pilot_rx = np.column_stack( - (pilot_rx.imag, pilot_rx.real)).astype( - np.float16).flatten() - Hest = Hest.flatten() - Hest = np.column_stack( - (Hest.imag, Hest.real)).astype( - np.float16).flatten() - - # Output vectors - vector_pilot_tx.append(pilot_tx) - vector_pilot_rx.append(pilot_rx) - vector_Hest.append(Hest) - - vector_pilot_rx = np.concatenate(vector_pilot_rx, axis=0) - vector_pilot_tx = np.concatenate(vector_pilot_tx, axis=0) - vector_Hest = np.concatenate(vector_Hest, axis=0) - return vector_pilot_tx, vector_pilot_rx, vector_Hest - -# Compute the channel estimate - - -def compute_chest_q16(in_rx, in_tx, p): - n_rx = in_rx.size - n_tx = in_tx.size - result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16) - for i in range(n_rx): - a_r = in_rx[i].real - a_i = in_rx[i].imag - for j in range(n_tx): - b_r = in_tx[j].real - b_i = in_tx[j].imag - -# # Compute data division -# den = (2**16) // (b_r * b_r + b_i * b_i) -# num_r = (a_r * b_r) + (a_i * b_i) -# num_i = (a_i * b_r) - (a_r * b_i) -# result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p) -# result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p) - - # Compute data multiplication - num_r = (a_r * b_r) - (a_i * b_i) - num_i = (a_i * b_r) + (a_r * b_i) - result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p) - result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p) - return result - - -def generate_chest_q16(nb_tx, nb_rx, nb_samples): - FIXED_POINT = 8 - MAX = 2**7 - - qvector_pilot_tx = [] - qvector_pilot_rx = [] - qvector_Hest = [] - for k in range(nb_samples): - # Create pilots - pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_rx) - pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_tx) - # Compute Hest - Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT) - - pilot_tx = np.column_stack( - (pilot_tx.imag, pilot_tx.real)).astype( - np.int16).flatten() - pilot_rx = np.column_stack( - (pilot_rx.imag, pilot_rx.real)).astype( - np.int16).flatten() - qvector_pilot_tx.append(pilot_tx) - qvector_pilot_rx.append(pilot_rx) - qvector_Hest.append(Hest) - - qvector_pilot_tx = 
np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples]) - qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples]) - qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples]) - return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-b", - "--num_rx", - type=int, - required=False, - default=32, - help='Number beams' - ) - parser.add_argument( - "-l", - "--num_tx", - type=int, - required=False, - default=4, - help='Number layers' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=32, - help='Number samples' - ) - - args = parser.parse_args() - nb_tx = args.num_tx - nb_rx = args.num_rx - nb_samples = args.num_samples - - pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl" - kwargs = {'name': 'data_chest_q16', - 'pilot_tx': pilot_tx, - 'pilot_rx': pilot_rx, - 'Hest': Hest, - 'nb_tx': nb_tx, - 'nb_rx': nb_rx, - 'nb_samples': nb_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - pilot_tx, pilot_rx, Hest = generate_chest_f16(nb_tx, nb_rx, nb_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_f16.h.tpl" - kwargs = {'name': 'data_chest_f16', - 'pilot_rx': pilot_rx, - 'pilot_tx': pilot_tx, - 'Hest': Hest, - 'nb_tx': nb_tx, - 'nb_rx': nb_rx, - 'nb_samples': nb_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_cholesky.py b/software/data/generate_cholesky.py deleted file mode 100644 index 1a25c4206..000000000 --- a/software/data/generate_cholesky.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 cholesky. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from scipy.linalg import solve_triangular -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def generate_cholesky_q32(n_matrix): - # Create hermitian matrix - L = np.random.randint(-2**(15), 2**(15) - 1, - size=(n_matrix, n_matrix), dtype=np.int32) - L = np.tril(L).astype(np.int32) - G = np.dot(np.asmatrix(L), np.asmatrix(L).transpose()) - - y = np.random.randint(-2**(15), 2**(15) - 1, n_matrix, dtype=np.int32) - - # Linear system solution - y = solve_triangular(L, y, lower=True) - # x = solve_triangular(np.asmatrix(L).T, y) - - # Reshape - G = np.reshape( - np.asarray(G), - (n_matrix * n_matrix), - order='C').astype( - np.int32) - L = np.reshape( - np.asarray(L), - (n_matrix * n_matrix), - order='C').astype( - np.int32) - y = np.reshape(np.asarray(y), (n_matrix), order='C').astype(np.int32) - - return G, L, y - - -def generate_cholesky_q16(n_matrix, n_samples): - vector_G = [] - vector_L = [] - for k in range(n_samples): - # Create hermitian matrix - H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, - dtype=np.int16) + \ - 1.j * np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, - dtype=np.int16) - H = H.reshape(n_matrix, n_matrix) - # Matrix to be inverted - H_h = (np.asmatrix(H).H) - # H_H = np.asmatrix(H).H - G = H_h * H - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Reshape - G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C') - L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C') - G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten() - # Output vectors - vector_G.append(G) - vector_L.append(L) - - vector_G = np.concatenate(vector_G, axis=0) - vector_L = np.concatenate(vector_L, axis=0) - return vector_G, vector_L - - -def generate_cholesky_f16(n_matrix, n_samples): - vector_G = [] - vector_L = [] - for k in range(n_samples): - # Create hermitian matrix - H = np.random.rand(n_matrix, n_matrix) + 1.j * \ - np.random.rand(n_matrix, n_matrix) - # Matrix to be inverted - # H_H = np.asmatrix(H).H - G = np.matmul(H, np.asmatrix(H).H) - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Reshape - G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C') - L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C') - G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten() - # Output vectors - vector_G.append(G) - vector_L.append(L) - - vector_G = np.concatenate(vector_G, axis=0) - vector_L = np.concatenate(vector_L, axis=0) - return vector_G, vector_L - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-n", - "--dimension", - type=int, - required=False, - default=4, - help='Matrix dimension' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=256, 
- help='Number samples' - ) - - args = parser.parse_args() - n_matrix = args.dimension - n_samples = args.num_samples - - G, L, y = generate_cholesky_q32(n_matrix) - tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_q32.h.tpl" - kwargs = {'name': 'data_cholesky_q32', - 'G': G, - 'L': L, - 'y': y, - 'n_matrix': n_matrix} - gen_data_header_file(args.outdir, tpl, **kwargs) - - vector_G, vector_L = generate_cholesky_q16(n_matrix, n_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_q16.h.tpl" - kwargs = {'name': 'data_cholesky_q16', - 'G': vector_G, - 'L': vector_L, - 'n_matrix': n_matrix, - 'n_samples': n_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - vector_G, vector_L = generate_cholesky_f16(n_matrix, n_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_f16.h.tpl" - kwargs = {'name': 'data_cholesky_f16', - 'G': vector_G, - 'L': vector_L, - 'n_matrix': n_matrix, - 'n_samples': n_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py deleted file mode 100644 index b5e7410af..000000000 --- a/software/data/generate_dotp.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -def generate_dotp_i32(Len): - - # Create matrix - MAX = 2**7 - 1 - A = np.random.randint(-MAX, MAX - 1, size=Len) - B = np.random.randint(-MAX, MAX - 1, size=Len) - C = np.dot(A, B) - return A, B, C - - -def generate_dotp_f32(Len): - - # Create matrix - A = np.random.randn(Len).astype(np.float32) - B = np.random.randn(Len).astype(np.float32) - C = (np.dot(A, B)).astype(np.float32) - return A, B, C - - -def generate_dotp_f16(Len): - - # Create matrix - A = np.random.randn(Len).astype(np.float16) - B = np.random.randn(Len).astype(np.float16) - C = (np.dot(A, B)).astype(np.float16) - return A, B, C - - -def generate_axpy_f32(Len): - - # Create matrix - X = np.random.rand(Len).astype(np.float32) - Y = np.random.rand(Len).astype(np.float32) - A = np.float32(3.14) - out = Y + A * X - return A, X, Y, out - - -def generate_axpy_f16(Len): - - # Create matrix - X = np.random.rand(Len).astype(np.float16) - Y = np.random.rand(Len).astype(np.float16) - A = np.float16(3.14) - out = Y + A * X - return A, X, Y, out - -################## -# compute_result # -################## - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-n", - "--length", - type=int, - required=False, - default=1024, - help='First dimension.' 
- ) - - args = parser.parse_args() - Len = args.length - - A, B, C = generate_dotp_i32(Len) - tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_i32.h.tpl" - kwargs = { - 'name': 'data_dotp_i32', - 'A': A, - 'B': B, - 'C': C, - 'Len': Len} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, B, C = generate_dotp_f32(Len) - tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f32.h.tpl" - kwargs = { - 'name': 'data_dotp_f32', - 'A': A, - 'B': B, - 'C': C, - 'Len': Len} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, B, C = generate_dotp_f16(Len) - tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f16.h.tpl" - kwargs = { - 'name': 'data_dotp_f16', - 'A': A, - 'B': B, - 'C': C, - 'Len': Len} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, X, Y, out = generate_axpy_f32(Len) - tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl" - kwargs = { - 'name': 'data_axpy_f32', - 'A': A, - 'X': X, - 'Y': Y, - 'out': out, - 'Len': Len} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, X, Y, out = generate_axpy_f16(Len) - tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl" - kwargs = { - 'name': 'data_axpy_f16', - 'A': A, - 'X': X, - 'Y': Y, - 'out': out, - 'Len': Len} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_matmul.py b/software/data/generate_matmul.py deleted file mode 100644 index 1b2edc9bc..000000000 --- a/software/data/generate_matmul.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -def generate_cmatmul_f16(matrix_M, matrix_N, matrix_P): - - # Create matrix - A = np.random.rand(matrix_M, matrix_N) + 1j * \ - np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) + 1j * \ - np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C') - B = np.reshape(B, (matrix_N * matrix_P), order='C') - C = np.reshape(C, (matrix_M * matrix_P), order='C') - - A = np.column_stack((A.imag, A.real)).astype(np.float16).flatten() - B = np.column_stack((B.imag, B.real)).astype(np.float16).flatten() - C = np.column_stack((C.imag, C.real)).astype(np.float16).flatten() - - return A, B, C - - -def generate_cmatmul_q16(matrix_M, matrix_N, matrix_P): - MAX = 2**15 - FIXED_POINT = 15 - - # Create matrix - A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) - B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) - - C = np.zeros((matrix_M, matrix_P), dtype=complex) - for k in range(matrix_P): - for i in range(matrix_M): - for j in range(matrix_N): - a = A[i][j].real - b = A[i][j].imag - c = B[j][k].real - d = B[j][k].imag - C[i][k] += (a * c - b * d) // (1 << FIXED_POINT) - C[i][k] += (b * c + a * d) // (1 << FIXED_POINT) * 1j - - A = np.reshape(A, (matrix_M * matrix_N), order='C') - B = np.reshape(B, (matrix_N * matrix_P), order='C') - C = np.reshape(C, (matrix_M * matrix_P), order='C') - - A = np.column_stack((A.imag, A.real)).astype(np.int16).flatten() - B = np.column_stack((B.imag, B.real)).astype(np.int16).flatten() - C = np.column_stack((C.imag, C.real)).astype(np.int16).flatten() - - return A, B, C - - -def generate_matmul_f16(matrix_M, matrix_N, matrix_P): - - # Create matrix - A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16) - B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16) - - return A, B, C - - -def generate_matmul_f32(matrix_M, matrix_N, matrix_P): - - # Create matrix - A = np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32) - - return A, B, C - -################## -# compute_result # -################## - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' 
- ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' - ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - A, B, C = generate_cmatmul_f16(matrix_M, matrix_N, matrix_P) - tpl = pathlib.Path(__file__).parent.absolute() / "data_cmatmul_f16.h.tpl" - kwargs = { - 'name': 'data_cmatmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, B, C = generate_cmatmul_q16(matrix_M, matrix_N, matrix_P) - tpl = pathlib.Path(__file__).parent.absolute() / "data_cmatmul_q16.h.tpl" - kwargs = { - 'name': 'data_cmatmul_q16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, B, C = generate_matmul_f16(matrix_M, matrix_N, matrix_P) - tpl = pathlib.Path(__file__).parent.absolute() / "data_matmul_f16.h.tpl" - kwargs = { - 'name': 'data_matmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - gen_data_header_file(args.outdir, tpl, **kwargs) - - A, B, C = generate_matmul_f32(matrix_M, matrix_N, matrix_P) - tpl = pathlib.Path(__file__).parent.absolute() / "data_matmul_f32.h.tpl" - kwargs = { - 'name': 'data_matmul_f32', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py deleted file mode 100644 index f8918f561..000000000 --- a/software/data/generate_mimo_mmse.py +++ /dev/null @@ -1,232 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 mmse. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template -import pyflexfloat as ff -from scipy.linalg import solve_triangular - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def generate_fmmse(N_tx, N_rx, N_itr, my_type): - - vH = np.zeros([N_itr, N_tx * 2 * N_rx], dtype=my_type) - vG = np.zeros([N_itr, N_tx * 2 * N_tx], dtype=my_type) - vy = np.zeros([N_itr, 2 * N_rx], dtype=my_type) - vN = np.zeros([N_itr, 2 * N_tx], dtype=my_type) - vx = np.zeros([N_itr, 2 * N_tx], dtype=my_type) - - for k in range(N_itr): - - # Create input vector - y = np.random.rand(N_rx).astype(my_type) + 1.j * \ - np.random.rand(N_rx).astype(my_type) - - # Create channel matrix - H = np.random.rand(N_rx, N_tx).astype(my_type) + 1.j * \ - np.random.rand(N_rx, N_tx).astype(my_type) - # Generate noise variance - N = np.random.rand(1).astype(my_type) - - # Matrix to be inverted in MMSE estimator - H_h = np.asmatrix(H).H - G = np.matmul(H_h, H) + N * np.eye(H.shape[1]) - N = N * np.ones(N_tx) - - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Linear system solution - y1 = np.transpose(np.dot(H_h, y)) - y2 = solve_triangular(L, y1, lower=True) - x = solve_triangular(np.asmatrix(L).H, y2) - - H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C') - G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') - N = np.column_stack((N.real, N.imag)).astype(my_type).flatten() - H = np.column_stack((H.real, H.imag)).astype(my_type).flatten() - G = np.column_stack((G.real, G.imag)).astype(my_type).flatten() - x = np.column_stack((x.real, x.imag)).astype(my_type).flatten() - y = np.column_stack((y.real, y.imag)).astype(my_type).flatten() - - vH[k, :] = H - vG[k, :] = G - vy[k, :] = y - vN[k, :] = N - vx[k, :] = x - - vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(my_type) - vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type) - vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type) - vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type) - vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type) - - return vN, vH, vG, vy, vx - - -def generate_mimo_mmse_q16(N_tx, N_rx, N_itr): - - vN = np.zeros([N_itr, 2 * N_tx], dtype=np.int16) - vH = np.zeros([N_itr, 2 * N_tx * N_rx], dtype=np.int16) - vG = np.zeros([N_itr, 2 * N_tx * N_tx], dtype=np.int16) - vy = np.zeros([N_itr, 2 * N_rx], dtype=np.int16) - vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16) - MAX = 2**15 - for k in range(N_itr): - # Create channel matrix - rH = np.random.randint(-MAX, MAX - 1, N_rx * N_tx, dtype=np.int16) - iH = np.random.randint(-MAX, MAX - 1, N_rx * N_tx, dtype=np.int16) - H = rH + 1.j * iH - # Create input vector - y = np.random.randint(-MAX, MAX - 1, N_rx, dtype=np.int16) + 1.j * \ - np.random.randint(-MAX, MAX - 1, N_rx, dtype=np.int16) - # Generate noise variance - N = np.random.randint(-MAX, MAX - 1, N_tx, dtype=np.int16) - - H = H.reshape(N_rx, N_tx) - # Matrix to be inverted in MMSE estimator - H_h = (np.asmatrix(H).H) - # Hermitian - G = np.matmul(H_h, H) + N - - # Matrix vector product - y1 = np.transpose(np.dot(H_h, y)) - # Cholesky decomposition - # L = np.linalg.cholesky(G) - L = G - # Linear system solution - y2 = solve_triangular(L, y1, lower=True) - x = solve_triangular(np.asmatrix(L).H, y2) - - vN[k, :] = 
np.column_stack((N.real, N.imag)).astype(np.int16).flatten() - vH[k, :] = np.column_stack((H.real, H.imag)).astype(np.int16).flatten() - vG[k, :] = np.column_stack((G.real, G.imag)).astype(np.int16).flatten() - vy[k, :] = np.column_stack((y.real, y.imag)).astype(np.int16).flatten() - vx[k, :] = np.column_stack((x.real, x.imag)).astype(np.int16).flatten() - - vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(np.int16) - vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(np.int16) - vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(np.int16) - vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(np.int16) - vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(np.int16) - - return vN, vH, vG, vy, vx - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-n", - "--transmitters", - type=int, - required=False, - default=4, - help='First dimension.' - ) - parser.add_argument( - "-m", - "--receivers", - type=int, - required=False, - default=32, - help='First dimension.' - ) - parser.add_argument( - "-k", - "--iterations", - type=int, - required=False, - default=1, - help='Iterations.' - ) - - args = parser.parse_args() - N_tx = args.transmitters - N_rx = args.receivers - N_itr = args.iterations - - vN, vH, vG, vy, vx = generate_fmmse( - N_tx, N_rx, N_itr, np.float32) - tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f32.h.tpl" - kwargs = {'name': 'data_mimo_mmse_f32', - 'H': vH, - 'G': vG, - 'N': vN, - 'y': vy, - 'x': vx, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': N_itr} - gen_data_header_file(args.outdir, tpl, **kwargs) - - vN, vH, vG, vy, vx = generate_fmmse( - N_tx, N_rx, N_itr, np.float16) - tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f16.h.tpl" - kwargs = {'name': 'data_mimo_mmse_f16', - 'H': vH, - 'G': vG, - 'N': vN, - 'y': vy, - 'x': vx, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': N_itr} - gen_data_header_file(args.outdir, tpl, **kwargs) - - vN, vH, vG, vy, vx = generate_fmmse( - N_tx, N_rx, N_itr, np.float16) - tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f8.h.tpl" - kwargs = {'name': 'data_mimo_mmse_f8', - 'H': ff.array(vH, "e5m2"), - 'G': vG, - 'N': ff.array(vN, "e5m2"), - 'y': ff.array(vy, "e5m2"), - 'x': vx, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': N_itr} - gen_data_header_file(args.outdir, tpl, **kwargs) - - vN, vH, vG, vy, vx = generate_mimo_mmse_q16(N_tx, N_rx, N_itr) - tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_q16.h.tpl" - kwargs = {'name': 'data_mimo_mmse_q16', - 'H': vH, - 'G': vG, - 'N': vN, - 'y': vy, - 'x': vx, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': N_itr} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/generate_ofdm.py b/software/data/generate_ofdm.py deleted file mode 100644 index 64b0a7ca6..000000000 --- a/software/data/generate_ofdm.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -# Author: Marco Bertuletti, ETH Zurich - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - -################## -# compute_result # -################## - - -def compute_bitreversal(N, R): - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - return tps - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"data_{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-rx", - "--receivers", - type=int, - required=False, - default=64, - help='First dimension.' - ) - parser.add_argument( - "-bs", - "--beams", - type=int, - required=False, - default=32, - help='Second dimension.' - ) - parser.add_argument( - "-sc", - "--subcarriers", - type=int, - required=False, - default=4096, - help='Iterations.' 
-    )
-
-    args = parser.parse_args()
-    N_rx = args.receivers
-    N_bs = args.beams
-    N_sc = args.subcarriers
-
-    pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16)
-    pTw_coef = (np.random.rand(int(3 * N_sc / 4))).astype(np.float16)
-    pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16)
-    pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16)
-
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2)))
-
-    kwargs = {'name': 'ofdm',
-              'pFFT_src': pFFT_src,
-              'pTw_coef': pTw_coef,
-              'pBF_coef': pBF_coef,
-              'pBF_dst': pBF_dst,
-              'bitrev': Bitreversal,
-              'N_rx': N_rx,
-              'N_bs': N_bs,
-              'N_sc': N_sc,
-              'Log2Len': int(np.log2(N_sc)),
-              'BitrevLen': len(Bitreversal)}
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index 121267545..3b42bdb80 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -135,6 +135,9 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
         asm volatile("fccdotpex.s.h %0, %1, %2;"
                      : "+&r"(asbs)
                      : "r"(cd), "r"(ab));
+        // asm volatile("fcndotpex.s.h %0, %1, %2;"
+        //              : "+&r"(asbs)
+        //              : "r"(cd), "r"(ab));
       }
       asm volatile("pv.shuffle2.h %0, %0, %[mask];"
                    : "+&r"(asbs)
diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h
index cd9134968..39bf46394 100644
--- a/software/kernels/baremetal/mempool_linearsolver_q16s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_q16s.h
@@ -28,7 +28,7 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
   // Solve for each variable x[i] in loop
   for (i = 0; i < n; i++) {
     uint32_t ridx = transposed ? (n - i - 1) : i;
-    diag = pL[2U * (ridx * offset + ridx)];
+    diag = pL[2U * (ridx + ridx)];
     // Initialize the sums
     as = 0;
     bs = 0;
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
index 45076f9fe..91e3aa789 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
@@ -254,38 +254,40 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
   for (i = 0; i < n_tx; i++) {
     if (n_tx % 4 != 0) {
-      as0 = 0.0f; // Initialize the real part of sums
-      bs0 = 0.0f; // Initialize the imag part of sums
-      // Inner Loop
-      for (k = 0; k < n_rx; k++) {
-        ab = (*(v2h *)&pH[2U * (k * n_tx + i)]);
-        cd0 = (*(v2h *)&pH[2U * (k * n_tx + j)]);
-        // dotproducts (ac + bd) + j (ad - bc)
-        asm volatile(
-            // a * c + b * d
-            "vfdotpex.s.h %[as0], %[ab], %[cd0];"
-            "pv.shuffle2.h %[cd0], %[cd0], %[shuffle_mask];"
-            "xor %[cd0], %[neg_mask], %[cd0];"
-            // a * d - b * c
-            "vfdotpex.s.h %[bs0], %[ab], %[cd0];"
-            : [cd0] "+&r"(cd0), [as0] "+&r"(as0), [bs0] "+&r"(bs0)
-            : [ab] "r"(ab), [neg_mask] "r"(neg_mask),
-              [shuffle_mask] "r"(shuffle_mask)
-            :);
-      }
-      // Store
-      v2h res0;
-      asm volatile("vfcpka.h.s %0, %1, %2;"
-                   : "=&r"(res0)
-                   : "r"(as0), "r"(bs0)
-                   :);
-      if (zf == 0) {
-        asm volatile("and %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF));
-        asm volatile("fadd.h %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i]));
+      for (j = 0; j < n_tx; j++) {
+        as0 = 0.0f; // Initialize the real part of sums
+        bs0 = 0.0f; // Initialize the imag part of sums
+        // Inner Loop
+        for (k = 0; k < n_rx; k++) {
+          ab = (*(v2h *)&pH[2U * (k * n_tx + i)]);
+          cd0 = (*(v2h *)&pH[2U * (k * n_tx + j)]);
+          // dotproducts (ac + bd) + j (ad - bc)
+          asm volatile(
+              // a * c + b * d
+              "vfdotpex.s.h %[as0], %[ab], %[cd0];"
+              "pv.shuffle2.h %[cd0], %[cd0], %[shuffle_mask];"
+              "xor %[cd0], %[neg_mask], %[cd0];"
+              // a * d - b * c
+              "vfdotpex.s.h %[bs0], %[ab], %[cd0];"
+              : [cd0] "+&r"(cd0), [as0] "+&r"(as0), [bs0] "+&r"(bs0)
+              : [ab] "r"(ab), [neg_mask] "r"(neg_mask),
+                [shuffle_mask] "r"(shuffle_mask)
+              :);
+        }
+        // Store
+        v2h res0;
+        asm volatile("vfcpka.h.s %0, %1, %2;"
+                     : "=&r"(res0)
+                     : "r"(as0), "r"(bs0)
+                     :);
+        if (zf == 0) {
+          asm volatile("and %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF));
+          asm volatile("fadd.h %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i]));
+        }
+        // Store
+        uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j);
+        (*(v2h *)&pG[addr]) = res0;
       }
-      // Store
-      uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j);
-      (*(v2h *)&pG[addr]) = res0;
     } else {
       // UNROLL_4
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
index baad28e0d..70d77b82d 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
@@ -127,8 +127,7 @@ void mempool_hermitian_f32s(float *pH, float *pG, float *pS,
   @return none
 */
 void mempool_MVP_conjtransp_f32s(float *pH, float *px, float *py,
-                                 const uint32_t n_rx, const uint32_t n_tx,
-                                 const uint32_t folded) {
+                                 const uint32_t n_rx, const uint32_t n_tx) {

   uint32_t i, j;
   float a0, a1, a2, a3;
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
index 0d68e3d9d..3ce36f3b6 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -99,25 +99,22 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
       // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
       // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
-      "vfdotpex.s.h %[s0],%[CoSi1],%[D];"
-      "vfdotpex.s.h %[s1],%[C1],%[D];"
-
+      "vfdotpex.s.r.h %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.r.h %[s1],%[C1],%[D];"
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
       // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h %[s2],%[CoSi2],%[B];"
-      "vfdotpex.s.h %[s3],%[C2],%[B];"
-
+      "vfdotpex.s.r.h %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.r.h %[s3],%[C2],%[B];"
       // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
       // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
-      "vfdotpex.s.h %[s4],%[CoSi3],%[C];"
-      "vfdotpex.s.h %[s5],%[C3],%[C];"
-
+      "vfdotpex.s.r.h %[s4],%[CoSi3],%[C];"
+      "vfdotpex.s.r.h %[s5],%[C3],%[C];"
       // xb', yb'
-      "vfcpka.h.s %[B], %[s1], %[s0];"
+      "vfcpka.h.s %[D], %[s1], %[s0];"
       // xc', yc'
-      "vfcpka.h.s %[C], %[s3], %[s2];"
+      "vfcpka.h.s %[B], %[s3], %[s2];"
       // xd', yd'
-      "vfcpka.h.s %[D], %[s5], %[s4];"
+      "vfcpka.h.s %[C], %[s5], %[s4];"
       : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
         [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
@@ -127,9 +124,9 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
         [neg_mask] "r"(0x3C00BC00)
         :);
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = C;
-  *((v2h *)&pOut[i2_store * 2U]) = B;
-  *((v2h *)&pOut[i3_store * 2U]) = D;
+  *((v2h *)&pOut[i1_store * 2U]) = B;
+  *((v2h *)&pOut[i2_store * 2U]) = D;
+  *((v2h *)&pOut[i3_store * 2U]) = C;
 }

 /**
@@ -227,18 +224,18 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
       // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
       // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
-      "vfdotpex.s.h %[s0],%[CoSi1],%[D];"
-      "vfdotpex.s.h %[s1],%[C1],%[D];"
+      "vfdotpex.s.r.h %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.r.h %[s1],%[C1],%[D];"
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
       // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h %[s2],%[CoSi2],%[B];"
-      "vfdotpex.s.h %[s3],%[C2],%[B];"
+      "vfdotpex.s.r.h %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.r.h %[s3],%[C2],%[B];"
       // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
       // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
-      "vfdotpex.s.h %[s4],%[CoSi3],%[C];"
-      "vfdotpex.s.h %[s5],%[C3],%[C];"
+      "vfdotpex.s.r.h %[s4],%[CoSi3],%[C];"
+      "vfdotpex.s.r.h %[s5],%[C3],%[C];"
       // xb', yb'
       "vfcpka.h.s %[B], %[s1], %[s0];"
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
index e7bd7edc5..c6b4acf6b 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
@@ -60,14 +60,14 @@
 #endif

 void mempool_radix4_cfft_f16p(__fp16 *pSrc16, uint32_t fftLen,
-                              const __fp16 *pCoef16, uint32_t twidCoefModifier,
-                              uint32_t nPE) {
+                              const __fp16 *pCoef16, uint32_t nPE) {
   uint32_t absolute_core_id = mempool_get_core_id();
   uint32_t core_id = absolute_core_id % nPE;
   __fp16 t0, t1, t2, t3, t4, t5;
   v2h CoSi1, CoSi2, CoSi3;
   v2h C1, C2, C3;
   uint32_t n1, n2, ic, i0, j, k;
+  uint32_t twidCoefModifier = 1;
   uint32_t step, steps;

   /* START OF FIRST STAGE PROCESSING */
@@ -165,17 +165,17 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
   uint32_t n1, n2;
   uint32_t i0, k, ic;
   __fp16 *pTmp;
-  uint32_t twidCoefModifier = 1U;
+  uint32_t twidCoefModifier = 1;
 #endif

   /* START OF FIRST STAGE PROCESSING */
   n1 = fftLen;
-  n2 = n1 >> 2U;
+  n2 = n1 >> 2;
   for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
 #ifdef FOLDED_TWIDDLES
     ic = i0;
-    ic_store = ic >> 2U;
-    n2_store = n2 >> 2U;
+    ic_store = ic >> 2;
+    n2_store = n2 >> 2;
 #else
     ic = i0;
 #endif
@@ -192,22 +192,22 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
   pCoef_src = pCoef_dst;
   pCoef_dst = pTmp;
 #else
-  twidCoefModifier <<= 2U;
+  twidCoefModifier <<= 2;
 #endif
   mempool_log_partial_barrier(2, absolute_core_id, nPE);
   /* END OF FIRST STAGE PROCESSING */

   /* START OF MIDDLE STAGE PROCESSING */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+  for (k = fftLen / 4U; k > 4; k >>= 2) {
     n1 = n2;
-    n2 >>= 2U;
+    n2 >>= 2;
     for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
 #ifdef FOLDED_TWIDDLES
       ic = i0;
       // (ic % n2) / 4 take only every 4th index in the wing
       // (ic / n2) * n2 shift of the wing size
       ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
-      n2_store = n2 >> 2U;
+      n2_store = n2 >> 2;
 #else
       ic = (i0 % n2) * twidCoefModifier;
 #endif
@@ -224,7 +224,7 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
     pCoef_src = pCoef_dst;
    pCoef_dst = pTmp;
 #else
-    twidCoefModifier <<= 2U;
+    twidCoefModifier <<= 2;
 #endif
     mempool_log_partial_barrier(2, absolute_core_id, nPE);
   }