Skip to content

Commit

Permalink
[software] Adapt to new folder structure
Browse files (browse the repository at this point in the history)
  • Loading branch information
mbertuletti committed Jul 5, 2024
1 parent d3f5650 commit f3f9212
Show file tree
Hide file tree
Showing 69 changed files with 1,102 additions and 4,496 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ toolchain/riscv-opcodes/*:

format:
$(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR)
find ./software/runtime/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
find ./software/data -name '*.py' -exec autopep8 --in-place --aggressive {} +

clean: clean-riscv-tests
rm -rf $(INSTALL_DIR)
4 changes: 2 additions & 2 deletions software/apps/baremetal/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
ALL := $(APPS)

ALL_GCC := $(filter-out matmul_f16 matmul_f32, $(ALL))
ALL_LLVM := $(filter-out synth_i32 chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))
ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL))
ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL))

# Make all applications
all: $(ALL_GCC)
Expand Down
117 changes: 66 additions & 51 deletions software/apps/baremetal/cfft_radix4_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,35 +10,53 @@
#include <string.h>

/* Mempool runtime libraries */
#include "builtins_v2.h"
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "xpulp/builtins_v2.h"

/* CFFT data libraries */
#include "data/data_cfft_radix4_f16.h"

/*
- FOLDED: Parallel FFT with "memory-aware" load/store scheme
- SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
load/store scheme
- N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
- N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
- FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
*/

#define FOLDED
#include "data_cfft_radix4_f16.h"

/* CHOOSE ONE */
//#define SINGLE // Single core FFT.
//#define PARALLEL // Parallel FFT not "memory-aware".
//#define FOLDED // Parallel FFT with "memory-aware" load/store.
#define SCHEDULED // Folded FFTs arranged in rows and cols.

// Bitreversal index from table.
#define BITREVERSETABLE
// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 2
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 2
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#endif
// Also the twiddles have "memory-aware" load/stores.
#define FOLDED_TWIDDLES
#define N_FFTs_ROW 1
#define N_FFTs_COL 1

#include "kernel/mempool_checks.h"
#include "kernel/mempool_radix4_cfft_butterfly_f16.h"
#include "kernel/mempool_radix4_cfft_f16p.h"
#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
#include "baremetal/mempool_cfft_q16_bitreversal.h"
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_radix4_cfft_butterfly_f16.h"
#include "baremetal/mempool_radix4_cfft_f16p.h"

#if (defined(SINGLE) || defined(PARALLEL))
__fp16 l1_pSrc[2 * N_CSAMPLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_pDst[2 * N_CSAMPLES]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
#endif

#if (defined(SCHEDULED) || defined(FOLDED))
__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
Expand All @@ -49,49 +67,44 @@ __fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
#endif

int main() {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
mempool_barrier_init(core_id);
__fp16 *pRes = (__fp16 *)0;
mempool_barrier_init(core_id);

/* INITIALIZATION */

if (core_id == 0) {
// Each FFT is folded over 4 memory rows
// Each memory row is 2 * N_BANKS (real-imag) samples
for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
(N_CSAMPLES * N_FFTs_COL) * sizeof(int32_t));
for (uint32_t i = 0; i < N_FFTs_COL; i++) {
dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
l2_pSrc, N_CSAMPLES * sizeof(int32_t));
}
}
dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
3 * (N_CSAMPLES / 4) * sizeof(int32_t));
BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
}
// Initialize the Twiddles folded
#ifdef FOLDED_TWIDDLES
mempool_barrier(num_cores);
for (uint32_t j = 0; j < N_FFTs_COL; j++) {
uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
*(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * N_WORDS_COL)] =
*(v2h *)&l2_twiddleCoef_f16[2U * i];
*(v2h *)&l1_twiddleCoef_f16_src[2U *
(i + j * N_WORDS_COL + 1 * N_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)];
*(v2h *)&l1_twiddleCoef_f16_src[2U *
(i + j * N_WORDS_COL + 2 * N_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)];
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
*(v2h *)&l2_twiddleCoef_f16[2 * i];
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
}
}
#endif
if (core_id == 0) {
printf("01: END INITIALIZATION\n");
}
mempool_barrier(num_cores);

#if (defined(FOLDED) && defined(FOLDED_TWIDDLES))
#ifdef FOLDED
if (core_id < (N_CSAMPLES / 16)) {
mempool_start_benchmark();
mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES,
Expand All @@ -105,26 +118,28 @@ int main() {
#endif

#ifdef SCHEDULED
uint32_t nPE = (N_CSAMPLES / 16);
if (core_id < N_FFTs_COL * nPE) {
uint32_t CORES_USED = (N_CSAMPLES / 4) / BANKING_FACTOR;
if (core_id < N_FFTs_COL * CORES_USED) {
mempool_start_benchmark();
uint32_t N_WORDS_COL = N_CSAMPLES / 4;
uint32_t col_id = core_id / nPE;
mempool_radix4_cfft_f16p_scheduler(
l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL,
l1_twiddleCoef_f16_src + 2 * col_id * N_WORDS_COL,
l1_twiddleCoef_f16_dst + 2 * col_id * N_WORDS_COL, l1_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH, 1, nPE);
pRes = l1_pDst;
mempool_log_partial_barrier(2, core_id, N_FFTs_COL * nPE);
l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH, 1, CORES_USED);
mempool_log_partial_barrier(2, core_id, N_FFTs_COL * CORES_USED);
mempool_stop_benchmark();
}
#ifdef BITREVERSETABLE
pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
#else
pRes = ((LOG2 / 2) % 2) == 0 ? l1_pDst : l1_pSrc;
#endif
#endif

mempool_barrier(num_cores);
if (core_id == 0) {
printf("02: END COMPUTATION\n");
}

mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
mempool_barrier(num_cores);
return 0;
Expand Down
7 changes: 3 additions & 4 deletions software/apps/baremetal/chest_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,16 @@
#include <stdlib.h>
#include <string.h>

#include "builtins_v2.h"
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "builtins_v2.h"

#include "data_chest_f16.h"
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_chest_f16p.h"
#include "baremetal/mempool_chest_f16s.h"
#include "baremetal/mempool_chest_f16.h"
#include "data_chest_f16.h"

//#define SINGLE
#define PARALLEL
Expand Down
4 changes: 2 additions & 2 deletions software/apps/baremetal/chest_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ int main() {
#endif
#ifdef PARALLEL
mempool_start_benchmark();
mempool_chest_q16p_unrolled4_local(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX,
N_TX, N_SAMPLES, core_id, num_cores);
mempool_chest_q16p_unrolled4(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX, N_TX,
N_SAMPLES, core_id, num_cores);
mempool_stop_benchmark();
mempool_barrier(num_cores);
#endif
Expand Down
1 change: 0 additions & 1 deletion software/apps/baremetal/cholesky_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "builtins_v2.h"

#include "data_cholesky_f16.h"

Expand Down
5 changes: 2 additions & 3 deletions software/apps/baremetal/cholesky_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "xpulp/builtins_v2.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cholesky_q16s.h"
#include "data_cholesky_q16.h"
#include "kernel/mempool_checks.h"
#include "kernel/mempool_cholesky_q16s.h"

#define SINGLE

Expand Down
7 changes: 4 additions & 3 deletions software/apps/baremetal/cmatmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
#include "synchronization.h"

#include "data_cmatmul_f16.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cmatmul_f16.h"
#define PARALLEL_2x2
#define PARALLEL_2x4
#define TEST

__fp16 matrix_a[2 * dim_M * dim_N]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
Expand All @@ -26,7 +28,7 @@ __fp16 matrix_b[2 * dim_N * dim_P]
__fp16 matrix_c[2 * dim_M * dim_P]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
section(".l1_prio")));
__fp16 matrix_a_folded[2 * dim_M * (4 * NUM_CORES)]
__fp16 matrix_a_folded[2 * (BANKING_FACTOR * NUM_CORES)]
__attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
section(".l1_prio")));

Expand Down Expand Up @@ -73,7 +75,6 @@ int main() {
mempool_start_benchmark();
cmatmul_2x4_f16p(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P, core_id,
nPE);
mempool_log_partial_barrier(2, core_id, nPE);
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand Down
6 changes: 3 additions & 3 deletions software/apps/baremetal/cmatmul_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
#include "runtime.h"
#include "synchronization.h"

#include "data/data_cmatmul_q16.h"
#include "kernel/mempool_checks.h"
#include "kernel/mempool_cmatmul_q16.h"
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cmatmul_q16.h"
#include "data_cmatmul_q16.h"

#define PARALLEL

Expand Down
29 changes: 18 additions & 11 deletions software/apps/baremetal/mimo_mmse_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,26 @@
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "builtins_v2.h"

#include "data_mimo_mmse_f16.h"
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cholesky_f16s.h"
#include "baremetal/mempool_linearsolver_f16s.h"
#include "baremetal/mempool_mimo_mmse_f16s.h"

//#define DOUBLE_BUFFERING
#include "data_mimo_mmse_f16.h"

// #define DOUBLE_BUFFERING
// #define N_ROUNDS (1)
// #define DMA_TRANSFER2

#ifndef DOUBLE_BUFFERING

#define SINGLE
//#define PARALLEL
/**********************************************/
/* TEST OF THE KERNELS WITH NO DATA MOVEMENTS */
/**********************************************/

//#define SINGLE
#define PARALLEL
//#define FOLDED

__fp16 l1_H[2 * N_TX * N_RX * N_ITR]
Expand Down Expand Up @@ -110,7 +117,7 @@ int main() {
Ptrx += 2 * itr_bg * N_TX_bg;
}
}
mempool_log_barrier(2, core_id);
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

Expand Down Expand Up @@ -139,7 +146,7 @@ int main() {
mempool_Ltrisol_folded_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_folded_f16s(PtrL, Ptry3, Ptrx, N_TX);
}
mempool_log_barrier(2, core_id);
mempool_barrier(num_cores);
mempool_stop_benchmark();
#endif

Expand Down Expand Up @@ -244,8 +251,8 @@ int main() {
__fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
__fp16 *Ptry3 = y3 + itr * (2 * N_TX);
mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
Expand Down Expand Up @@ -294,8 +301,8 @@ int main() {
__fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX);
__fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
__fp16 *Ptry2 = y2 + itr * (2 * N_TX);
mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
}
mempool_log_barrier(2, core_id);

Expand Down
Loading

0 comments on commit f3f9212

Please sign in to comment.