diff --git a/config/terapool.mk b/config/terapool.mk index 0f1c264f8..30153749e 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -32,7 +32,7 @@ banking_factor ?= 4 # Access latency between remote groups # Options: "7", "9" or "11": -remote_group_latency_cycles ?= 7 +remote_group_latency_cycles ?= 11 # Radix for hierarchical AXI interconnect axi_hier_radix ?= 9 @@ -45,4 +45,4 @@ dmas_per_group ?= 4 # Brust Length = 16 # L2 Banks/Channels l2_banks = 16 -l2_size ?= 16777216 # 1000000 \ No newline at end of file +l2_size ?= 16777216 # 1000000 diff --git a/hardware/Makefile b/hardware/Makefile index 6de18eb4f..2f5f75a71 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -46,7 +46,7 @@ verilator_top ?= mempool_tb_verilator # Python python ?= python3 # Enable tracing -snitch_trace ?= 1 +snitch_trace ?= 0 # Path to DRAMsys dramsys_resouces_path ?= $(MEMPOOL_DIR)/hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs diff --git a/software/apps/baremetal/conv2d_db/main.c b/software/apps/baremetal/conv2d_db/main.c index 8e5510dff..71eca3644 100644 --- a/software/apps/baremetal/conv2d_db/main.c +++ b/software/apps/baremetal/conv2d_db/main.c @@ -26,75 +26,13 @@ dump(time, 0); volatile int32_t in[M * N] __attribute__((section(".l1_prio"))); volatile int32_t out[M * N] __attribute__((section(".l1_prio"))); -volatile int32_t round_barrier[NUM_CORES*BANKING_FACTOR] __attribute__((section(".l1_prio"))); -volatile int32_t out_l2[M * N] __attribute__((section(".l2"))) -__attribute__((aligned(NUM_CORES * BANKING_FACTOR * 4))); - -uint32_t final_log_barrier(uint32_t* round_barrier, uint32_t step, uint32_t log2_radix, - uint32_t core_id) { - uint32_t next_step = step << log2_radix; - uint32_t barrier_idx = (core_id / next_step) * next_step + step - 1; - uint32_t *log_barrier = &round_barrier[barrier_idx*BANKING_FACTOR]; - - uint32_t val = __atomic_fetch_add(log_barrier, step, __ATOMIC_RELAXED); - if (val == NUM_CORES - step) { - // Last core of last stage - dump_time(2); - return (uint32_t)log_barrier; - } else if (val == (uint32_t)(next_step - step)) { - // Last core of this stage - __atomic_store_n(log_barrier, 0, __ATOMIC_RELAXED); - return final_log_barrier(round_barrier, step << log2_radix, log2_radix, core_id); - } else { - if (val == 0 && log_barrier == &round_barrier[0]) { - dump_time(1); - } - // Middle cores, sleep - mempool_wfi(); - } - return 0; -} - -uint32_t dma_log_barrier(uint32_t* round_barrier, uint32_t step, uint32_t log2_radix, uint32_t core_id) { - uint32_t next_step = step << log2_radix; - uint32_t barrier_idx = (core_id / next_step) * next_step + step - 1; - uint32_t *log_barrier = &round_barrier[barrier_idx*BANKING_FACTOR]; - - uint32_t val = __atomic_fetch_add(log_barrier, step, __ATOMIC_RELAXED); - if (val == NUM_CORES - step) { - // Last core of last stage - dump_time(2); - // Clear wfi that was triggered by the first core - mempool_wfi(); - return (uint32_t)log_barrier; - } else if (val == (uint32_t)(next_step - step)) { - // Last core of this stage - __atomic_store_n(log_barrier, 0, __ATOMIC_RELAXED); - return dma_log_barrier(round_barrier, step << log2_radix, log2_radix, core_id); - } else if (val == 0 && log_barrier == &round_barrier[0]) { - // First core of first barrier in first stage - dump_time(1); - // Check that the DMA from the previous iteration is done - dma_wait(); - // Wake up all cores to get to work - wake_up_all(); - mempool_wfi(); - dump_time(0); - } else { - // Middle cores, sleep - mempool_wfi(); - } - return 0; -} +volatile int32_t out_l2[M * N] __attribute__((section(".l2"))); int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); mempool_barrier_init(core_id); - // Initial setup - round_barrier[core_id*BANKING_FACTOR] = 0; - int32_t kernel[KERNEL_N * KERNEL_N]; kernel[0] = 1; @@ -109,89 +47,24 @@ int main() { kernel[7] = 2; kernel[8] = 1; - // Double-buffered convolution - const int last_round = 4; - const uint32_t log2_radix = LOG_RADIX; - - const int32_t *in_comp; - const int32_t *in_dma; - int32_t *out_comp; - int32_t *out_dma; - uint32_t bar; - - // Wait at barrier until everyone is ready - mempool_barrier(num_cores); - mempool_start_benchmark(); - - // Initialize img + // Initialize Matrices if (core_id == 0) { - dma_memcpy_nonblocking((void *)in, (void *)in_l2, M * N / 2 * sizeof(int32_t)); - // Initial launch, Core 0 launched the data transfer - wake_up_all(); - dump_time(0); + dma_memcpy_blocking((void *)in, (void *)in_l2, M * N * sizeof(int32_t)); } + mempool_barrier(num_cores); mempool_start_benchmark(); - - for (int round = 0; round < last_round; ++round) { - if (round % 2 == 0) { - in_comp = (const int32_t *)&in[0]; - out_comp = (int32_t *)&out[0]; - in_dma = (const int32_t *)&in[N * M / 2]; - out_dma = (int32_t *)&out[N * M / 2]; - } else { - in_dma = (const int32_t *)&in[0]; - out_dma = (int32_t *)&out[0]; - in_comp = (const int32_t *)&in[N * M / 2]; - out_comp = (int32_t *)&out[N * M / 2]; - } - mempool_wfi(); - // Barrier, launch DMA for next iteration - bar = dma_log_barrier(round_barrier, 1, log2_radix, core_id); - mempool_start_benchmark(); - if (bar) { - // We are the last one, reset the barrier - // The old data can now be overwritten with a new DMA request - if (round != last_round - 1) { - dma_memcpy_nonblocking((void *)in_dma, (void *)in_l2, - M * N / 2 * sizeof(int32_t)); - } - if (round != 0) { - dma_memcpy_nonblocking((void *)out_l2, (void *)out_dma, - M * N / 2 * sizeof(int32_t)); - } - // We are the last one, reset the barrier - __atomic_store_n((uint32_t *)bar, 0, __ATOMIC_RELAXED); - if (round != last_round - 1) { - wake_up_all(); - } - } else { - // Wait until the core checking the DMA gives the signal - // mempool_wfi(); - } - mempool_start_benchmark(); - conv2d_3x3_crazy_parallel((const int32_t *)in_comp, N, M / 2, - (const int32_t *)kernel, (int32_t *)out_comp, - core_id, NUM_CORES); - mempool_start_benchmark(); - } - - // Last write back - bar = final_log_barrier(round_barrier, 1, log2_radix, core_id); + conv2d_3x3_crazy_parallel((const int32_t *)in, N, M, + (const int32_t *)kernel, (int32_t *)out, + core_id, NUM_CORES); mempool_start_benchmark(); - if (bar) { - // We are the last one, reset the barrier - // The old data can now be overwritten with a new DMA request - dma_memcpy_blocking((void *)out_l2, (void *)out_dma, - M * N / 2 * sizeof(int32_t)); - // We are the last one, reset the barrier - __atomic_store_n((uint32_t *)bar, 0, __ATOMIC_RELAXED); - wake_up_all(); - mempool_wfi(); - } - + mempool_log_barrier(8, core_id); mempool_start_benchmark(); - mempool_barrier(num_cores); + conv2d_3x3_crazy_parallel((const int32_t *)in, N, M, + (const int32_t *)kernel, (int32_t *)out, + core_id, NUM_CORES); + mempool_start_benchmark(); + mempool_log_barrier(8, core_id); mempool_stop_benchmark(); return 0;