From edc87cfd35afcfa5a7688220ca1b6af3b6c2cdea Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 15 Sep 2023 10:21:30 +0200 Subject: [PATCH] sw: Add `experimental_offload` app --- target/sim/sw/device/Makefile | 1 + .../device/apps/experimental_offload/Makefile | 19 ++ .../apps/experimental_offload/src/axpy_job.h | 134 ++++++++++++++ .../apps/experimental_offload/src/offload.c | 123 +++++++++++++ target/sim/sw/host/Makefile | 1 + .../host/apps/experimental_offload/Makefile | 23 +++ .../apps/experimental_offload/src/offload.c | 167 ++++++++++++++++++ .../apps/experimental_offload/src/offload.h | 81 +++++++++ 8 files changed, 549 insertions(+) create mode 100644 target/sim/sw/device/apps/experimental_offload/Makefile create mode 100644 target/sim/sw/device/apps/experimental_offload/src/axpy_job.h create mode 100644 target/sim/sw/device/apps/experimental_offload/src/offload.c create mode 100644 target/sim/sw/host/apps/experimental_offload/Makefile create mode 100644 target/sim/sw/host/apps/experimental_offload/src/offload.c create mode 100644 target/sim/sw/host/apps/experimental_offload/src/offload.h diff --git a/target/sim/sw/device/Makefile b/target/sim/sw/device/Makefile index 5e3b27ee5..c6e2d8b96 100644 --- a/target/sim/sw/device/Makefile +++ b/target/sim/sw/device/Makefile @@ -7,6 +7,7 @@ # Add user applications to APPS variable APPS = blas/axpy APPS += blas/gemm +APPS += experimental_offload TARGET ?= all diff --git a/target/sim/sw/device/apps/experimental_offload/Makefile b/target/sim/sw/device/apps/experimental_offload/Makefile new file mode 100644 index 000000000..777aad23e --- /dev/null +++ b/target/sim/sw/device/apps/experimental_offload/Makefile @@ -0,0 +1,19 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+SNITCH_CLUSTER_DIR = $(shell bender path snitch_cluster)
+
+APP = experimental_offload
+SRCS = src/offload.c
+INCDIRS = ../../../host/apps/experimental_offload/src
+INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/blas/axpy/src
+INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/lcg
+INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/montecarlo
+
+# Define number of clusters to use
+RISCV_CFLAGS ?= -DN_CLUSTERS_TO_USE=1
+
+include ../common.mk
diff --git a/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
new file mode 100644
index 000000000..5c7f74aee
--- /dev/null
+++ b/target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
@@ -0,0 +1,134 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#define XSSR
+#include "axpy.h"
+
+void axpy_job_dm_core(job_t* job) {  // DM core: stage x/y in L1, write z to L3
+#ifdef MULTICAST
+    axpy_local_job_t* axpy_job = (axpy_local_job_t*)job;
+#else
+    axpy_local_job_t* axpy_job = (axpy_local_job_t*)l1_job_ptr;
+#endif
+
+    snrt_mcycle();  // Retrieve job information (get job arguments)
+
+#ifndef MULTICAST
+    // Copy job info (the last used cluster already holds it, no copy needed)
+    if (snrt_cluster_idx() != (N_CLUSTERS_TO_USE - 1))
+        snrt_dma_start_1d(axpy_job, job, sizeof(axpy_job_t));
+
+    // Get pointer to next free slot in l1 alloc
+    double* x = (double*)(ALIGN_UP(
+        (uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096));
+
+    // Wait for job info transfer to complete
+    snrt_dma_wait_all();
+    snrt_mcycle();  // Retrieve job operands
+#else
+    snrt_mcycle();  // Retrieve job operands
+    // Get pointer to next free slot in l1 alloc
+    double* x = (double*)(ALIGN_UP(
+        (uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096));
+#endif
+
+    // Copy operand x (each cluster fetches its own slice of the vectors)
+    size_t size = axpy_job->args.l * 8;  // l doubles, 8 bytes each
+    size_t offset = snrt_cluster_idx() * size;  // this cluster's slice
+    void* x_l3_ptr = (void*)(uint32_t)(axpy_job->args.x_l3_ptr + offset);
+    snrt_dma_start_1d(x, x_l3_ptr, size);
+
+#ifndef MULTICAST
+    // Synchronize with compute cores before updating the l1 alloc pointer
+    // such that they can retrieve the local job pointer.
+    // Also ensures compute cores see the transferred job information.
+    snrt_cluster_hw_barrier();
+#endif
+
+    // Copy operand y (placed right after x in L1)
+    double* y = (double*)((uint32_t)x + size);
+    void* y_l3_ptr = (void*)(uint32_t)(axpy_job->args.y_l3_ptr + offset);
+    snrt_dma_start_1d(y, y_l3_ptr, size);
+
+    // Set pointers to local job operands (z is placed right after y)
+    axpy_job->args.x = x;
+    axpy_job->args.y = y;
+    axpy_job->args.z = (double*)((uint32_t)y + size);
+
+    // Synchronize with compute cores again such that they see
+    // also the local job operands locations (x, y, z)
+    snrt_cluster_hw_barrier();
+
+    // Update the L1 alloc pointer (first free byte after z)
+    void* next = (void*)((uint32_t)(axpy_job->args.z) + size);
+    snrt_l1_update_next(next);
+
+    // Wait for DMA transfers to complete
+    snrt_dma_wait_all();
+
+    snrt_mcycle();  // Barrier
+
+    // Synchronize with compute cores to make sure the data
+    // is available before they can start computing on it
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();  // Job execution
+
+    // Synchronize cores to make sure results are available before
+    // DMA starts transfer to L3
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();  // Writeback job outputs
+
+    // Transfer data out
+    void* z_l3_ptr = (void*)(uint32_t)(axpy_job->args.z_l3_ptr + offset);
+    snrt_dma_start_1d(z_l3_ptr, axpy_job->args.z, size);
+    snrt_dma_wait_all();
+
+    snrt_mcycle();
+
+#ifdef MULTICAST
+    return_to_cva6_accelerated(axpy_job->offload_id);
+#else
+    return_to_cva6(SYNC_CLUSTERS);
+#endif
+}
+
+void axpy_job_compute_core(job_t* job) {  // Compute cores: run axpy on L1 data
+    // Cast local job (the caller passes the copy located at l1_job_ptr)
+    axpy_local_job_t* axpy_job = (axpy_local_job_t*)job;
+
+    snrt_mcycle();
+
+    // Get args
+    uint32_t l = axpy_job->args.l;
+    double a = axpy_job->args.a;
+
+    // Synchronize with DM core to wait for local job
+    // operand pointers (x, y, z) to be up to date
+    snrt_cluster_hw_barrier();
+
+    double* x = axpy_job->args.x;
+    double* y = axpy_job->args.y;
+    double* z = axpy_job->args.z;
+
+    snrt_mcycle();
+
+    // Synchronize with DM core to wait for operands
+    // to be fully transferred in L1
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();
+
+    // Run kernel
+    axpy(l, a, x, y, z);
+
+    snrt_mcycle();
+
+    // Synchronize with DM core to make sure results are available
+    // before DMA starts transfer to L3
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();
+}
diff --git a/target/sim/sw/device/apps/experimental_offload/src/offload.c b/target/sim/sw/device/apps/experimental_offload/src/offload.c
new file mode 100644
index 000000000..7eb2e398c
--- /dev/null
+++ b/target/sim/sw/device/apps/experimental_offload/src/offload.c
@@ -0,0 +1,123 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "offload.h"
+#include "snrt.h"
+
+#define N_JOB_TYPES 2  // presumably one slot per job_id_t value
+
+// Thread-local state: host-shared user data and job buffer addresses
+__thread usr_data_t* volatile usr_data_ptr;
+__thread uint32_t l1_job_ptr;
+__thread uint32_t remote_job_ptr;
+
+#include "axpy_job.h"
+// #include "montecarlo_job.h"
+
+// Job function type
+typedef void (*job_func_t)(job_t* job);
+
+// Job function arrays, indexed by job ID (only the AXPY entry is populated)
+__thread job_func_t jobs_dm_core[N_JOB_TYPES] = {
+    axpy_job_dm_core /*, mc_job_dm_core*/};
+__thread job_func_t jobs_compute_core[N_JOB_TYPES] = {
+    axpy_job_compute_core /*, mc_job_compute_core*/};
+
+static inline void run_job() {  // Dispatch the pending job by core role
+    // Force compiler to assign fallthrough path of the branch to
+    // the DM core. This way the cache miss latency due to the branch
+    // is incurred by the compute cores, and overlaps with the data
+    // movement performed by the DM core.
+    asm goto("bnez %0, %l[run_job_compute_core]"
+             :
+             : "r"(snrt_is_compute_core())
+             :
+             : run_job_compute_core);
+
+#ifndef OFFLOAD_NONE
+    // Retrieve job data pointer
+#ifdef MULTICAST
+    job_t* job = (job_t*)l1_job_ptr;
+#else
+    job_t* job = (job_t*)remote_job_ptr;
+#endif
+
+    // Invoke job (the job ID is used as an index without a range check)
+    uint32_t job_id = job->id;
+    jobs_dm_core[job_id](job);
+
+#else
+    return_to_cva6(SYNC_ALL);
+#endif
+
+    goto run_job_end;
+
+run_job_compute_core:;
+
+#ifndef OFFLOAD_NONE
+    // Get pointer to local copy of job
+    job_t* job_local = (job_t*)l1_job_ptr;
+
+#ifndef MULTICAST
+    // Synchronize with DM core such that it knows
+    // it can update the l1 alloc pointer, and we know
+    // job information is locally available
+    snrt_cluster_hw_barrier();
+#endif
+
+    // Invoke job (the job ID is used as an index without a range check)
+    jobs_compute_core[job_local->id](job_local);
+#else
+    snrt_cluster_hw_barrier();
+    snrt_int_wait_mcip_clr();
+#endif
+
+run_job_end:;
+}
+
+int main() {  // Init, publish job buffer address, then serve jobs forever
+    // Get user data pointer
+    usr_data_ptr =
+        (usr_data_t * volatile) get_communication_buffer()->usr_data_ptr;
+
+    // Tell CVA6 where it can store the job information
+    l1_job_ptr = (uint32_t)snrt_l1_next();
+    snrt_cluster_hw_barrier();
+    if (snrt_is_dm_core()) {
+        // Only one core sends the data for all clusters
+#ifdef MULTICAST
+        if (snrt_cluster_idx() == 0)
+#else
+        if (snrt_cluster_idx() == (N_CLUSTERS_TO_USE - 1))
+#endif
+            usr_data_ptr->l1_job_ptr = l1_job_ptr;
+    }
+    snrt_cluster_hw_barrier();
+
+#ifdef OFFLOAD_MONTECARLO
+    if (snrt_is_compute_core()) mc_init();
+#endif
+
+    // Notify CVA6 when snRuntime initialization is done
+    snrt_int_clr_mcip();
+    return_to_cva6(SYNC_ALL);
+    snrt_wfi();
+
+#ifndef MULTICAST
+    // Get pointer to remote job in the publishing (last used) cluster's TCDM
+    remote_job_ptr = usr_data_ptr->l1_job_ptr;
+#endif
+
+    // Job loop
+    while (1) {
+        snrt_mcycle();  // Clear interrupt
+        snrt_int_clr_mcip_unsafe();
+
+        snrt_mcycle();  // Retrieve job information (get job pointer)
+        run_job();
+
+        snrt_mcycle();  // Sleep
+        snrt_wfi();
+    }
+}
diff --git
a/target/sim/sw/host/Makefile b/target/sim/sw/host/Makefile index 217287043..441baf416 100644 --- a/target/sim/sw/host/Makefile +++ b/target/sim/sw/host/Makefile @@ -7,6 +7,7 @@ # Add user applications to APPS variable APPS = hello_world APPS += offload +APPS += experimental_offload TARGET ?= all diff --git a/target/sim/sw/host/apps/experimental_offload/Makefile b/target/sim/sw/host/apps/experimental_offload/Makefile new file mode 100644 index 000000000..e6d6da643 --- /dev/null +++ b/target/sim/sw/host/apps/experimental_offload/Makefile @@ -0,0 +1,23 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +BLAS_DIR = $(shell bender path snitch_cluster)/sw/blas + +APP = experimental_offload +SRCS = src/offload.c +INCDIRS = $(BLAS_DIR) +INCL_DEVICE_BINARY = true + +# Define application and number of clusters to use +RISCV_CFLAGS ?= -DOFFLOAD_AXPY -DN_CLUSTERS_TO_USE=1 + +SECTION = .wide_spm + +include $(BLAS_DIR)/axpy/Makefile +include ../common.mk + +# Create data.h dependency +$(DEP): $(DATA_DIR)/data.h diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.c b/target/sim/sw/host/apps/experimental_offload/src/offload.c new file mode 100644 index 000000000..98075f372 --- /dev/null +++ b/target/sim/sw/host/apps/experimental_offload/src/offload.c @@ -0,0 +1,167 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "offload.h"
+#include <stddef.h>
+#include "host.c"
+
+#include "axpy/data/data.h"
+#define N_JOBS 2
+
+#define WIDE_SPM_ADDR(X) \
+    ((X) - (uint64_t)(&__wide_spm_start) + SPM_WIDE_BASE_ADDR)
+
+#ifdef N_CLUSTERS_TO_USE
+const int n_clusters_to_use = N_CLUSTERS_TO_USE;
+#else
+const int n_clusters_to_use = N_CLUSTERS;
+#endif
+
+extern volatile uint64_t __wide_spm_start;
+
+usr_data_t usr_data __attribute__((section(".nc_spm")));
+
+double pi __attribute__((section(".wide_spm")));
+
+static inline void send_job_and_wakeup(job_t *job, uint64_t l1_job_ptr) {
+    // Write the job (ID, offload ID, arguments) field by field to the buffer
+    // at l1_job_ptr, then wake up the Snitch cores so that they process it.
+
+    switch (job->id) {
+        case J_AXPY: {
+            axpy_args_t args = job->args.axpy;
+
+#ifdef MULTICAST
+            uint64_t mask = ((n_clusters_to_use - 1) << 18);
+            enable_multicast(mask);
+#endif
+            *((volatile uint64_t *)(l1_job_ptr)) = job->id;
+            *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) =
+                job->offload_id;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(axpy_args_t, l))) = args.l;
+            *((volatile double *)(l1_job_ptr + offsetof(job_t, args) +
+                                  offsetof(axpy_args_t, a))) = args.a;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(axpy_args_t, x_ptr))) = args.x_ptr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(axpy_args_t, y_ptr))) = args.y_ptr;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(axpy_args_t, z_ptr))) = args.z_ptr;
+
+            mcycle();  // Wakeup
+#ifdef MULTICAST
+            *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511;
+            disable_multicast();
+#else
+            wakeup_snitches();
+#endif
+            break;
+        }
+        case J_MONTECARLO: {
+            mc_args_t args = job->args.mc;
+
+#ifdef MULTICAST
+            uint64_t mask = ((n_clusters_to_use - 1) << 18);
+            enable_multicast(mask);
+#endif
+            *((volatile uint64_t *)(l1_job_ptr)) = job->id;
+            *((volatile uint8_t *)(l1_job_ptr + offsetof(job_t, offload_id))) =
+                job->offload_id;
+            *((volatile uint32_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(mc_args_t, n_samples))) =
+                args.n_samples;
+            *((volatile uint64_t *)(l1_job_ptr + offsetof(job_t, args) +
+                                    offsetof(mc_args_t, result_ptr))) =
+                args.result_ptr;
+
+            mcycle();  // Wakeup
+#ifdef MULTICAST
+            *((volatile uint32_t *)cluster_clint_set_addr(0)) = 511;
+            disable_multicast();
+#else
+            wakeup_snitches();
+#endif
+            break;
+        }
+    }
+}
+
+int main() {  // Boot the Snitches, offload N_JOBS jobs, copy z back to DRAM
+    set_d_cache_enable(1);
+
+    comm_buffer.usr_data_ptr = (uint32_t)(uint64_t)&usr_data;
+    fence();
+
+    axpy_args_t axpy_args = {
+        l / n_clusters_to_use, a, WIDE_SPM_ADDR((uint64_t)x),
+        WIDE_SPM_ADDR((uint64_t)y), WIDE_SPM_ADDR((uint64_t)z)};
+    job_t axpy = {J_AXPY, 0, axpy_args};
+
+    mc_args_t mc_args = {l / (8 * n_clusters_to_use),
+                         WIDE_SPM_ADDR((uint64_t)&pi)};
+    job_args_t job_args;
+    job_args.mc = mc_args;
+    job_t mc = {J_MONTECARLO, 0, job_args};
+
+#if defined(OFFLOAD_AXPY)
+    job_t jobs[N_JOBS] = {axpy, axpy};
+#elif defined(OFFLOAD_MONTECARLO)
+    job_t jobs[N_JOBS] = {mc, mc};
+#endif
+
+    volatile uint32_t n_jobs = N_JOBS;
+
+    // Reset and ungate quadrant 0, deisolate
+    reset_and_ungate_quadrants();
+    deisolate_all();
+
+    // Enable interrupts to receive notice of job termination
+    enable_sw_interrupts();
+
+    // Program Snitch entry point and communication buffer
+    program_snitches();
+
+    // Wakeup Snitches for snRuntime initialization
+    // (memory fence ensures compiler does not reorder
+    // this and previous function calls)
+    asm volatile("" : : : "memory");
+    wakeup_snitches();
+
+    // Wait for snRuntime initialization to be over
+    wait_snitches_done();
+
+    // Retrieve the L1 job buffer address published by the device
+    uint64_t l1_job_ptr = (uint64_t)usr_data.l1_job_ptr;
+
+    // Send jobs (first iteration just to heat up I$)
+    for (uint32_t i = 0; i < n_jobs; i++) {
+#ifndef OFFLOAD_NONE
+        mcycle();  // Send job information
+        send_job_and_wakeup(&jobs[i], l1_job_ptr);
+#else
+        mcycle();  // Wakeup
+        wakeup_snitches();
+#endif
+        mcycle();  // Wait for job done
+        wait_sw_interrupt();
+
+        mcycle();  // Resume operation on host
+#ifdef OFFLOAD_NONE
+        clear_host_sw_interrupt_unsafe();
+        mcycle();
+        wait_host_sw_interrupt_clear();
+#else
+        clear_host_sw_interrupt_unsafe();
+        mcycle();
+#endif
+    }
+
+    // Copy results from wide SPM to DRAM for verification
+    sys_dma_blk_memcpy((uint64_t)z, WIDE_SPM_ADDR((uint64_t)z),
+                       l * sizeof(double));
+
+    // Exit routine
+    mcycle();
+}
diff --git a/target/sim/sw/host/apps/experimental_offload/src/offload.h b/target/sim/sw/host/apps/experimental_offload/src/offload.h
new file mode 100644
index 000000000..35e8b8ac2
--- /dev/null
+++ b/target/sim/sw/host/apps/experimental_offload/src/offload.h
@@ -0,0 +1,81 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+typedef struct {
+    volatile uint32_t l3_job_ptr;
+    volatile uint32_t l1_job_ptr;  // Job buffer address, set by the device
+} usr_data_t;  // Shared between host and device via the communication buffer
+
+typedef enum { J_AXPY = 0, J_MONTECARLO = 1 } job_id_t;
+
+//////////
+// AXPY //
+//////////
+
+typedef struct {
+    uint32_t l;
+    double a;
+    uint64_t x_ptr;
+    uint64_t y_ptr;
+    uint64_t z_ptr;
+} axpy_args_t;  // AXPY arguments as written by the host
+
+typedef struct {
+    uint32_t l;
+    double a;
+    uint64_t x_l3_ptr;
+    uint64_t y_l3_ptr;
+    uint64_t z_l3_ptr;
+    double* x;
+    double* y;
+    double* z;
+} axpy_local_args_t;  // axpy_args_t fields plus device-local operand pointers
+
+typedef struct {
+    job_id_t id;
+    uint8_t offload_id;
+    axpy_args_t args;
+} axpy_job_t;
+
+typedef struct {
+    job_id_t id;
+    uint8_t offload_id;
+    axpy_local_args_t args;
+} axpy_local_job_t;
+
+/////////////////
+// Monte Carlo //
+/////////////////
+
+typedef struct {
+    uint32_t n_samples;
+    uint64_t result_ptr;
+} mc_args_t;
+
+typedef struct {
+    job_id_t id;
+    uint8_t offload_id;
+    mc_args_t args;
+} mc_job_t;
+
+/////////////
+// Generic //
+/////////////
+
+typedef struct {
+    uint64_t job_ptr;
+} user_data_t;  // NOTE(review): appears unused here; distinct from usr_data_t
+
+typedef union {
+    axpy_args_t axpy;
+    mc_args_t mc;
+} job_args_t;  // Per-job-type argument payload
+
+typedef struct {
+    job_id_t id;
+    uint8_t offload_id;
+    job_args_t args;
+} job_t;  // Generic job descriptor: header plus argument payload