-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
549 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Copyright 2023 ETH Zurich and University of Bologna. | ||
# Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Luca Colagrande <colluca@iis.ee.ethz.ch> | ||
|
||
# Build description for the device side of the experimental_offload app. | ||
# Path of the snitch_cluster dependency, queried from bender at parse time. | ||
SNITCH_CLUSTER_DIR = $(shell bender path snitch_cluster) | ||
|
||
# Application name, its single source file, and the header search paths. | ||
# NOTE(review): the first INCDIRS entry presumably provides offload.h and the | ||
# axpy entry provides axpy.h (both included by the sources) - confirm. | ||
# The lcg and montecarlo paths are only needed by the Monte Carlo job, whose | ||
# include is currently commented out in src/offload.c. | ||
APP = experimental_offload | ||
SRCS = src/offload.c | ||
INCDIRS = ../../../host/apps/experimental_offload/src | ||
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/blas/axpy/src | ||
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/lcg | ||
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/montecarlo | ||
|
||
# Define number of clusters to use | ||
# '?=' only assigns when RISCV_CFLAGS is not already defined, so a value given | ||
# by the caller replaces this default entirely (including N_CLUSTERS_TO_USE). | ||
RISCV_CFLAGS ?= -DN_CLUSTERS_TO_USE=1 | ||
|
||
# Shared rules; APP, SRCS, INCDIRS and RISCV_CFLAGS are set before this point. | ||
include ../common.mk |
134 changes: 134 additions & 0 deletions
134
target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
// Copyright 2023 ETH Zurich and University of Bologna. | ||
// Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#define XSSR | ||
#include "axpy.h" | ||
|
||
// DM-core half of an AXPY job. | ||
// | ||
// Sequence, as implemented below: | ||
//  1. obtain the cluster-local job descriptor (already local with MULTICAST, | ||
//     otherwise DMA-copied from `job` into the buffer at l1_job_ptr); | ||
//  2. DMA this cluster's slice of x and y (args.l elements each, selected by | ||
//     snrt_cluster_idx()) into the memory following the descriptor; | ||
//  3. publish the local x/y/z pointers in the descriptor and advance the L1 | ||
//     allocator past z; | ||
//  4. after the compute cores have run, DMA z back out and return to CVA6. | ||
// The snrt_cluster_hw_barrier() calls pair one-to-one with those executed by | ||
// the compute cores (run_job() and axpy_job_compute_core()). | ||
// | ||
// job: with MULTICAST, the local job descriptor itself; otherwise the source | ||
//      address of the job descriptor to copy from. | ||
void axpy_job_dm_core(job_t* job) { | ||
#ifdef MULTICAST | ||
axpy_local_job_t* axpy_job = (axpy_local_job_t*)job; | ||
#else | ||
axpy_local_job_t* axpy_job = (axpy_local_job_t*)l1_job_ptr; | ||
#endif | ||
|
||
snrt_mcycle(); // Retrieve job information (get job arguments) | ||
|
||
#ifndef MULTICAST | ||
// Copy job info. The cluster with index N_CLUSTERS_TO_USE - 1 skips the | ||
// copy. NOTE(review): presumably because that is the cluster whose | ||
// l1_job_ptr main() publishes, so the job is already local there. An | ||
// earlier version of this comment said "cluster 0" - confirm. | ||
// NOTE(review): sizeof(axpy_job_t) bytes are copied into an | ||
// axpy_local_job_t - confirm the size difference is intended. | ||
if (snrt_cluster_idx() != (N_CLUSTERS_TO_USE - 1)) | ||
snrt_dma_start_1d(axpy_job, job, sizeof(axpy_job_t)); | ||
|
||
// Get pointer to next free slot in l1 alloc | ||
// (first 4096-aligned address after the local job descriptor) | ||
double* x = (double*)(ALIGN_UP( | ||
(uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096)); | ||
|
||
// Wait for job info transfer to complete | ||
snrt_dma_wait_all(); | ||
snrt_mcycle(); // Retrieve job operands | ||
#else | ||
snrt_mcycle(); // Retrieve job operands | ||
// Get pointer to next free slot in l1 alloc | ||
// (first 4096-aligned address after the local job descriptor) | ||
double* x = (double*)(ALIGN_UP( | ||
(uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096)); | ||
#endif | ||
|
||
// Copy operand x | ||
// size is the slice length in bytes (8 per element; x is a double array). | ||
// offset selects this cluster's slice, one slice per cluster index. | ||
size_t size = axpy_job->args.l * 8; | ||
size_t offset = snrt_cluster_idx() * size; | ||
void* x_l3_ptr = (void*)(uint32_t)(axpy_job->args.x_l3_ptr + offset); | ||
snrt_dma_start_1d(x, x_l3_ptr, size); | ||
|
||
#ifndef MULTICAST | ||
// Synchronize with compute cores before updating the l1 alloc pointer | ||
// such that they can retrieve the local job pointer. | ||
// Also ensures compute cores see the transferred job information. | ||
snrt_cluster_hw_barrier(); | ||
#endif | ||
|
||
// Copy operand y | ||
// y is placed directly after x; the x transfer is not waited for here. | ||
double* y = (double*)((uint32_t)x + size); | ||
void* y_l3_ptr = (void*)(uint32_t)(axpy_job->args.y_l3_ptr + offset); | ||
snrt_dma_start_1d(y, y_l3_ptr, size); | ||
|
||
// Set pointers to local job operands | ||
// z is placed directly after y. | ||
axpy_job->args.x = x; | ||
axpy_job->args.y = y; | ||
axpy_job->args.z = (double*)((uint32_t)y + size); | ||
|
||
// Synchronize with compute cores again such that they see | ||
// also the local job operands locations (x, y, z) | ||
snrt_cluster_hw_barrier(); | ||
|
||
// Update the L1 alloc pointer | ||
// (to the first byte after the z buffer) | ||
void* next = (void*)((uint32_t)(axpy_job->args.z) + size); | ||
snrt_l1_update_next(next); | ||
|
||
// Wait for DMA transfers to complete | ||
snrt_dma_wait_all(); | ||
|
||
snrt_mcycle(); // Barrier | ||
|
||
// Synchronize with compute cores to make sure the data | ||
// is available before they can start computing on it | ||
snrt_cluster_hw_barrier(); | ||
|
||
snrt_mcycle(); // Job execution | ||
|
||
// Synchronize cores to make sure results are available before | ||
// DMA starts transfer to L3 | ||
snrt_cluster_hw_barrier(); | ||
|
||
snrt_mcycle(); // Writeback job outputs | ||
|
||
// Transfer data out | ||
// (same per-cluster offset as used for the x and y slices) | ||
void* z_l3_ptr = (void*)(uint32_t)(axpy_job->args.z_l3_ptr + offset); | ||
snrt_dma_start_1d(z_l3_ptr, axpy_job->args.z, size); | ||
snrt_dma_wait_all(); | ||
|
||
snrt_mcycle(); | ||
|
||
// Hand control back to CVA6 once the result has been written out. | ||
#ifdef MULTICAST | ||
return_to_cva6_accelerated(axpy_job->offload_id); | ||
#else | ||
return_to_cva6(SYNC_CLUSTERS); | ||
#endif | ||
} | ||
|
||
// Compute-core half of an AXPY job. | ||
// | ||
// Reads the scalar arguments, waits on cluster barriers until the DM core | ||
// has published the operand pointers and finished the input transfers, runs | ||
// the axpy kernel, and finally signals completion through one more barrier. | ||
// The three barriers here pair with the last three barriers in | ||
// axpy_job_dm_core(). | ||
// | ||
// job: the cluster-local job descriptor (run_job() passes l1_job_ptr). | ||
void axpy_job_compute_core(job_t* job) { | ||
// Cast local job | ||
axpy_local_job_t* axpy_job = (axpy_local_job_t*)job; | ||
|
||
snrt_mcycle(); | ||
|
||
// Get args | ||
// (l and a are read before the first barrier in this function) | ||
uint32_t l = axpy_job->args.l; | ||
double a = axpy_job->args.a; | ||
|
||
// Synchronize with DM core to wait for local job | ||
// operand pointers (x, y, z) to be up to date | ||
snrt_cluster_hw_barrier(); | ||
|
||
// These fields are written by the DM core before the barrier above. | ||
double* x = axpy_job->args.x; | ||
double* y = axpy_job->args.y; | ||
double* z = axpy_job->args.z; | ||
|
||
snrt_mcycle(); | ||
|
||
// Synchronize with DM core to wait for operands | ||
// to be fully transferred in L1 | ||
snrt_cluster_hw_barrier(); | ||
|
||
snrt_mcycle(); | ||
|
||
// Run kernel | ||
axpy(l, a, x, y, z); | ||
|
||
snrt_mcycle(); | ||
|
||
// Synchronize with DM core to make sure results are available | ||
// before DMA starts transfer to L3 | ||
snrt_cluster_hw_barrier(); | ||
|
||
snrt_mcycle(); | ||
} |
123 changes: 123 additions & 0 deletions
123
target/sim/sw/device/apps/experimental_offload/src/offload.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// Copyright 2023 ETH Zurich and University of Bologna. | ||
// Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include "offload.h" | ||
#include "snrt.h" | ||
|
||
// Number of slots in the job function tables below. | ||
#define N_JOB_TYPES 2 | ||
|
||
// Other variables | ||
// usr_data_ptr:   user data structure taken from the communication buffer | ||
//                 in main(); used to exchange l1_job_ptr. | ||
// l1_job_ptr:     address returned by snrt_l1_next() in main(), used as the | ||
//                 location of the cluster-local job descriptor. | ||
// remote_job_ptr: value read back from usr_data_ptr->l1_job_ptr in main() | ||
//                 (only assigned when MULTICAST is not defined). | ||
__thread usr_data_t* volatile usr_data_ptr; | ||
__thread uint32_t l1_job_ptr; | ||
__thread uint32_t remote_job_ptr; | ||
|
||
#include "axpy_job.h" | ||
// #include "montecarlo_job.h" | ||
|
||
// Job function type | ||
// A job handler receives a pointer to the job descriptor and returns nothing. | ||
typedef void (*job_func_t)(job_t* job); | ||
|
||
// Job function arrays | ||
// Indexed by the job's id field; one table per core role. | ||
// NOTE(review): the tables have N_JOB_TYPES (2) slots but only the AXPY | ||
// handler is listed, so slot 1 holds a null pointer. run_job() does not | ||
// range-check the id - confirm that only id 0 can be submitted. | ||
__thread job_func_t jobs_dm_core[N_JOB_TYPES] = { | ||
axpy_job_dm_core /*, mc_job_dm_core*/}; | ||
__thread job_func_t jobs_compute_core[N_JOB_TYPES] = { | ||
axpy_job_compute_core /*, mc_job_compute_core*/}; | ||
|
||
// Runs one job on the calling core. | ||
// | ||
// Cores for which snrt_is_compute_core() is non-zero branch to the | ||
// run_job_compute_core label; all other cores fall through into the DM-core | ||
// path. Each path looks up its handler by job id in the matching table | ||
// (jobs_dm_core / jobs_compute_core). | ||
// When OFFLOAD_NONE is defined no job is run: the fallthrough path calls | ||
// return_to_cva6(SYNC_ALL), and the compute path waits on a cluster barrier | ||
// followed by snrt_int_wait_mcip_clr(). | ||
static inline void run_job() { | ||
// Force compiler to assign fallthrough path of the branch to | ||
// the DM core. This way the cache miss latency due to the branch | ||
// is incurred by the compute cores, and overlaps with the data | ||
// movement performed by the DM core. | ||
asm goto("bnez %0, %l[run_job_compute_core]" | ||
: | ||
: "r"(snrt_is_compute_core()) | ||
: | ||
: run_job_compute_core); | ||
|
||
#ifndef OFFLOAD_NONE | ||
// Retrieve job data pointer | ||
// (the local copy with MULTICAST, the remote descriptor otherwise) | ||
#ifdef MULTICAST | ||
job_t* job = (job_t*)l1_job_ptr; | ||
#else | ||
job_t* job = (job_t*)remote_job_ptr; | ||
#endif | ||
|
||
// Invoke job | ||
// NOTE(review): job_id is used as a table index without a range check. | ||
uint32_t job_id = job->id; | ||
jobs_dm_core[job_id](job); | ||
|
||
#else | ||
return_to_cva6(SYNC_ALL); | ||
#endif | ||
|
||
// Skip the compute-core path. | ||
goto run_job_end; | ||
|
||
run_job_compute_core:; | ||
|
||
#ifndef OFFLOAD_NONE | ||
// Get pointer to local copy of job | ||
job_t* job_local = (job_t*)l1_job_ptr; | ||
|
||
#ifndef MULTICAST | ||
// Synchronize with DM core such that it knows | ||
// it can update the l1 alloc pointer, and we know | ||
// job information is locally available | ||
snrt_cluster_hw_barrier(); | ||
#endif | ||
|
||
// Invoke job | ||
// (job_local->id is read only after the barrier above, when present) | ||
jobs_compute_core[job_local->id](job_local); | ||
#else | ||
snrt_cluster_hw_barrier(); | ||
snrt_int_wait_mcip_clr(); | ||
#endif | ||
|
||
run_job_end:; | ||
} | ||
|
||
// Device entry point. | ||
// | ||
// Publishes the L1 job buffer address to CVA6 through the user data | ||
// structure, signals that initialization is done, then sleeps. After waking | ||
// up it loops forever: clear the interrupt, run one job, sleep again. | ||
// The loop never exits, so no value is ever returned. | ||
int main() { | ||
// Get user data pointer | ||
usr_data_ptr = | ||
(usr_data_t * volatile) get_communication_buffer()->usr_data_ptr; | ||
|
||
// Tell CVA6 where it can store the job ID | ||
// Every core records the next free L1 address; a single DM core (cluster 0 | ||
// with MULTICAST, cluster N_CLUSTERS_TO_USE - 1 otherwise) publishes it. | ||
l1_job_ptr = (uint32_t)snrt_l1_next(); | ||
snrt_cluster_hw_barrier(); | ||
if (snrt_is_dm_core()) { | ||
// Only one core sends the data for all clusters | ||
#ifdef MULTICAST | ||
if (snrt_cluster_idx() == 0) | ||
#else | ||
if (snrt_cluster_idx() == (N_CLUSTERS_TO_USE - 1)) | ||
#endif | ||
usr_data_ptr->l1_job_ptr = l1_job_ptr; | ||
} | ||
snrt_cluster_hw_barrier(); | ||
|
||
// Monte Carlo setup on the compute cores, only in OFFLOAD_MONTECARLO builds. | ||
#ifdef OFFLOAD_MONTECARLO | ||
if (snrt_is_compute_core()) mc_init(); | ||
#endif | ||
|
||
// Notify CVA6 when snRuntime initialization is done | ||
snrt_int_clr_mcip(); | ||
return_to_cva6(SYNC_ALL); | ||
snrt_wfi(); | ||
|
||
#ifndef MULTICAST | ||
// Get pointer to the remote job, i.e. the address published above by the | ||
// DM core of cluster N_CLUSTERS_TO_USE - 1. | ||
// NOTE(review): an earlier version of this comment said "cluster 0's | ||
// TCDM", which only matches when N_CLUSTERS_TO_USE is 1 - confirm. | ||
remote_job_ptr = usr_data_ptr->l1_job_ptr; | ||
#endif | ||
|
||
// Job loop | ||
// The comment after each snrt_mcycle() names the phase that starts there. | ||
while (1) { | ||
snrt_mcycle(); // Clear interrupt | ||
snrt_int_clr_mcip_unsafe(); | ||
|
||
snrt_mcycle(); // Retrieve job information (get job pointer) | ||
run_job(); | ||
|
||
snrt_mcycle(); // Sleep | ||
snrt_wfi(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Copyright 2023 ETH Zurich and University of Bologna. | ||
# Licensed under the Apache License, Version 2.0, see LICENSE for details. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# Luca Colagrande <colluca@iis.ee.ethz.ch> | ||
|
||
# Build description for the host side of the experimental_offload app. | ||
# BLAS sources of the snitch_cluster dependency, located through bender. | ||
BLAS_DIR = $(shell bender path snitch_cluster)/sw/blas | ||
|
||
# Application name, its single source file, and the header search path. | ||
# NOTE(review): INCL_DEVICE_BINARY is consumed by the included makefiles; | ||
# presumably it embeds the device binary in the host image - confirm. | ||
APP = experimental_offload | ||
SRCS = src/offload.c | ||
INCDIRS = $(BLAS_DIR) | ||
INCL_DEVICE_BINARY = true | ||
|
||
# Define application and number of clusters to use | ||
# '?=' only assigns when RISCV_CFLAGS is not already defined, so a value given | ||
# by the caller must supply both defines itself. | ||
RISCV_CFLAGS ?= -DOFFLOAD_AXPY -DN_CLUSTERS_TO_USE=1 | ||
|
||
# NOTE(review): SECTION is presumably read by the axpy Makefile included | ||
# below to choose where the generated data is placed - confirm. | ||
SECTION = .wide_spm | ||
|
||
# The axpy Makefile is included before the shared rules in common.mk. | ||
include $(BLAS_DIR)/axpy/Makefile | ||
include ../common.mk | ||
|
||
# Create data.h dependency | ||
# DEP and DATA_DIR are not set in this file; they come from the includes. | ||
$(DEP): $(DATA_DIR)/data.h |
Oops, something went wrong.