Skip to content

Commit

Permalink
sw: Add experimental_offload app
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca committed Sep 25, 2023
1 parent e6be1dd commit edc87cf
Show file tree
Hide file tree
Showing 8 changed files with 549 additions and 0 deletions.
1 change: 1 addition & 0 deletions target/sim/sw/device/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Add user applications to APPS variable
APPS = blas/axpy
APPS += blas/gemm
APPS += experimental_offload

TARGET ?= all

Expand Down
19 changes: 19 additions & 0 deletions target/sim/sw/device/apps/experimental_offload/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <colluca@iis.ee.ethz.ch>

# Locate the snitch_cluster dependency through the Bender package manager
SNITCH_CLUSTER_DIR = $(shell bender path snitch_cluster)

APP = experimental_offload
SRCS = src/offload.c
# Headers shared with the host-side counterpart of this app, plus the
# kernel sources (AXPY, LCG, Monte Carlo) the device may be asked to run
INCDIRS = ../../../host/apps/experimental_offload/src
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/blas/axpy/src
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/lcg
INCDIRS += $(SNITCH_CLUSTER_DIR)/sw/apps/montecarlo

# Define number of clusters to use
# (?= so it can be overridden from the command line or environment)
RISCV_CFLAGS ?= -DN_CLUSTERS_TO_USE=1

include ../common.mk
134 changes: 134 additions & 0 deletions target/sim/sw/device/apps/experimental_offload/src/axpy_job.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#define XSSR
#include "axpy.h"

// DM-core handler for an AXPY job.
//
// Stages the job descriptor and the x/y operand slices from L3 into this
// cluster's L1 TCDM via DMA, synchronizes with the compute cores through
// hardware barriers, and writes the z result slice back to L3 once the
// compute cores are done. Finally notifies CVA6 of job completion.
//
// job: pointer to the job descriptor. With MULTICAST the descriptor was
//      already delivered into this cluster's L1; otherwise it points to
//      the remote copy, which is DMA-copied into L1 first.
void axpy_job_dm_core(job_t* job) {
#ifdef MULTICAST
    axpy_local_job_t* axpy_job = (axpy_local_job_t*)job;
#else
    axpy_local_job_t* axpy_job = (axpy_local_job_t*)l1_job_ptr;
#endif

    snrt_mcycle();  // Retrieve job information (get job arguments)

#ifndef MULTICAST
    // Copy job info. The publishing cluster (index N_CLUSTERS_TO_USE - 1,
    // see main()) already has the data, so it does not need to copy.
    if (snrt_cluster_idx() != (N_CLUSTERS_TO_USE - 1))
        snrt_dma_start_1d(axpy_job, job, sizeof(axpy_job_t));

    // Get pointer to next free slot in l1 alloc, aligned up to a
    // 4 KiB boundary past the job descriptor
    double* x = (double*)(ALIGN_UP(
        (uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096));

    // Wait for job info transfer to complete
    snrt_dma_wait_all();
    snrt_mcycle();  // Retrieve job operands
#else
    snrt_mcycle();  // Retrieve job operands
    // Get pointer to next free slot in l1 alloc, aligned up to a
    // 4 KiB boundary past the job descriptor
    double* x = (double*)(ALIGN_UP(
        (uint32_t)axpy_job + sizeof(axpy_local_job_t), 4096));
#endif

    // Copy operand x. Each cluster works on its own contiguous slice of
    // l doubles (8 bytes each), offset in L3 by its cluster index.
    size_t size = axpy_job->args.l * 8;
    size_t offset = snrt_cluster_idx() * size;
    void* x_l3_ptr = (void*)(uint32_t)(axpy_job->args.x_l3_ptr + offset);
    snrt_dma_start_1d(x, x_l3_ptr, size);

#ifndef MULTICAST
    // Synchronize with compute cores before updating the l1 alloc pointer
    // such that they can retrieve the local job pointer.
    // Also ensures compute cores see the transferred job information.
    snrt_cluster_hw_barrier();
#endif

    // Copy operand y (laid out in L1 directly after x)
    double* y = (double*)((uint32_t)x + size);
    void* y_l3_ptr = (void*)(uint32_t)(axpy_job->args.y_l3_ptr + offset);
    snrt_dma_start_1d(y, y_l3_ptr, size);

    // Set pointers to local job operands (z laid out directly after y)
    axpy_job->args.x = x;
    axpy_job->args.y = y;
    axpy_job->args.z = (double*)((uint32_t)y + size);

    // Synchronize with compute cores again such that they see
    // also the local job operands locations (x, y, z)
    snrt_cluster_hw_barrier();

    // Update the L1 alloc pointer past the x, y and z buffers
    void* next = (void*)((uint32_t)(axpy_job->args.z) + size);
    snrt_l1_update_next(next);

    // Wait for DMA transfers to complete
    snrt_dma_wait_all();

    snrt_mcycle();  // Barrier

    // Synchronize with compute cores to make sure the data
    // is available before they can start computing on it
    snrt_cluster_hw_barrier();

    snrt_mcycle();  // Job execution

    // Synchronize cores to make sure results are available before
    // DMA starts transfer to L3
    snrt_cluster_hw_barrier();

    snrt_mcycle();  // Writeback job outputs

    // Transfer result slice z back to L3
    void* z_l3_ptr = (void*)(uint32_t)(axpy_job->args.z_l3_ptr + offset);
    snrt_dma_start_1d(z_l3_ptr, axpy_job->args.z, size);
    snrt_dma_wait_all();

    snrt_mcycle();

    // Notify CVA6 that the job is done
#ifdef MULTICAST
    return_to_cva6_accelerated(axpy_job->offload_id);
#else
    return_to_cva6(SYNC_CLUSTERS);
#endif
}

// Compute-core handler for an AXPY job: waits (via hardware barriers)
// for the DM core to stage the job arguments and operands in L1, runs
// the kernel on this cluster's slice, then signals completion.
void axpy_job_compute_core(job_t* job) {
    // Interpret the incoming descriptor as a local AXPY job
    axpy_local_job_t* local_job = (axpy_local_job_t*)job;

    snrt_mcycle();

    // Read the scalar arguments
    uint32_t len = local_job->args.l;
    double alpha = local_job->args.a;

    // Barrier 1: after this, the DM core has filled in the local
    // operand pointers (x, y, z), so they are safe to read
    snrt_cluster_hw_barrier();

    double* vec_x = local_job->args.x;
    double* vec_y = local_job->args.y;
    double* vec_z = local_job->args.z;

    snrt_mcycle();

    // Barrier 2: after this, the DM core has finished transferring
    // the operand data into L1
    snrt_cluster_hw_barrier();

    snrt_mcycle();

    // Execute the kernel
    axpy(len, alpha, vec_x, vec_y, vec_z);

    snrt_mcycle();

    // Barrier 3: tell the DM core the results are ready for
    // writeback to L3
    snrt_cluster_hw_barrier();

    snrt_mcycle();
}
123 changes: 123 additions & 0 deletions target/sim/sw/device/apps/experimental_offload/src/offload.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "offload.h"
#include "snrt.h"

// Number of distinct job types, i.e. the size of the dispatch tables below
#define N_JOB_TYPES 2

// Other variables
// Pointer to the user-data area of the communication buffer shared with
// CVA6; set once in main()
__thread usr_data_t* volatile usr_data_ptr;
// Address of the job copy in this cluster's L1 TCDM
__thread uint32_t l1_job_ptr;
// Address of the job in the publishing cluster's TCDM
// (used only in the non-MULTICAST configuration)
__thread uint32_t remote_job_ptr;

// Included here, not at the top of the file, because the job
// implementations use the thread-local job pointers declared above
#include "axpy_job.h"
// #include "montecarlo_job.h"

// Job function type
typedef void (*job_func_t)(job_t* job);

// Job function arrays, indexed by job ID (job_t.id)
__thread job_func_t jobs_dm_core[N_JOB_TYPES] = {
    axpy_job_dm_core /*, mc_job_dm_core*/};
__thread job_func_t jobs_compute_core[N_JOB_TYPES] = {
    axpy_job_compute_core /*, mc_job_compute_core*/};

// Dispatch the pending job on the calling core.
//
// Both core types enter this function: the DM core takes the fallthrough
// path (job retrieval and data movement), while compute cores branch to
// the run_job_compute_core path.
static inline void run_job() {
    // Force compiler to assign fallthrough path of the branch to
    // the DM core. This way the cache miss latency due to the branch
    // is incurred by the compute cores, and overlaps with the data
    // movement performed by the DM core.
    asm goto("bnez %0, %l[run_job_compute_core]"
             :
             : "r"(snrt_is_compute_core())
             :
             : run_job_compute_core);

#ifndef OFFLOAD_NONE
    // Retrieve job data pointer
#ifdef MULTICAST
    // Job descriptor was delivered directly into this cluster's L1
    job_t* job = (job_t*)l1_job_ptr;
#else
    // Job descriptor lives in the publishing cluster's TCDM
    job_t* job = (job_t*)remote_job_ptr;
#endif

    // Invoke job through the DM-core dispatch table
    uint32_t job_id = job->id;
    jobs_dm_core[job_id](job);

#else
    // No offload configured: report straight back to CVA6
    return_to_cva6(SYNC_ALL);
#endif

    goto run_job_end;

run_job_compute_core:;

#ifndef OFFLOAD_NONE
    // Get pointer to local copy of job
    job_t* job_local = (job_t*)l1_job_ptr;

#ifndef MULTICAST
    // Synchronize with DM core such that it knows
    // it can update the l1 alloc pointer, and we know
    // job information is locally available
    snrt_cluster_hw_barrier();
#endif

    // Invoke job through the compute-core dispatch table
    jobs_compute_core[job_local->id](job_local);
#else
    // No offload configured: barrier with the DM core, then wait for the
    // machine cluster interrupt to be cleared
    // NOTE(review): presumably pairs with synchronization inside
    // return_to_cva6(SYNC_ALL) on the DM core — confirm
    snrt_cluster_hw_barrier();
    snrt_int_wait_mcip_clr();
#endif

run_job_end:;
}

// Device-side entry point. Publishes the L1 job slot address to CVA6,
// signals that snRuntime initialization is done, then loops forever
// servicing offloaded jobs: each iteration clears the wake-up interrupt,
// dispatches the pending job, and goes back to sleep.
int main() {
    // Get user data pointer from the shared communication buffer
    usr_data_ptr =
        (usr_data_t * volatile) get_communication_buffer()->usr_data_ptr;

    // Tell CVA6 where it can store the job ID
    l1_job_ptr = (uint32_t)snrt_l1_next();
    snrt_cluster_hw_barrier();
    if (snrt_is_dm_core()) {
        // Only one core sends the data for all clusters:
        // cluster 0 with MULTICAST, the last used cluster otherwise
#ifdef MULTICAST
        if (snrt_cluster_idx() == 0)
#else
        if (snrt_cluster_idx() == (N_CLUSTERS_TO_USE - 1))
#endif
            usr_data_ptr->l1_job_ptr = l1_job_ptr;
    }
    snrt_cluster_hw_barrier();

#ifdef OFFLOAD_MONTECARLO
    if (snrt_is_compute_core()) mc_init();
#endif

    // Notify CVA6 when snRuntime initialization is done, then sleep
    // until the first job interrupt arrives
    snrt_int_clr_mcip();
    return_to_cva6(SYNC_ALL);
    snrt_wfi();

#ifndef MULTICAST
    // Get pointer to the remote job in the publishing cluster's TCDM
    // (the cluster that wrote usr_data_ptr->l1_job_ptr above)
    remote_job_ptr = usr_data_ptr->l1_job_ptr;
#endif

    // Job loop
    while (1) {
        snrt_mcycle();  // Clear interrupt
        snrt_int_clr_mcip_unsafe();

        snrt_mcycle();  // Retrieve job information (get job pointer)
        run_job();

        snrt_mcycle();  // Sleep
        snrt_wfi();
    }
}
1 change: 1 addition & 0 deletions target/sim/sw/host/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Add user applications to APPS variable
APPS = hello_world
APPS += offload
APPS += experimental_offload

TARGET ?= all

Expand Down
23 changes: 23 additions & 0 deletions target/sim/sw/host/apps/experimental_offload/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <colluca@iis.ee.ethz.ch>

# Locate the BLAS sources in the snitch_cluster dependency via Bender
BLAS_DIR = $(shell bender path snitch_cluster)/sw/blas

APP = experimental_offload
SRCS = src/offload.c
INCDIRS = $(BLAS_DIR)
# NOTE(review): presumably tells common.mk to link the device binary into
# the host ELF — confirm against ../common.mk
INCL_DEVICE_BINARY = true

# Define application and number of clusters to use
# (?= so both can be overridden from the command line or environment)
RISCV_CFLAGS ?= -DOFFLOAD_AXPY -DN_CLUSTERS_TO_USE=1

# Section where data is placed — the wide SPM
SECTION = .wide_spm

# AXPY Makefile provides the data-generation rules used below
include $(BLAS_DIR)/axpy/Makefile
include ../common.mk

# Create data.h dependency
$(DEP): $(DATA_DIR)/data.h
Loading

0 comments on commit edc87cf

Please sign in to comment.