From df6afdac9df35bc203a9bed31f22944bc2e8b965 Mon Sep 17 00:00:00 2001 From: Cyril Koenig Date: Thu, 22 Feb 2024 18:02:15 +0100 Subject: [PATCH] hero: Working on runtime --- target/fpga/Makefile | 21 +- target/fpga/bootrom/Makefile | 3 + target/fpga/bootrom/occamy.dts | 15 +- target/fpga/bootrom/occamy_pcie.dts | 145 ++++++++ target/fpga/occamy_vcu128_bd.tcl | 5 +- target/fpga/occamy_vcu128_procs.tcl | 13 +- target/sim/Makefile | 4 +- target/sim/sw/device/apps/common.mk | 2 + .../device/apps/libomptarget_device/Makefile | 10 +- .../device/apps/libomptarget_device/link.ld | 265 ++++++--------- .../apps/libomptarget_device/src/debug.h | 77 +++++ .../apps/libomptarget_device/src/main.c | 317 +++++++++++++++++- .../apps/libomptarget_device/src/main.o | Bin 1200 -> 0 bytes .../apps/libomptarget_device/src/sw_mailbox.c | 94 ++++++ .../apps/libomptarget_device/src/sw_mailbox.h | 197 +++++++++++ target/sim/sw/device/runtime/Makefile | 2 + .../sim/sw/device/runtime/src/occamy_start.c | 3 + target/sim/sw/device/runtime/src/putchar.c | 2 +- target/sim/sw/device/runtime/src/snrt.c | 4 + target/sim/sw/device/runtime/src/snrt.h | 7 + target/sim/sw/device/toolchain.mk | 4 +- 21 files changed, 1003 insertions(+), 187 deletions(-) create mode 100644 target/fpga/bootrom/occamy_pcie.dts create mode 100644 target/sim/sw/device/apps/libomptarget_device/src/debug.h delete mode 100644 target/sim/sw/device/apps/libomptarget_device/src/main.o create mode 100644 target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.c create mode 100644 target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.h diff --git a/target/fpga/Makefile b/target/fpga/Makefile index 7dbf43b82..47733aa7a 100644 --- a/target/fpga/Makefile +++ b/target/fpga/Makefile @@ -11,19 +11,13 @@ CVA6_SDK ?= ${ROOT}/../cva6-sdk DEBUG ?= 0 EXT_JTAG ?= 0 VCU ?= 01 -FPGA_ID := 091847100576A -HW_SERVER := bordcomputer:3231 +FPGA_ID := +HW_SERVER := BENDER ?= bender VIVADO ?= vitis-2020.2 vivado # Do not proceed with implem 
(CI) XILINX_SYNTHESIS_ONLY ?= 0 -# Select VCU128-02 -ifeq ($(VCU),02) - FPGA_ID := 091847100638A - HW_SERVER := bordcomputer:3232 -endif - VIVADO ?= vivado VIVADO_ARGS := XILINX_SYNTHESIS_ONLY=$(XILINX_SYNTHESIS_ONLY) MKIMAGE ?= $(CURDIR)/br2_external/install/bin/mkimage @@ -36,20 +30,25 @@ LINUX_UIMAGE ?= ${CVA6_SDK}/uImage DTB = bootrom/occamy.dtb +BENDER_TARGETS += -t cv64a6_imafdc_sv39 -t occamy +ifeq ($(EXT_JTAG), 0) + BENDER_TARGETS += -t bscane +endif + default: all all: occamy_vcu128 vivado_ips/occamy_xilinx: - ${MAKE} -C vivado_ips occamy_xilinx + ${MAKE} -C vivado_ips occamy_xilinx DEBUG=$(DEBUG) EXT_JTAG=$(EXT_JTAG) bootrom/bootrom-spl.coe: ${MAKE} -C bootrom occamy_vcu128: vivado_ips/occamy_xilinx bootrom/bootrom-spl.coe define_defines_includes_no_simset.tcl - $(VIVADO_ARGS) ${VIVADO} -mode batch -source occamy_vcu128.tcl -tclargs $(DEBUG) $(EXT_JTAG) $(NPROC) ${MKFILE_DIR}/bootrom/bootrom-spl.coe + $(VIVADO_ARGS) ${VIVADO} -mode gui -source occamy_vcu128.tcl -tclargs $(DEBUG) $(EXT_JTAG) $(NPROC) ${MKFILE_DIR}/bootrom/bootrom-spl.coe define_defines_includes_no_simset.tcl: $(BENDER_FILES) - ${BENDER} script vivado -t occamy -t cv64a6_imafdc_sv39 --only-defines --only-includes --no-simset > $@ + ${BENDER} script vivado $(BENDER_TARGETS) --only-defines --only-includes --no-simset > $@ program: ${VIVADO} -mode batch -source occamy_vcu128_program.tcl -tclargs ${VCU} diff --git a/target/fpga/bootrom/Makefile b/target/fpga/bootrom/Makefile index 267aa355f..d156130cb 100644 --- a/target/fpga/bootrom/Makefile +++ b/target/fpga/bootrom/Makefile @@ -55,6 +55,9 @@ bootrom-spl.bin: bootrom.S $(OBJS_C) bootrom.ld occamy.dtb $(OBJDUMP) -d bootrom-spl.elf > bootrom-spl.dump $(OBJCOPY) -O binary bootrom-spl.elf bootrom-spl.bin +%.coe: %.bin + bin2coe -i $< -o $@ -w 32 + clean: rm -rf *.bin *.coe *.dump src/*.o *.dtb *.elf *.tcl diff --git a/target/fpga/bootrom/occamy.dts b/target/fpga/bootrom/occamy.dts index 5ecfd3305..dcd2d3954 100644 --- 
a/target/fpga/bootrom/occamy.dts +++ b/target/fpga/bootrom/occamy.dts @@ -25,7 +25,7 @@ #address-cells = <2>; #size-cells = <2>; ranges; - snitch_mem: buffer@c0000000 { + snitch_mem: l3_mem@c0000000 { reg = <0x0 0xc0000000 0x0 0x10000000>; }; }; @@ -63,7 +63,7 @@ soc: soc { #address-cells = <2>; #size-cells = <2>; - compatible = "simple-bus"; + compatible = "eth,occamy-soc", "simple-bus"; ranges; debug@0 { compatible = "riscv,debug-013"; @@ -191,9 +191,8 @@ clock-names = "s_axi_lite_clk", "axis_clk"; // interrupt and mac_irq interrupts-extended = <&PLIC0 1 &PLIC0 6>; - // local-mac-address = [ 00 0A 35 04 E1 60 ]; // hero-vcu128-01 - local-mac-address = [ 00 0A 35 04 E1 52 ]; // hero-vcu128-02 - mac-address = [ 00 0A 35 04 E1 52 ]; + local-mac-address = [ 00 0A 35 07 D5 DD ]; // hero-vcu128-03 + mac-address = [ 00 0A 35 07 D5 DD ]; // hero-vcu128-03 device_type = "network"; axistream-connected = <&eth_dma0>; axistream-control-connected = <&eth_dma0>; @@ -238,6 +237,12 @@ reg-names = "quadrant-control"; reg = <0x0 0x0b000000 0x0 0x10000>; }; + // We do not use the spm-narrow (contains OpenSBI code) + spm_wide: spm-wide@71000000 { + compatible = "eth,occamy-spm-wide"; + reg-names = "spm-wide"; + reg = <0x0 0x71000000 0x0 0x100000>; + }; // Instantiate a snitch cluster snitch-cluster@10000000 { compatible = "eth,snitch-cluster"; diff --git a/target/fpga/bootrom/occamy_pcie.dts b/target/fpga/bootrom/occamy_pcie.dts new file mode 100644 index 000000000..0ccfcf4ec --- /dev/null +++ b/target/fpga/bootrom/occamy_pcie.dts @@ -0,0 +1,145 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + + +// TODO(niwis) auto generate +/dts-v1/; +/plugin/; +&{/dev@0,0} { + axi-bus { + #address-cells = <1>; + #size-cells = <2>; + compatible = "simple-bus"; + ranges; + // Create a reserved memory region for Snitch program memory + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + snitch_mem: buffer@c0000000 { + reg = <0x0 0xc0000000 0x0 0x10000000>; + }; + }; + cpus { + #address-cells = <1>; + #size-cells = <0>; + timebase-frequency = <12500000>; + CPU0: cpu@0 { + device_type = "cpu"; + status = "okay"; + compatible = "eth,ariane", "riscv"; + clock-frequency = <25000000>; + riscv,isa = "rv64fimafd"; + mmu-type = "riscv,sv39"; + tlb-split; + reg = <0>; + // represents the destination of the mcause bits + // ariane has 3 interrupt inputs: + // - software (ipi_i[0], IRQ_M_SOFT) + // - timer (time_irq_i[0], IRQ_M_TIMER) + // - external (irq_i[1:0], {IRQ_S_EXT, IRQ_M_EXT}) + CPU0_intc: interrupt-controller { + #interrupt-cells = <1>; + #address-cells = <1>; + interrupt-controller; + compatible = "riscv,cpu-intc"; + }; + }; + }; + sysclk: virt_25mhz { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <25000000>; + }; + soc: soc { + #address-cells = <2>; + #size-cells = <2>; + compatible = "eth,occamy-soc", "simple-bus"; + ranges; + debug@0 { + compatible = "riscv,debug-013"; + // interrupts-extended = <&CPU0_intc 65535>; + reg-names = "control"; + reg = <0x0 0x0 0x0 0x1000>; + }; + serial@2002000 { + compatible = "ns16550a"; + reg = <0x0 0x2002000 0x0 0x1000>; + clock-frequency = <25000000>; + current-speed = <115200>; + interrupt-parent = <&PLIC0>; + interrupts = <36>; + reg-offset = <0>; + reg-shift = <2>; // regs are spaced on 32 bit boundary + reg-io-width = <4>; // only 32-bit access are supported + // fifo-size = <64>; + }; + timer@2006000 { + compatible = "pulp,apb_timer"; + interrupt-parent = <&PLIC0>; + interrupts = <0x00000068 0x00000069 0x00000070 0x00000071>; + reg = 
<0x00000000 0x2006000 0x00000000 0x00001000>; + reg-names = "control"; + }; + clint0: clint@4000000 { + clock-frequency = <12500000>; + compatible = "riscv,clint0"; + // clint generates software and timer interrupts to the core. Attach them + // to the CPU + // bits in mip and exception code in mcause: + // - IRQ_M_SOFT = 3: Machine software interrupt + // - IRQ_M_TIMER = 7: Machine timer interrupt + interrupts-extended = <&CPU0_intc 3 &CPU0_intc 7>; + reg-names = "clint"; + reg = <0x0 0x4000000 0x0 0x100000>; + }; + PLIC0: interrupt-controller@c000000 { + compatible = "riscv,plic0"; + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + // PLIC generates external interrupts to the core, M and S mode + // - IRQ_M_EXT = 11: Machine external interrupt + // - IRQ_S_EXT = 9: Supervisor external interrupt + interrupts-extended = <&CPU0_intc 11 &CPU0_intc 9>; + riscv,max-priority = <6>; + riscv,ndev = <72>; + reg = <0x0 0xc000000 0x0 0x4000000>; + }; + soc_ctl0: soc-control@2000000 { + compatible = "eth,occamy-soc-control"; + reg-names = "soc-control"; + reg = <0x0 0x02000000 0x0 0x1000>; + }; + quadrant_ctrl0: quadrant-control@b000000 { + compatible = "eth,occamy-quadrant-control"; + reg-names = "quadrant-control"; + reg = <0x0 0x0b000000 0x0 0x10000>; + }; + scratchpad-narrow@70000000 { + compatible = "eth,scratchpad-narrow"; + reg = <0x0 0x70000000 0x0 0x80000>; + }; + // Instantiate a snitch cluster + snitch-cluster@10000000 { + compatible = "eth,snitch-cluster"; + // TCDM and Peripheral spaces + reg = <0x0 0x10000000 0x0 0x40000>; + // points to a memory region reserved for use by the cluster + memory-region = <&snitch_mem>; + // cluster specific properties + eth,compute-cores = <8>; + eth,dm-cores = <1>; + eth,quadrant-idx = <0>; + eth,cluster-idx = <0>; // Used to calculate offsets in clint, soc-ctrl etc.. 
+ // A handle to the soc-control register where isolates etc are located + eth,soc-ctl = <&soc_ctl0>; + // Handle to the associated quadrant controller + eth,quadrant-ctrl = <&quadrant_ctrl0>; + // handle to the clint where IPI interrupts are attached + eth,clint = <&clint0>; + }; + }; + }; +}; diff --git a/target/fpga/occamy_vcu128_bd.tcl b/target/fpga/occamy_vcu128_bd.tcl index 3e64fdb64..7d66e6c18 100644 --- a/target/fpga/occamy_vcu128_bd.tcl +++ b/target/fpga/occamy_vcu128_bd.tcl @@ -873,7 +873,7 @@ proc create_root_design { parentCell } { assign_bd_address -offset 0x0011F0000000 -range 0x10000000 -target_address_space [get_bd_addr_spaces occamy/m_axi_hbm_7] [get_bd_addr_segs hbm_0/SAXI_28/HBM_MEM31] -force assign_bd_address -offset 0x4CC00000 -range 0x00400000 -target_address_space [get_bd_addr_spaces occamy/m_axi_pcie] [get_bd_addr_segs hbm_0/SAPB_0/Reg] -force assign_bd_address -offset 0x4C800000 -range 0x00400000 -target_address_space [get_bd_addr_spaces occamy/m_axi_pcie] [get_bd_addr_segs hbm_0/SAPB_1/Reg] -force - assign_bd_address -offset 0x20000000 -range 0x00100000 -target_address_space [get_bd_addr_spaces occamy/m_axi_pcie] [get_bd_addr_segs xdma_0/S_AXI_B/BAR0] -force + assign_bd_address -offset 0x20000000 -range 0x20000000 -target_address_space [get_bd_addr_spaces occamy/m_axi_pcie] [get_bd_addr_segs xdma_0/S_AXI_B/BAR0] -force assign_bd_address -offset 0x00000000 -range 0x20000000 -target_address_space [get_bd_addr_spaces occamy/m_axi_pcie] [get_bd_addr_segs xdma_0/S_AXI_LITE/CTL0] -force assign_bd_address -offset 0x00000000 -range 0x0001000000000000 -target_address_space [get_bd_addr_spaces xdma_0/M_AXI_B] [get_bd_addr_segs occamy/s_axi_pcie/reg0] -force @@ -1163,7 +1163,6 @@ proc create_root_design { parentCell } { # Restore current instance current_bd_instance $oldCurInst - validate_bd_design save_bd_design } # End of create_root_design() @@ -1176,3 +1175,5 @@ proc create_root_design { parentCell } { create_root_design "" +common::send_gid_msg 
-ssname BD::TCL -id 2053 -severity "WARNING" "This Tcl script was generated from a block design that has not been validated. It is possible that design <$design_name> may result in errors during validation." + diff --git a/target/fpga/occamy_vcu128_procs.tcl b/target/fpga/occamy_vcu128_procs.tcl index 7a3a90c9f..6e2054c39 100644 --- a/target/fpga/occamy_vcu128_procs.tcl +++ b/target/fpga/occamy_vcu128_procs.tcl @@ -24,6 +24,17 @@ proc target_02 {} { set occ_bit_stem occamy_vcu128/occamy_vcu128.runs/impl_1/occamy_vcu128_wrapper } +proc target_03 {} { + global occ_hw_server + global occ_target_serial + global occ_hw_device + global occ_bit_stem + set occ_hw_server bordcomputer:3233 + set occ_target_serial 12309159258A + set occ_hw_device xcvu37p_0 + set occ_bit_stem occamy_vcu128/occamy_vcu128.runs/impl_1/occamy_vcu128_wrapper +} + proc occ_connect { } { global occ_hw_server global occ_target_serial @@ -144,4 +155,4 @@ proc occ_flash_spi { mcs_file flash_offset flash_file } { # Program SPI flash puts "Programing SPI flash" program_hw_cfgmem -hw_cfgmem $hw_cfgmem -} \ No newline at end of file +} diff --git a/target/sim/Makefile b/target/sim/Makefile index fae9ac2f0..4585b2e17 100644 --- a/target/sim/Makefile +++ b/target/sim/Makefile @@ -424,7 +424,9 @@ PLATFORM_HEADERS += $(PLATFORM_HEADERS_DIR)/snitch_cluster_peripheral.h PLATFORM_HEADERS += $(PLATFORM_HEADERS_DIR)/snitch_quad_peripheral.h PLATFORM_HEADERS += $(PLATFORM_HEADERS_DIR)/snitch_hbm_xbar_peripheral.h -.PHONY: sw clean-headers clean-sw +.PHONY: sw all-headers clean-headers clean-sw + +all-headers: $(PLATFORM_HEADERS) sw: $(PLATFORM_HEADERS) $(MAKE) -C sw/ all diff --git a/target/sim/sw/device/apps/common.mk b/target/sim/sw/device/apps/common.mk index c29aa94d8..f6b5d7333 100644 --- a/target/sim/sw/device/apps/common.mk +++ b/target/sim/sw/device/apps/common.mk @@ -32,7 +32,9 @@ BUILDDIR = $(abspath build) # Dependencies INCDIRS += $(RUNTIME_DIR)/src INCDIRS += $(SNRT_DIR)/api +INCDIRS += 
$(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src +INCDIRS += $(SNRT_DIR)/src/omp INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes INCDIRS += $(SW_DIR)/shared/platform/generated INCDIRS += $(SW_DIR)/shared/platform diff --git a/target/sim/sw/device/apps/libomptarget_device/Makefile b/target/sim/sw/device/apps/libomptarget_device/Makefile index f08716972..171b4f9a1 100644 --- a/target/sim/sw/device/apps/libomptarget_device/Makefile +++ b/target/sim/sw/device/apps/libomptarget_device/Makefile @@ -9,7 +9,7 @@ MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) SRC_DIR := $(realpath $(MK_DIR)/src) APP ?= omptarget_device -SRCS ?= $(SRC_DIR)/main.c +SRCS ?= $(SRC_DIR)/main.c $(SRC_DIR)/sw_mailbox.c INCDIRS += $(SRC_DIR) .PHONY: clean @@ -21,10 +21,12 @@ OBJS := $(subst $(SRC_DIR), $(BUILDDIR), $(SRCS:.c=.o)) LIB := $(BUILDDIR)/libomptarget_device.a $(BUILDDIR)/origin.ld: | $(BUILDDIR) - echo "L3_ORIGIN = 0x80000000;" > $(BUILDDIR)/origin.ld + echo "L3_ORIGIN = 0xC0000000;" > $(BUILDDIR)/origin.ld -$(LIB): $(OBJS) | $(BUILDDIR) - $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ +# We first extract objects from libnnruntime and then link them with our objects +$(BUILDDIR)/libomptarget_device.a: $(OBJS) | $(BUILDDIR) + cd $(BUILDDIR) && $(RISCV_AR) -x $(SNRT_LIB_DIR)/lib$(SNRT_LIB_NAME).a + $(RISCV_AR) $(RISCV_ARFLAGS) $@ $(BUILDDIR)/*.o # For this target, only build the library all: $(LIB) diff --git a/target/sim/sw/device/apps/libomptarget_device/link.ld b/target/sim/sw/device/apps/libomptarget_device/link.ld index df1730b69..0f54b2016 100644 --- a/target/sim/sw/device/apps/libomptarget_device/link.ld +++ b/target/sim/sw/device/apps/libomptarget_device/link.ld @@ -1,181 +1,130 @@ -OUTPUT_ARCH(riscv) -ENTRY( _start ) +/* Copyright 2020 ETH Zurich and University of Bologna. */ +/* Solderpad Hardware License, Version 0.51, see LICENSE for details. 
*/ +/* SPDX-License-Identifier: SHL-0.51 */ + +OUTPUT_ARCH( "riscv" ) +ENTRY(_start) + +/* Memory section should be provided in a separate, platform-specific */ +/* file. It should define at least the L1 and L3 memory blocks. */ MEMORY { - LOCAL : ORIGIN = 0x70010000, LENGTH = 0x70000 - L2 : ORIGIN = 0x80000000, LENGTH = 0x0fffffff + L3 : ORIGIN = 0xC0000000, LENGTH = 0x800000 } - - SECTIONS { - .init : - { - . = ALIGN(4); - KEEP( *(.init) ) - } > LOCAL - - .fini : + /* Program code goes into L3 */ + .text : { . = ALIGN(4); - KEEP( *(.fini) ) - } > LOCAL - - .preinit_array : { - . = ALIGN(4); - PROVIDE_HIDDEN (__preinit_array_start = .); - KEEP (*(.preinit_array)) - PROVIDE_HIDDEN (__preinit_array_end = .); - } > LOCAL - - .init_array : { - . = ALIGN(4); - PROVIDE_HIDDEN (__init_array_start = .); - __CTOR_LIST__ = .; - LONG((__CTOR_END__ - __CTOR_LIST__) / 4 - 2) - KEEP(*(.ctors.start)) - KEEP(*(.ctors)) - KEEP (*(SORT(.init_array.*))) - KEEP (*(.init_array )) - LONG(0) - __CTOR_END__ = .; - PROVIDE_HIDDEN (__init_array_end = .); - } > LOCAL - - .fini_array : { - . = ALIGN(4); - PROVIDE_HIDDEN (__fini_array_start = .); - __DTOR_LIST__ = .; - LONG((__DTOR_END__ - __DTOR_LIST__) / 4 - 2) - KEEP(*(.dtors.start)) - KEEP(*(.dtors)) - LONG(0) - __DTOR_END__ = .; - KEEP (*(SORT(.fini_array.*))) - KEEP (*(.fini_array )) - PROVIDE_HIDDEN (__fini_array_end = .); - } > LOCAL - - .boot : { - . = ALIGN(4); - *(.boot) - *(.boot.data) - } > LOCAL - - .rodata : { - . = ALIGN(4); - *(.rodata); - *(.rodata.*) - *(.srodata); - *(.srodata.*) - *(.eh_frame*) - } > LOCAL - - .shbss : { - . = ALIGN(4); - *(.shbss) - } > LOCAL - - .talias : { - } > LOCAL - - .gnu.offload_funcs : { - . = ALIGN(4); - KEEP(*(.gnu.offload_funcs)) - } > LOCAL - - .gnu.offload_vars : { - . = ALIGN(4); - KEEP(*(.gnu.offload_vars)) - } > LOCAL - - .stack : { + *(.init) + *(.text.init) + *(.text.startup) + *(.text) + *(.text*) + *(.text) . = ALIGN(4); - . = ALIGN(16); - stack_start = .; - . = . 
+ 0x1000; - stack = .; - } > LOCAL + _etext = .; + } >L3 - .data : { - . = ALIGN(4); - sdata = .; - _sdata = .; - *(.data_fc) - *(.data_fc.*) - *(.data); - *(.data.*) - *(.sdata); - *(.sdata.*) - *(.heapl2ram) - *(.fcTcdm) - *(.fcTcdm.*) - *(.fcTcdm_g) - *(.fcTcdm_g.*) + /* By default, constant data goes into L3, right after code section */ + .rodata : + { . = ALIGN(4); - edata = .; - _edata = .; - } > LOCAL - - .bss : { - . = ALIGN(8); - _bss_start = .; - *(.bss) - *(.bss.*) - *(.sbss) - *(.sbss.*) - *(COMMON) + *(.rodata) + *(.rodata*) . = ALIGN(4); - _bss_end = .; - } > LOCAL - - __l2_priv0_end = ALIGN(4); + } >L3 - /* - * LOCAL PRIVATE BANK1 - * - * Contains FC code - */ + /* HTIF section for FESVR */ + .htif : { } >L3 - .vectors MAX(0x60010000,ALIGN(256)) : + /* Thread Local Storage sections */ + .tdata : { - __irq_vector_base = .; - KEEP(*(.vectors)) - } > LOCAL - - .text : + __tdata_start = .; + *(.tdata .tdata.* .gnu.linkonce.td.*) + __tdata_end = .; + } >L3 + .tbss : { - . = ALIGN(4); - _stext = .; - *(.text) - *(.text.*) - _etext = .; - *(.lit) - *(.shdata) - _endtext = .; - . = ALIGN(4); - } > LOCAL - - __l2_priv1_end = ALIGN(4); + __tbss_start = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + __tbss_end = .; + } >L3 + + /* Cluster Local Storage sections */ + .cdata : + { + __cdata_start = .; + *(.cdata .cdata.*) + __cdata_end = .; + } >L3 + .cbss : + { + __cbss_start = .; + *(.cbss .cbss.*) + __cbss_end = .; + } >L3 - /* - * L2 SHARED BANKS - * - * Contains other data such as peripheral data and cluster code and data - */ + /* used by the startup to initialize data */ + _sidata = LOADADDR(.data); - .l2_data : + /* small data section that can be addressed through the global pointer */ + .sdata : { - . = ALIGN(4); - *(.l2_data) - *(.l2_data.*) - *(.data_fc_shared) - *(.data_fc_shared.*) - . = ALIGN(4); - } > L2 + __SDATA_BEGIN__ = .; + __global_pointer$ = . 
+ 0x7f0; + *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) + *(.sdata .sdata.* .gnu.linkonce.s.*) + } >L3 + + /* Initialized data sections goes into L3 */ + .data : + { + __DATA_BEGIN__ = .; + *(.data .data.* .gnu.linkonce.d.*) + SORT(CONSTRUCTORS) + } >L3 + _edata = .; PROVIDE (edata = .); + + /* small bss section */ + . = .; + __bss_start = .; + .sbss : + { + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } >L3 - __l2_shared_end = ALIGN(4); + /* Uninitialized data section */ + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ + . = ALIGN(. != 0 ? 32 / 8 : 1); + } >L3 + . = ALIGN(32 / 8); + . = SEGMENT_START("ldata-segment", .); + . = ALIGN(32 / 8); + __BSS_END__ = .; + __bss_end = .; + _end = .; PROVIDE (end = .); + + /* Uninitialized data section in L3 */ + .dram : + { + *(.dram) + _edram = .; + } >L3 + __uart = 0x2002000; } diff --git a/target/sim/sw/device/apps/libomptarget_device/src/debug.h b/target/sim/sw/device/apps/libomptarget_device/src/debug.h new file mode 100644 index 000000000..dd439f4c7 --- /dev/null +++ b/target/sim/sw/device/apps/libomptarget_device/src/debug.h @@ -0,0 +1,77 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t snrt_log_level; + +static inline void snrt_debug_set_loglevel(uint32_t lvl) { snrt_log_level = lvl; }; + +#define LOG_ERROR 0 +#define LOG_WARN 1 +#define LOG_INFO 2 +#define LOG_DEBUG 3 +#define LOG_TRACE 4 + +#if defined(DEBUG) + +#define snrt_error(fmt, ...) 
\ + ({ \ + if (LOG_ERROR <= snrt_log_level) \ + snrt_printf("[\033[31msnrt(%d,%d):error:%s\033[0m] " fmt, snrt_cluster_idx(), \ + snrt_cluster_core_idx(), __func__, ##__VA_ARGS__); \ + }) +#define snrt_warn(fmt, ...) \ + ({ \ + if (LOG_WARN <= snrt_log_level) \ + snrt_printf("[\033[91msnrt(%d,%d):warn:%s\033[0m] " fmt, snrt_cluster_idx(), \ + snrt_cluster_core_idx(), __func__, ##__VA_ARGS__); \ + }) +#define snrt_info(fmt, ...) \ + ({ \ + if (LOG_INFO <= snrt_log_level) \ + snrt_printf("[\033[33msnrt(%d,%d):info:%s\033[0m] " fmt, snrt_cluster_idx(), \ + snrt_cluster_core_idx(), __func__, ##__VA_ARGS__); \ + }) +#define snrt_debug(fmt, ...) \ + ({ \ + if (LOG_DEBUG <= snrt_log_level) \ + snrt_printf("[\033[35msnrt(%d,%d):debug:%s\033[0m] " fmt, snrt_cluster_idx(), \ + snrt_cluster_core_idx(), __func__, ##__VA_ARGS__); \ + }) +#define snrt_trace(fmt, ...) \ + ({ \ + if (LOG_TRACE <= snrt_log_level) \ + snrt_printf("[\033[96msnrt(%d,%d):trace:%s\033[0m] " fmt, snrt_cluster_idx(), \ + snrt_cluster_core_idx(), __func__, ##__VA_ARGS__); \ + }) + +#else // #if defined(DEBUG) + +#define snrt_error(x...) \ + do { \ + } while (0) +#define snrt_warn(x...) \ + do { \ + } while (0) +#define snrt_info(x...) \ + do { \ + } while (0) +#define snrt_debug(x...) \ + do { \ + } while (0) +#define snrt_trace(x...) \ + do { \ + } while (0) + +#endif // defined(DEBUG) + +#ifdef __cplusplus +} +#endif diff --git a/target/sim/sw/device/apps/libomptarget_device/src/main.c b/target/sim/sw/device/apps/libomptarget_device/src/main.c index 379d14a6e..dc28136fa 100644 --- a/target/sim/sw/device/apps/libomptarget_device/src/main.c +++ b/target/sim/sw/device/apps/libomptarget_device/src/main.c @@ -1,4 +1,315 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +#include -int main() { - return 0; -} \ No newline at end of file +#include "sw_mailbox.h" +#include "snrt.h" + +//================================================================================ +// MACROS AND SETTINGS +//================================================================================ + +// set to >0 for debugging +#define DEBUG_LEVEL_OFFLOAD_MANAGER 1 + +const uint32_t active_pe = 8; + +/* MAILBOX SIGNALING */ +#define MBOX_DEVICE_READY (0x01U) +#define MBOX_DEVICE_START (0x02U) +#define MBOX_DEVICE_BUSY (0x03U) +#define MBOX_DEVICE_DONE (0x04U) +#define MBOX_DEVICE_STOP (0x0FU) +#define MBOX_DEVICE_LOGLVL (0x10U) +#define MBOX_HOST_READY (0x1000U) +#define MBOX_HOST_DONE (0x3000U) + +#define TO_RUNTIME (0x10000000U) // bypass PULP driver +#define RAB_UPDATE (0x20000000U) // handled by PULP driver +#define RAB_SWITCH (0x30000000U) // handled by PULP driver + +//================================================================================ +// TYPES +//================================================================================ + +// Shrinked gomp_team_t descriptor +typedef struct offload_rab_miss_handler_desc_s { + void (*omp_task_f)(void *arg, uint32_t argc); + void *omp_args; + void *omp_argc; + int barrier_id; +} offload_rab_miss_handler_desc_t; + +typedef uint32_t virt_addr_t; +typedef uint32_t virt_pfn_t; + +// This struct represents a miss in the RAB Miss Hardware FIFO. 
+typedef struct rab_miss_t { + virt_addr_t virt_addr; + int core_id; + int cluster_id; + int intra_cluster_id; + uint8_t is_prefetch; +} rab_miss_t; + +//================================================================================ +// Data +//================================================================================ +static volatile uint32_t g_printf_mutex = 0; + +static volatile uint32_t *soc_scratch = (uint32_t *)(0x02000014); +struct l3_layout l3l; + +const uint32_t snrt_stack_size __attribute__((weak, section(".rodata"))) = 12; + +// The boot data generated along with the system RTL. +// See `hw/system/snitch_cluster/test/tb_lib.hh` for details. +struct snrt_cluster_bootdata { + uint32_t boot_addr; + uint32_t core_count; + uint32_t hartid_base; + uint32_t tcdm_start; + uint32_t tcdm_size; + uint32_t tcdm_offset; + uint64_t global_mem_start; + uint64_t global_mem_end; + uint32_t cluster_count; + uint32_t s1_quadrant_count; + uint32_t clint_base; +}; + +/** + * @brief Called by each hart before the pre-main barrier in snrt crt0 + * + */ +void _snrt_hier_wakeup(void) { + const uint32_t core_id = snrt_cluster_core_idx(); + + // master core wakes other cluster cores through cluster local clint + if (core_id == 0) { + // clear the interrupt from cva6 + snrt_int_sw_clear(snrt_hartid()); + // wake remaining cluster cores + const unsigned cluster_core_num = snrt_cluster_core_num(); + snrt_int_cluster_set(~0x1 & ((1 << cluster_core_num) - 1)); + } else { + // clear my interrupt + snrt_int_cluster_clr(1 << core_id); + } +} + +//================================================================================ +// TODO: Symbols to declare somewhere else on a merge +//================================================================================ +/** + * @brief A re-entrant wrapper to printf + * + */ +void snrt_printf(const char *format, ...) 
{ + va_list args; + + snrt_mutex_acquire(&g_printf_mutex); + + va_start(args, format); + vprintf(format, args); + va_end(args); + + snrt_mutex_release(&g_printf_mutex); +} + +//================================================================================ +// HERO Functions +//================================================================================ + +static void offload_rab_misses_handler(void *arg, uint32_t argc) { + (void)arg; + (void)argc; + snrt_err("unimplemented!\r\n"); + // static void offload_rab_misses_handler(uint32_t *status) { + // uint32_t *status = (uint32_t)arg; + // if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("offload_rab_misses_handler: synch @%p (0x%x)\n", status, + // *(volatile unsigned int *)status); + // do { + // handle_rab_misses(); + // } while (*((volatile uint32_t *)status) != 0xdeadbeefU); + // if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("offload_rab_misses_handler: synch @%p (0x%x)\n", status, + // *(volatile unsigned int *)status); +} + +static int gomp_offload_manager() { + const uint32_t core_id = snrt_cluster_core_idx(); + + // Init the manager (handshake btw host and accelerator is here) + // gomp_init_offload_manager(); + + // FIXME For the moment we are not using the cmd sent as trigger. + // It should be used to perform the deactivation of the accelerator, + // as well as other operations, like local data allocation or movement. + // FIXME Note that the offload at the moment uses the mailbox several times. + // We should compact the offload descriptor and just send a pointer to + // that descriptor. 
+ uint32_t cmd = (uint32_t)NULL; + uint32_t data; + + // Offloaded function pointer and arguments + void (*offloadFn)(uint64_t) = NULL; + uint64_t offloadArgs = 0x0; + unsigned nbOffloadRabMissHandlers = 0x0; + uint32_t offload_rab_miss_sync = 0x0U; + // offload_rab_miss_handler_desc_t rab_miss_handler = {.omp_task_f = offload_rab_misses_handler, + // .omp_args = (void *)&offload_rab_miss_sync, + // .omp_argc = 1, + // .barrier_id = -1}; + + int cycles = 0; + uint32_t issue_fpu, dma_busy; + rab_miss_t rab_miss; + // reset_vmm(); + + while (1) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Waiting for command...\n"); + + // (1) Wait for the offload trigger cmd == MBOX_DEVICE_START + mailbox_read((unsigned int *)&cmd, 1); + if (MBOX_DEVICE_STOP == cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got MBOX_DEVICE_STOP from host, stopping execution now.\n"); + break; + } else if (MBOX_DEVICE_LOGLVL == cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got command 0x%x, setting log level.\n", cmd); + mailbox_read((unsigned int *)&data, 1); + //snrt_debug_set_loglevel(data); + continue; + } else if (MBOX_DEVICE_START != cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got unexpected command 0x%x, stopping execution now.\n", cmd); + break; + } + + // (2) The host sends through the mailbox the pointer to the function that should be + // executed on the accelerator. + mailbox_read((unsigned int *)&offloadFn, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("tgt_fn @ 0x%x\n", (unsigned int)offloadFn); + + // (3) The host sends through the mailbox the pointer to the arguments that should + // be used. 
+ mailbox_read((unsigned int *)&offloadArgs, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("tgt_vars @ 0x%x\n", (unsigned int)offloadArgs); + + // (3b) The host sends through the mailbox the number of rab misses handlers threads + mailbox_read((unsigned int *)&nbOffloadRabMissHandlers, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("nbOffloadRabMissHandlers %d/%d\n", nbOffloadRabMissHandlers, active_pe); + + // (3c) Spawning nbOffloadRabMissHandlers + unsigned mhCoreMask = 0; + nbOffloadRabMissHandlers = + nbOffloadRabMissHandlers < active_pe - 1 ? nbOffloadRabMissHandlers : active_pe - 1; + if (nbOffloadRabMissHandlers) { + offload_rab_miss_sync = 0x0U; + for (int pid = active_pe - 1, i = nbOffloadRabMissHandlers; i > 0; i--, pid--) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("enabling RAB miss handler on %d\n", pid); + mhCoreMask |= (1 << pid); + } + } + omp_getData()->maxThreads = active_pe - nbOffloadRabMissHandlers; + omp_getData()->numThreads = active_pe - nbOffloadRabMissHandlers; + // eu_dispatch_team_config(mhCoreMask); + // eu_dispatch_push((unsigned int)&offload_rab_misses_handler); + // eu_dispatch_push((unsigned int)&offload_rab_miss_sync); + // eu_dispatch_team_config(omp_getData()->coreMask); + + // (4) Ensure access to offloadArgs. It might be in SVM. + if (offloadArgs != 0x0) { + // FIXME + // pulp_tryread((unsigned int *)offloadArgs); + } + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("begin offloading\n"); + // reset_timer(); + // start_timer(); + + //for (unsigned i = 0; i < 16; i += 2) { + // snrt_trace(" %2d: 0x%08x = ... ; %2d: 0x%08x = ...\n", i, ((uint32_t *)offloadArgs)[i], + // /* *((uint32_t *)(((uint32_t *)offloadArgs)[i])) ,*/ i + 1, + // ((uint32_t *)offloadArgs)[i + 1] /*, *((uint32_t *)(((uint32_t *)offloadArgs)[i + 1]))*/ ); + //} + + // (5) Execute the offloaded function. 
+ // snrt_reset_perf_counter(SNRT_PERF_CNT0); + // snrt_reset_perf_counter(SNRT_PERF_CNT1); + // snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_ISSUE_FPU, core_id); + // snrt_start_perf_counter(SNRT_PERF_CNT1, SNRT_PERF_CNT_DMA_BUSY, core_id); + cycles = read_csr(mcycle); + + offloadFn(offloadArgs); + + cycles = read_csr(mcycle) - cycles; + // snrt_stop_perf_counter(SNRT_PERF_CNT0); + // snrt_stop_perf_counter(SNRT_PERF_CNT1); + // issue_fpu = snrt_get_perf_counter(SNRT_PERF_CNT0); + // dma_busy = snrt_get_perf_counter(SNRT_PERF_CNT1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("end offloading\n"); + + // (6) Report EOC and profiling + //snrt_info("cycles: %d\r\n", cycles); + + mailbox_write(MBOX_DEVICE_DONE); + mailbox_write(cycles); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Kernel execution time [Snitch cycles] = %d\n", cycles); + + if (nbOffloadRabMissHandlers) { + offload_rab_miss_sync = 0xdeadbeefU; + // gomp_atomic_add_thread_pool_idle_cores(nbOffloadRabMissHandlers); + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + (void)argc; + (void)argv; + unsigned core_idx = snrt_cluster_core_idx(); + unsigned core_num = snrt_cluster_core_num(); + + /** + * One core initializes the global data structures + */ + if (snrt_is_dm_core()) { + // read memory layout from scratch2 + memcpy(&l3l, (void *)soc_scratch[2], sizeof(struct l3_layout)); + g_a2h_rb = (struct ring_buf *)l3l.a2h_rb; + g_a2h_mbox = (struct ring_buf *)l3l.a2h_mbox; + g_h2a_mbox = (struct ring_buf *)l3l.h2a_mbox; + } + + snrt_cluster_hw_barrier(); + + __snrt_omp_bootstrap(core_idx); + + //snrt_trace("omp_bootstrap complete, core_idx: %d core_num: %d\n", core_idx, core_num); + + gomp_offload_manager(); + + //snrt_trace("bye\n"); + // exit + __snrt_omp_destroy(core_idx); + snrt_hero_exit(0); + return 0; +} diff --git a/target/sim/sw/device/apps/libomptarget_device/src/main.o b/target/sim/sw/device/apps/libomptarget_device/src/main.o deleted file mode 
100644
index f7685fc0f66ac871631e2620f16810365c78337c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1200
zcmbtTJx>Bb5S;_W55yp`Fu?*VHPMYRF;NS1h5$y5g^`lzfoCv!V7M#LMk^CLOMZ&I
zf5JatufIU&9lN@$ca2WMyqUN2c4l{vyYfLLVOgZiqHX%g6if83k#;g~OT|sGCRRbuz_D4kM`ZPKR$+2rsi^m!A+C1SIQRO)oZP5$?X%UVM6An!&
z{2MQD8YKLnJHVET0=E~CICBDrM8o$wwY^p0$1<@toVV`yd=?euN>jFPR_fI
z=Z1a$>k7O9-oU>WWAK%xl^gxkf^}?FO+!!Uz@E8B;LzLvoePcZ>02!N153h
pl0F>U>i}q+5n59Z-n@HAu4Vr$63xu>PxD%ozAp=pH7!C@{{{c4ILQD2

diff --git a/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.c b/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.c
new file mode 100644
index 000000000..4ebfe486e
--- /dev/null
+++ b/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.c
@@ -0,0 +1,95 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sw_mailbox.h"  // ring buffer helpers and SYS_* request numbers
+#include "snrt.h"        // snrt_mcycle
+
+/***********************************************************************************
+ * DATA
+ ***********************************************************************************/
+// Ring buffers shared with the host. Only the pointers are defined here; they
+// must be assigned before any function below is called.
+volatile struct ring_buf *g_a2h_rb;    // requests queued by syscall()
+volatile struct ring_buf *g_a2h_mbox;  // words sent by mailbox_write()
+volatile struct ring_buf *g_h2a_mbox;  // words read by mailbox_(try_)read()
+
+/***********************************************************************************
+ * FUNCTIONS
+ ***********************************************************************************/
+
+/**
+ * @brief Busy-wait until snrt_mcycle() has advanced by at least `cycles`.
+ */
+__attribute__((optimize("O0"))) void csleep(uint32_t cycles) {
+    uint32_t start = snrt_mcycle();
+    while ((snrt_mcycle() - start) < cycles) {}
+}
+
+/**
+ * @brief Queue a syscall-style request for the host on `g_a2h_rb`.
+ *
+ * The request number and its five arguments are packed into six 64-bit words
+ * and pushed as a single ring-buffer element. While the buffer is full the
+ * call sleeps and retries, so it only returns once the request is queued.
+ *
+ * @return number of retries that were needed (0 if the first attempt worked)
+ */
+int syscall(uint64_t which, uint64_t arg0, uint64_t arg1, uint64_t arg2,
+            uint64_t arg3, uint64_t arg4) {
+    uint64_t magic_mem[6] = {which, arg0, arg1, arg2, arg3, arg4};
+    volatile struct ring_buf *rb = g_a2h_rb;
+    uint32_t retries = 0;
+
+    while (rb_device_put(rb, (void *)magic_mem) != 0) {
+        ++retries;
+        csleep(1000000);
+    }
+    return retries;
+}
+
+/**
+ * @brief Write `c` to the memory-mapped register at 0x2002000, then wait
+ * 10000 cycles before returning.
+ *
+ * NOTE(review): the address is presumably a UART transmit register and the
+ * fixed delay a substitute for polling its status; confirm against the
+ * platform address map. Forwarding the character to the host with
+ * syscall(SYS_write, 1, c, 1, 0, 0) is the currently disabled alternative.
+ */
+void snrt_putchar(char c) {
+    *(volatile uint32_t *)0x2002000 = c;
+    csleep(10000);
+}
+
+/**
+ * @brief Queue a SYS_exit request carrying `code` for the host.
+ */
+void snrt_hero_exit(int code) { syscall(SYS_exit, code, 0, 0, 0, 0); }
+
+/***********************************************************************************
+ * MAILBOX
+ ***********************************************************************************/
+
+int mailbox_try_read(uint32_t *buffer) {
+    return rb_device_get(g_h2a_mbox, buffer) == 0 ? 1 : 0;
+}
+
+int mailbox_read(uint32_t *buffer, size_t n_words) {
+    // Store the words in arrival order: the i-th word popped goes to
+    // buffer[i]. Each word is polled for until the mailbox provides it.
+    for (size_t i = 0; i < n_words; i++) {
+        while (rb_device_get(g_h2a_mbox, &buffer[i]) != 0) {
+            csleep(1000000);
+        }
+    }
+    return 0;
+}
+
+int mailbox_write(uint32_t word) {
+    // Retry until the ring buffer has a free slot for `word`.
+    while (rb_device_put(g_a2h_mbox, &word) != 0) {
+        csleep(10000);
+    }
+    return 0;
+}
diff --git a/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.h b/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.h
new file mode 100644
index 000000000..8d532e46e
--- /dev/null
+++ b/target/sim/sw/device/apps/libomptarget_device/src/sw_mailbox.h
@@ -0,0 +1,197 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include <inttypes.h>  // PRIx64
+#include <stddef.h>    // size_t
+#include <stdint.h>    // uintN_t, uintptr_t
+#include <stdio.h>     // printf
+
+/***********************************************************************************
+ * MACROS
+ ***********************************************************************************/
+
+#define MBOX_DEVICE_READY (0x01U)
+#define MBOX_DEVICE_START (0x02U)
+#define MBOX_DEVICE_BUSY (0x03U)
+#define MBOX_DEVICE_DONE (0x04U)
+#define MBOX_DEVICE_STOP (0x0FU)
+#define MBOX_DEVICE_LOGLVL (0x10U)
+#define MBOX_HOST_READY (0x1000U)
+#define MBOX_HOST_DONE (0x3000U)
+
+#define SYS_exit 60
+#define SYS_write 64
+#define SYS_read 63
+#define SYS_wake 1235
+#define SYS_cycle 1236
+
+/***********************************************************************************
+ * TYPES
+ ***********************************************************************************/
+
+/**
+ * @brief Ring buffer for simple communication between accelerator and host.
+ * @tail: Index of the element in the data buffer which is read next
+ * @head: Index of the element in the data buffer which is written next
+ * @size: Number of elements in the data buffer. Head and tail wrap at `size`
+ * @element_size: Size of each element in bytes
+ * @data_p: Base of the data buffer in the physical address space
+ * @data_v: Base of the data buffer in the virtual address space
+ */
+struct ring_buf {
+    uint32_t head;
+    uint32_t size;
+    uint32_t tail;
+    uint32_t element_size;
+    uint64_t data_v;
+    uint64_t data_p;
+};
+
+/***********************************************************************************
+ * DATA
+ ***********************************************************************************/
+extern volatile struct ring_buf *g_a2h_rb;
+extern volatile struct ring_buf *g_a2h_mbox;
+extern volatile struct ring_buf *g_h2a_mbox;
+
+/***********************************************************************************
+ * INLINES
+ ***********************************************************************************/
+
+/** @brief Debug helper: print the raw bytes of *rbuf, then its fields. */
+static inline void dump_mbox(struct ring_buf *rbuf) {
+    const uint8_t *addr = (const uint8_t *)rbuf;
+    printf("---DUMPING NOW---\n\r");
+    printf("mbox (%x)\n\r", (unsigned)(uintptr_t)rbuf);
+    for (size_t i = 0; i < sizeof(struct ring_buf); i++) {
+        if (i % 8 == 0) printf("\n\r(%x) : ", (unsigned)(uintptr_t)addr);
+        printf("%x-", *(addr++));
+    }
+    printf("\n\r");
+    printf("head : %#x = %u\n\r", (unsigned)(uintptr_t)&rbuf->head, (unsigned)rbuf->head);
+    printf("size : %#x = %u\n\r", (unsigned)(uintptr_t)&rbuf->size, (unsigned)rbuf->size);
+    printf("tail : %#x = %u\n\r", (unsigned)(uintptr_t)&rbuf->tail, (unsigned)rbuf->tail);
+    printf("element_size : %#x = %u\n\r", (unsigned)(uintptr_t)&rbuf->element_size, (unsigned)rbuf->element_size);
+    printf("data_p : %#x = %" PRIx64 "\n\r", (unsigned)(uintptr_t)&rbuf->data_p, rbuf->data_p);
+    printf("data_v : %#x = %" PRIx64 "\n\r", (unsigned)(uintptr_t)&rbuf->data_v, rbuf->data_v);
+    printf("---DUMPING ENDS---\n\r");
+}
+
+/**
+ * @brief Copy data from `el` in the next free slot in the ring-buffer on the
+ * physical addresses
+ *
+ * @param rb pointer to the ring buffer struct
+ * @param el pointer to the data to be copied into the ring buffer
+ * @return int 0 on success, -1 if the buffer is full
+ */
+static inline int rb_device_put(volatile struct ring_buf *rb, void *el) {
+    uint32_t next_head = (rb->head + 1) % rb->size;
+    // caught the tail, can't put data
+    if (next_head == rb->tail)
+        return -1;
+    for (uint32_t i = 0; i < rb->element_size; i++)
+        *((uint8_t *)(uintptr_t)rb->data_p + rb->element_size * rb->head + i) =
+            *((uint8_t *)el + i);
+    rb->head = next_head;
+    return 0;
+}
+/**
+ * @brief Pop element from ring buffer on virtual addresses
+ *
+ * @param rb pointer to ring buffer struct
+ * @param el pointer to where element is copied to
+ * @return 0 on success, -1 if no element could be popped
+ */
+static inline int rb_host_get(volatile struct ring_buf *rb, void *el) {
+    // caught the head, can't get data
+    if (rb->tail == rb->head)
+        return -1;
+    for (uint32_t i = 0; i < rb->element_size; i++)
+        *((uint8_t *)el + i) =
+            *((uint8_t *)(uintptr_t)rb->data_v + rb->element_size * rb->tail + i);
+    rb->tail = (rb->tail + 1) % rb->size;
+    return 0;
+}
+
+/**
+ * @brief Copy data from `el` in the next free slot in the ring-buffer on the
+ * virtual addresses
+ *
+ * @param rb pointer to the ring buffer struct
+ * @param el pointer to the data to be copied into the ring buffer
+ * @return int 0 on success, -1 if the buffer is full
+ */
+static inline int rb_host_put(volatile struct ring_buf *rb, void *el) {
+    uint32_t next_head = (rb->head + 1) % rb->size;
+    // caught the tail, can't put data
+    if (next_head == rb->tail)
+        return -1;
+    for (uint32_t i = 0; i < rb->element_size; i++)
+        *((uint8_t *)(uintptr_t)rb->data_v + rb->element_size * rb->head + i) =
+            *((uint8_t *)el + i);
+    rb->head = next_head;
+    return 0;
+}
+/**
+ * @brief Pop element from ring buffer on physical addresses
+ *
+ * @param rb pointer to ring buffer struct
+ * @param el pointer to where element is copied to
+ * @return 0 on success, -1 if no element could be popped
+ */
+static inline int rb_device_get(volatile struct ring_buf *rb, void *el) {
+    // caught the head, can't get data
+    if (rb->tail == rb->head)
+        return -1;
+    for (uint32_t i = 0; i < rb->element_size; i++)
+        *((uint8_t *)el + i) =
+            *((uint8_t *)(uintptr_t)rb->data_p + rb->element_size * rb->tail + i);
+    rb->tail = (rb->tail + 1) % rb->size;
+    return 0;
+}
+/**
+ * @brief Init the ring buffer indices and geometry (data_p/data_v untouched)
+ */
+static inline void rb_init(volatile struct ring_buf *rb, uint64_t size,
+                           uint64_t element_size) {
+    rb->tail = 0;
+    rb->head = 0;
+    rb->size = (uint32_t)size;
+    rb->element_size = (uint32_t)element_size;
+}
+
+/**
+ * @brief Holds physical addresses of the shared L3
+ * @a2h_rb: accelerator to host ring buffer
+ * @a2h_mbox, @h2a_mbox: accelerator-to-host and host-to-accelerator mailboxes
+ * @heap: base of heap memory
+ */
+struct l3_layout {
+    uint32_t a2h_rb;
+    uint32_t a2h_mbox;
+    uint32_t h2a_mbox;
+    uint32_t heap;
+};
+
+/***********************************************************************************
+ * PUBLICS
+ ***********************************************************************************/
+int syscall(uint64_t which, uint64_t arg0, uint64_t arg1, uint64_t arg2,
+            uint64_t arg3, uint64_t arg4);
+void csleep(uint32_t cycles);
+void snrt_hero_exit(int code);
+/**
+ * @brief Blocking mailbox read access
+ */
+int mailbox_read(uint32_t *buffer, size_t n_words);
+/**
+ * @brief Non-Blocking mailbox read access.
Return 1 on success, 0 on fail + */ +int mailbox_try_read(uint32_t *buffer); +/** + * @brief Blocking mailbox write access + */ +int mailbox_write(uint32_t word); diff --git a/target/sim/sw/device/runtime/Makefile b/target/sim/sw/device/runtime/Makefile index fab277dfe..df59d075d 100644 --- a/target/sim/sw/device/runtime/Makefile +++ b/target/sim/sw/device/runtime/Makefile @@ -19,7 +19,9 @@ SNRT_DIR = $(SNITCH_ROOT)/sw/snRuntime # Dependencies INCDIRS += $(SNRT_DIR)/src +INCDIRS += $(SNRT_DIR)/src/omp INCDIRS += $(SNRT_DIR)/api +INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes INCDIRS += $(SW_DIR)/shared/platform INCDIRS += $(SW_DIR)/shared/platform/generated diff --git a/target/sim/sw/device/runtime/src/occamy_start.c b/target/sim/sw/device/runtime/src/occamy_start.c index 0dec4735b..acc7b405b 100644 --- a/target/sim/sw/device/runtime/src/occamy_start.c +++ b/target/sim/sw/device/runtime/src/occamy_start.c @@ -12,6 +12,9 @@ #define SNRT_CRT0_POST_BARRIER #define SNRT_CRT0_CALLBACK7 +static inline void snrt_exit(int exit_code) { +} + static inline void snrt_crt0_callback3() { _snrt_cluster_hw_barrier = cluster_hw_barrier_addr(snrt_cluster_idx()); } diff --git a/target/sim/sw/device/runtime/src/putchar.c b/target/sim/sw/device/runtime/src/putchar.c index 84f345d0e..3ef169e56 100644 --- a/target/sim/sw/device/runtime/src/putchar.c +++ b/target/sim/sw/device/runtime/src/putchar.c @@ -3,4 +3,4 @@ // SPDX-License-Identifier: Apache-2.0 // Provide an implementation for putchar. 
-void _putchar(char character) {}
+void __attribute__((weak)) _putchar(char character) {}
diff --git a/target/sim/sw/device/runtime/src/snrt.c b/target/sim/sw/device/runtime/src/snrt.c
index 12003018e..cc02c074d 100644
--- a/target/sim/sw/device/runtime/src/snrt.c
+++ b/target/sim/sw/device/runtime/src/snrt.c
@@ -7,7 +7,11 @@
 #include "alloc.c"
 #include "cls.c"
 #include "cluster_interrupts.c"
+#include "dm.c"
 #include "dma.c"
+#include "eu.c"
+#include "kmp.c"
+#include "omp.c"
 #include "global_interrupts.c"
 #include "occamy_device.c"
 #include "occamy_memory.c"
diff --git a/target/sim/sw/device/runtime/src/snrt.h b/target/sim/sw/device/runtime/src/snrt.h
index 57686fe94..0dd5f7f59 100644
--- a/target/sim/sw/device/runtime/src/snrt.h
+++ b/target/sim/sw/device/runtime/src/snrt.h
@@ -18,19 +18,26 @@
 #include "cls_decls.h"
 #include "cluster_interrupt_decls.h"
 #include "global_interrupt_decls.h"
+#include "riscv_decls.h"
 #include "memory_decls.h"
 #include "sync_decls.h"
 #include "team_decls.h"
+#include "start_decls.h"
 
 // Implementation
 #include "alloc.h"
 #include "cls.h"
 #include "cluster_interrupts.h"
+#include "dm.h"
 #include "dma.h"
 #include "dump.h"
 #include "global_interrupts.h"
 #include "occamy_device.h"
 #include "occamy_memory.h"
+#include "eu.h"
+#include "kmp.h"
+#include "omp.h"
+#include "perf_cnt.h"
 #include "printf.h"
 #include "riscv.h"
 #include "ssr.h"
diff --git a/target/sim/sw/device/toolchain.mk b/target/sim/sw/device/toolchain.mk
index 9e83aad99..7fa4cc69a 100644
--- a/target/sim/sw/device/toolchain.mk
+++ b/target/sim/sw/device/toolchain.mk
@@ -6,4 +6,18 @@
 
 BENDER ?= bender
 SNITCH_ROOT = $(shell $(BENDER) path snitch_cluster)
-include $(SNITCH_ROOT)/target/snitch_cluster/sw/toolchain.mk
\ No newline at end of file
+# Root of the HERO toolchain installation. Normally provided by the environment
+# or on the make command line; warn early when it is missing, since the paths
+# derived from it below would otherwise silently resolve relative to "/".
+HERO_INSTALL ?=
+ifeq ($(strip $(HERO_INSTALL)),)
+$(warning HERO_INSTALL is not set: the HERO sysroot and clang runtime library paths will be invalid)
+endif
+# Version component of the clang resource directory inside HERO_INSTALL.
+HERO_CLANG_VERSION ?= 15.0.0
+
+RISCV_CFLAGS += --sysroot=$(HERO_INSTALL)/rv32imafd-ilp32d/riscv32-unknown-elf -target riscv32-unknown-elf
+RISCV_LDFLAGS += -L$(HERO_INSTALL)/lib/clang/$(HERO_CLANG_VERSION)/rv32imafdvzfh-ilp32d/lib/
+# NOTE(review): these flags are appended before the include below; confirm the
+# included toolchain.mk extends RISCV_CFLAGS/RISCV_LDFLAGS instead of resetting them.
+include $(SNITCH_ROOT)/target/snitch_cluster/sw/toolchain.mk