diff --git a/target/sim/sw/device/Makefile b/target/sim/sw/device/Makefile index 4aba34748..89f4f5a30 100644 --- a/target/sim/sw/device/Makefile +++ b/target/sim/sw/device/Makefile @@ -5,7 +5,7 @@ # Luca Colagrande # Add user applications to APPS variable -APPS = offload +APPS = offload sndnn/gemm axpy TARGET ?= all @@ -22,5 +22,8 @@ runtime: $(MAKE) -C $@ $(TARGET) # Explicit dependency of apps on runtime -$(APP_SUBDIRS): runtime +$(APP_SUBDIRS): libraries/snDNN runtime + $(MAKE) -C $@ $(TARGET) + +libraries/snDNN: runtime $(MAKE) -C $@ $(TARGET) diff --git a/target/sim/sw/device/apps/Makefile b/target/sim/sw/device/apps/Makefile new file mode 100644 index 000000000..115d1ccbd --- /dev/null +++ b/target/sim/sw/device/apps/Makefile @@ -0,0 +1,28 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +SUBDIRS = offload +# SUBDIRS += gemm +SUBDIRS += axpy +# SUBDIRS += nop +# SUBDIRS += blas/axpy +# SUBDIRS += blas/gemm +# SUBDIRS += sndnn/batchnorm +# # SUBDIRS += sndnn/conv2d # fails with exit code 32 +# SUBDIRS += sndnn/fusedconv +# SUBDIRS += sndnn/gelu +SUBDIRS += sndnn/gemm +# # SUBDIRS += sndnn/layernorm # throws illegal instruction in simulation +# SUBDIRS += sndnn/linear +# SUBDIRS += sndnn/maxpool +# # SUBDIRS += sndnn/softmax + +.PHONY: all clean $(SUBDIRS) + +all: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(TARGET) diff --git a/target/sim/sw/device/apps/common.mk b/target/sim/sw/device/apps/common.mk index 4c4e5ab00..f379ea058 100644 --- a/target/sim/sw/device/apps/common.mk +++ b/target/sim/sw/device/apps/common.mk @@ -4,18 +4,48 @@ # # Luca Colagrande -include ../../toolchain.mk +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +include $(MK_DIR)/../toolchain.mk ################### # Build variables # ################### +# Fixed paths in repository tree +ROOT = $(abspath $(MK_DIR)/../../../../..) 
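+# ROOT resolves to the repository root (five levels above this apps/ directory),
+# so the paths below remain valid wherever this Makefile is included from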
 # Directories
-BUILDDIR = $(abspath build)
-APPSDIR = $(abspath ../)
-RUNTIME_DIR = $(abspath ../../runtime)
+APPSDIR = $(abspath $(MK_DIR)/)
+# RUNTIME_DIR = $(abspath ../../runtime)
+RUNTIME_DIR = $(abspath $(MK_DIR)/../runtime)
 SNRT_DIR = $(shell bender path snitch_cluster)/sw/snRuntime
-SW_DIR = $(abspath ../../../)
+# SW_DIR = $(abspath ../../../)
+SW_DIR = $(abspath $(MK_DIR)/../../)
+
+#################
+# SNDNN_LIBRARY #
+#################
+ifdef USE_SNDNN_LIBRARY
+SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN
+SNDNN_LIB_DIR := $(abspath $(MK_DIR)/../libraries/snDNN)
+SNDNN_LIB_NAME = snDNN
+
+# Dependencies
+INCDIRS += $(SNDNN_LIB_DIR)/src
+INCDIRS += $(SNDNN_DIR)/src
+INCDIRS += $(SNDNN_DIR)/include
+# Linker script
+# RISCV_LDFLAGS += -L$(abspath $(SNDNN_LIB_DIR))
+# Link snDNN library
+RISCV_LDFLAGS += -L$(abspath $(SNDNN_LIB_DIR)/build/)
+RISCV_LDFLAGS += -l$(SNDNN_LIB_NAME)
+BUILDDIR = $(abspath $(MK_DIR)/sndnn/$(APP)/build)
+SNDNN_LIB = $(realpath $(SNDNN_LIB_DIR)/build/lib$(SNDNN_LIB_NAME).a)
+LD_SRCS += $(SNDNN_LIB)
+else
+BUILDDIR = $(abspath $(MK_DIR)/$(APP)/build)
+endif
 
 # Dependencies
 INCDIRS += $(RUNTIME_DIR)/src
@@ -83,7 +113,7 @@ $(DEP): $(SRCS) | $(BUILDDIR)
 	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< > $@
 
 $(ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR)
-	$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@
+	$(RISCV_CC) $(RISCV_CFLAGS) $(SRCS) $(RISCV_LDFLAGS) -o $@
 
 $(BIN): $(ELF) | $(BUILDDIR)
 	$(RISCV_OBJCOPY) $(OBJCOPY_FLAGS) $< $@
diff --git a/target/sim/sw/device/apps/sndnn/Makefile b/target/sim/sw/device/apps/sndnn/Makefile
new file mode 100644
index 000000000..337cf8841
--- /dev/null
+++ b/target/sim/sw/device/apps/sndnn/Makefile
@@ -0,0 +1,34 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Gianna Paulin
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR := $(realpath $(MK_DIR)/$(APP_NAME)/src)
+SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN
+SNDNN_LIB_DIR := $(realpath $(MK_DIR)/../../libraries/snDNN/)
+# SNDNN_SRC_DIR := $(shell bender path snitch_cluster)/sw/snDNN/src
+
+INCLUDE_DIR = $(realpath $(SNDNN_DIR)/include)
+INCLUDE_DIR += $(realpath $(SNDNN_DIR)/src)
+INCLUDE_DIR += $(realpath $(SNDNN_LIB_DIR)/src)
+
+DATA_CFG ?= $(DATA_DIR)/$(APP_NAME)_params.hjson
+
+APP ?= $(APP_NAME)
+SRCS += $(realpath $(SRC_DIR)/net_$(APP_NAME).c)
+# SRCS += $(realpath $(SNDNN_LIB_DIR)/src/sndnn.c)
+INCDIRS += $(DATA_DIR) $(SRC_DIR) $(INCLUDE_DIR)
+
+$(DATA_DIR)/data_$(APP_NAME).h: $(MK_DIR)/datagen.py $(DATA_CFG)
+	$< -c $(DATA_CFG)
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_DIR)/data_$(APP_NAME).h
+
+clean: clean-data
diff --git a/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson b/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson
new file mode 100644
index 000000000..e3b54c274
--- /dev/null
+++ b/target/sim/sw/device/apps/sndnn/data/gemm_params.hjson
@@ -0,0 +1,17 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMM + +{ + kernel: "GEMM" + M: 16, + N: 16, + K: 16, + alpha: 0, + transpose_A: false, + transpose_B: true, + prec: 32, + expand: 0 +} diff --git a/target/sim/sw/device/apps/sndnn/datagen.py b/target/sim/sw/device/apps/sndnn/datagen.py new file mode 100755 index 000000000..65fc63f31 --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/datagen.py @@ -0,0 +1,874 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Tim Fischer +# Author: Viviane Potocnik + +import numpy as np +import torch +import torch.nn as nn +import argparse +import pathlib +import hjson + + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = '{' + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += '{}, '.format(el) + else: + for sign, exp, mant in zip(a['sign'].numpy().flat, + a['exponent'].numpy().flat, + a['mantissa'].numpy().flat): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x}, ".format(value) + out = out[:-2] + '}' + return out + + +def emit_header_file(layer_type: str, **kwargs): + + file_path = pathlib.Path(__file__).parent / 'data' + emit_str = "// Copyright 2022 ETH Zurich and University of Bologna.\n" + \ + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + \ + "// SPDX-License-Identifier: Apache-2.0\n\n" + + if layer_type == 'Conv2d': + file = file_path / 'data_conv2d.h' + emit_str += emit_conv2d_layer(**kwargs) + elif layer_type == 'GEMM': + file = file_path / 'data_gemm.h' + emit_str += emit_GEMM_layer(**kwargs) + elif layer_type == 'BatchNorm': + file = file_path / 'data_batchnorm.h' + emit_str += emit_batchnorm_layer(**kwargs) + elif layer_type == 'MaxPool': + file = file_path / 'data_maxpool.h' + emit_str += emit_maxpool_layer(**kwargs) + elif layer_type == 'FusedConv': + file = file_path / 'data_fusedconv.h' + emit_str += emit_fusedconv(**kwargs) + elif layer_type == 'Linear': + file = file_path / 'data_linear.h' + emit_str += emit_linear_layer(**kwargs) + elif layer_type == 'GELU': + file = file_path / 'data_gelu.h' + emit_str += emit_gelu_layer(**kwargs) + elif layer_type == 'SoftMax': + file = file_path / 'data_softmax.h' + emit_str += emit_softmax_layer(**kwargs) + elif layer_type == 'LayerNorm': + file = file_path / 'data_layernorm.h' + emit_str += emit_layernorm_layer(**kwargs) + + with file.open('w') as f: + f.write(emit_str) + + +def emit_layernorm_layer(name='layernorm', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + + batch_size, seq_len, embeddings = ifmap.shape + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + checksum = torch.sum(ifmap, dim=-1) + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'layernorm_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.EMBEDDINGS = {embeddings},\n' # embeddings + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{embeddings}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} 
{name}_ifmap_dram[{batch_size}][{seq_len}][{embeddings}] = ' \ + + array_to_cstr(ifmap) + ';\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{embeddings}] = ' \ + + array_to_cstr(ofmap) + ';\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(checksum) + ';\n\n' + + return layer_str + + +def emit_softmax_layer(name='softmax', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + reduce_dim = kwargs['reduce_dim'] + + batch_size, seq_len, input_samples = ifmap.shape + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'softmax_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.INPUT_SAMPLES = {input_samples},\n' # input_samples + layer_str += f'\t.REDUCE_DIM = {reduce_dim},\n' # reduce_dim + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + checksum = torch.sum(ofmap, dim=-1) + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{input_samples}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram[{batch_size}][{seq_len}][{input_samples}] = ' \ + + array_to_cstr(ifmap) + ';\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{input_samples}] = ' \ + + array_to_cstr(ofmap) + ';\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(checksum) + ';\n\n' + + return layer_str + + +def emit_gelu_layer(name='gelu', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + + batch_size, seq_len, hidden_nodes = ifmap.shape + # print("batch_size: {}".format(batch_size)) + # print("seq_len: {},".format(seq_len)) + # print("hidden_nodes: {}".format(hidden_nodes)) + # for i in range(batch_size): + # for j in range(seq_len): + # for k in range(hidden_nodes): + # print("ifmap[{}][{}][{}] = {}".format(i, j, k, ifmap[i][j][k])) + # print("ofmap[{}][{}][{}] = {}".format(i, j, k, ofmap[i][j][k])) + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'gelu_layer_t {name}_l = {{\n' + layer_str += f'\t.BATCH_SIZE = {batch_size},\n' # batch_size + layer_str += f'\t.SEQ_LEN = {seq_len},\n' # seq_len + layer_str += f'\t.HIDDEN_NODES = {hidden_nodes},\n' # hidden_size + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{batch_size}][{seq_len}]' + layer_str += f'[{hidden_nodes}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram[{batch_size}][{seq_len}][{hidden_nodes}] = ' \ + + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram[{batch_size}][{seq_len}][{hidden_nodes}] = ' \ + + array_to_cstr(ofmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_checksum[{batch_size}][{seq_len}] = ' \ + + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + + return layer_str + + +def emit_linear_layer(name='linear', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + weights = kwargs['weights'] + bias = kwargs['bias'] + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } 
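+    # (ctypes maps the precision in bits to the C scalar type emitted in the
+    # generated header)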
+ + dtype = ctypes[str(kwargs['prec'])] + + ch, ci = ifmap.shape + _, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'linear_layer_t {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' # out_features + layer_str += f'\t.CI = {ci},\n' # in_features + layer_str += f'\t.CH = {ch},\n' # height + layer_str += f'\t.CW = {ci}\n' # width + layer_str += '};\n\n\n' + + layer_str += f'static {dtype} {name}_result[{co*ch}] __attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_checksum' + \ + f'[{co*ch}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ifmap_dram' + \ + f'[{ch}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static {dtype} {name}_weights_dram' + \ + f'[{co}][{ci}] = ' + array_to_cstr(weights) + ';\n\n\n' + layer_str += f'static {dtype} {name}_bias_dram[{co}] = ' + array_to_cstr(bias) + ';\n\n\n' + layer_str += f'static {dtype} {name}_ofmap_dram' + \ + f'[{ch}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_conv2d_layer(name='conv2d', **kwargs): + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + weights = kwargs['weights'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + _, fh, fw, _ = weights.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += f'\t.FH = {fh},\n' + layer_str += f'\t.FW = {fw}\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_weights_dram' + \ + f'[{co}][{ci}][{fh}][{fw}] = ' + array_to_cstr(weights) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_GEMM_layer(name='gemm', **kwargs): + mat_A = kwargs['A'] + mat_B = kwargs['B'] + mat_C = kwargs['C'] + result = kwargs['result'] + + m = kwargs['M'] + n = kwargs['N'] + k = kwargs['K'] + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'gemm_layer {name}_l = {{\n' + layer_str += f'\t.M = {m},\n' + layer_str += f'\t.N = {n},\n' + layer_str += f'\t.K = {k},\n' + layer_str += f'\t.TA = {int(kwargs["ta"])},\n' + layer_str += f'\t.TB = {int(kwargs["tb"])},\n' + layer_str += f'\t.ALPHA = {kwargs["alpha"]},\n' + layer_str += f'\t.dtype = FP{kwargs["prec"]},\n' + layer_str += f'\t.expand = {kwargs["expand"]}\n' + layer_str += '};\n\n\n' + + ctypes = { + '64': 'double', + '32': 'float', + '16': '__fp16', + '8': 'char' + } + + dtype = ctypes[str(kwargs['prec'])] + if dtype != 'char': + layer_str += f'static {dtype} {name}_A_dram' + \ + f'[{m}][{k}] = ' + array_to_cstr(mat_A) + ';\n\n\n' + layer_str += f'static {dtype} {name}_B_dram' + \ + f'[{k}][{n}] = ' + array_to_cstr(mat_B) + ';\n\n\n' + layer_str += f'static {dtype} {name}_C_dram' + \ + f'[{m}][{n}] = ' + array_to_cstr(mat_C) + ';\n\n\n' + layer_str += f'static {dtype} {name}_result' + \ + f'[{m}][{n}] 
__attribute__((section(".data")));\n\n' + layer_str += f'static {dtype} {name}_checksum' + \ + f'[{m}] = ' + array_to_cstr(torch.sum(result, dim=-1)) + ';\n\n\n' + else: + layer_str += f'static {dtype} {name}_A_dram [{m}][{k}] = ' + \ + array_to_cstr(kwargs['bits_A'], fmt='char') + ';\n\n\n' + layer_str += f'static {dtype} {name}_B_dram [{k}][{n}] = ' + \ + array_to_cstr(kwargs['bits_B'], fmt='char') + ';\n\n\n' + layer_str += f'static {dtype} {name}_C_dram [{m}][{n}] = ' + \ + array_to_cstr(kwargs['bits_C'], fmt='char') + ';\n\n\n' + + return layer_str + + +def emit_batchnorm_layer(name='batchnorm', **kwargs): + + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + beta = kwargs['beta'] + gamma = kwargs['gamma'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_beta_dram' + \ + f'[{ci}] = ' + array_to_cstr(beta) + ';\n\n\n' + layer_str += f'static double {name}_gamma_dram' + \ + f'[{ci}] = ' + array_to_cstr(gamma) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_maxpool_layer(name='maxpool', **kwargs): + + ifmap = kwargs['ifmap'] + ofmap = kwargs['ofmap'] + k = kwargs['kernel_size'] + + n, ih, iw, ci = ifmap.shape + _, oh, ow, co = ofmap.shape + + layer_str = '' + layer_str += '#include "layer.h"\n\n' + layer_str += f'conv_layer {name}_l = {{\n' + layer_str += f'\t.CO = {co},\n' + layer_str += f'\t.CI = {ci},\n' + layer_str += f'\t.IH = {ih},\n' + layer_str += f'\t.IW = {iw},\n' + layer_str += f'\t.OH = {oh},\n' + layer_str += f'\t.OW = {ow},\n' + layer_str += f'\t.FH = {k},\n' + layer_str += f'\t.FW = {k},\n' + layer_str += '};\n\n\n' + + layer_str += f'static double {name}_result' + \ + f'[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' + layer_str += f'static double {name}_checksum' + \ + f'[{oh}][{ow}] = ' + array_to_cstr(torch.sum(ofmap, dim=-1)) + ';\n\n\n' + layer_str += f'static double {name}_ifmap_dram' + \ + f'[{ih}][{iw}][{ci}] = ' + array_to_cstr(ifmap) + ';\n\n\n' + layer_str += f'static double {name}_ofmap_dram' + \ + f'[{oh}][{ow}][{co}] = ' + array_to_cstr(ofmap) + ';\n\n\n' + + return layer_str + + +def emit_fusedconv(name='fusedconv', **kwargs): + + ifmap = kwargs['ifmap'] + kernel = kwargs['kernel'] + bn_k = kwargs['bn_k'] + bn_l = kwargs['bn_l'] + ofmap = kwargs['ofmap'] + ofmap_before = kwargs['ofmap_before'] + ifmap_padded = kwargs['ifmap_padded'] + + padding = kwargs['padding'] + + if kwargs['depthwise']: + ih, iw, ci = ifmap.shape + oh, ow, co = ofmap.shape + fh, fw, co = kernel.shape + ci = co + ih_pad, iw_pad, _ = ifmap_padded.shape + elif kwargs['chw_layer']: + ci, ih, iw = ifmap.shape + oh, ow, co = ofmap.shape + co, ci, fh, fw = kernel.shape + _, ih_pad, iw_pad = ifmap_padded.shape + else: + ih, iw, ci = 
ifmap.shape
+        oh, ow, co = ofmap.shape
+        _, fh, fw, _ = kernel.shape
+        ih_pad, iw_pad, _ = ifmap_padded.shape
+
+    ctypes = {
+        '64': 'double',
+        '32': 'float',
+        '16': '__fp16',
+        '8': 'char'
+    }
+
+    dtype = ctypes[str(kwargs['prec'])]
+
+    layer_str = '#include <stdint.h>\n'
+    layer_str += '#include "conv2d.h"\n\n'
+    layer_str += 'kernel_fp32 k = {\n'
+    layer_str += f'\t.ch_in = {ci},\n'
+    layer_str += f'\t.ch_out = {co},\n'
+    layer_str += f'\t.dim_in_x = {iw},\n'
+    layer_str += f'\t.dim_in_y = {ih},\n'
+    layer_str += f'\t.dim_kernel_x = {fw},\n'
+    layer_str += f'\t.dim_kernel_y = {fh},\n'
+    layer_str += f'\t.dim_out_x = {ow},\n'
+    layer_str += f'\t.dim_out_y = {oh},\n'
+    layer_str += f'\t.padding_y_top = {padding["padding_y_top"]},\n'
+    layer_str += f'\t.padding_y_bottom = {padding["padding_y_bottom"]},\n'
+    layer_str += f'\t.padding_x_left = {padding["padding_x_left"]},\n'
+    layer_str += f'\t.padding_x_right = {padding["padding_x_right"]},\n'
+    layer_str += f'\t.stride_x = {kwargs["stride"]["stride_x"]},\n'
+    layer_str += f'\t.stride_y = {kwargs["stride"]["stride_y"]},\n'
+    layer_str += f'\t.flag_relu = {kwargs["flags"]["flag_relu"]},\n'
+    layer_str += f'\t.flag_batch_norm = {kwargs["flags"]["flag_batch_norm"]},\n'
+    layer_str += f'\t.flag_y_accumulate_start = {kwargs["flags"]["flag_y_accumulate_start"]},\n'
+    layer_str += f'\t.flag_y_accumulate_end = {kwargs["flags"]["flag_y_accumulate_end"]},\n'
+    layer_str += '};\n\n'
+    layer_str += f'uint32_t dw = {kwargs["depthwise"]};\n'
+    layer_str += f'uint32_t chw_layer = {kwargs["chw_layer"]};\n'
+
+    layer_str += f'static {dtype} {name}_pInBuffer_dram' + \
+        f'[{ih_pad}][{iw_pad}][{ci}] = ' + array_to_cstr(ifmap_padded) + ';\n\n'
+    layer_str += f'static {dtype} {name}_pWeight_dram' + \
+        f'[{co}][{fh}][{fw}][{ci}] = {array_to_cstr(kernel)};\n\n'
+    layer_str += f'static {dtype} {name}_lambda_dram' + \
+        f'[{ci}] = {array_to_cstr(bn_l)};\n\n'
+    layer_str += f'static {dtype} {name}_kappa_dram' + \
+        f'[{ci}] = {array_to_cstr(bn_k)};\n\n'
+    layer_str += f'static {dtype} {name}_pOutBuffer_dram' + \
+        f'[{oh}][{ow}][{co}] = {array_to_cstr(ofmap_before)};\n\n'
+    layer_str += f'static {dtype} {name}_pCheckOutBuffer_dram' + \
+        f'[{oh}][{ow}][{co}] = {array_to_cstr(ofmap)};\n\n'
+
+    return layer_str
+
+
+def rand_data_generator(shape, prec, alt=False):
+    if prec == 64:
+        return torch.randn(shape, requires_grad=False, dtype=torch.float64), {}
+    elif prec == 32:
+        return torch.randn(shape, requires_grad=False, dtype=torch.float32), {}
+    elif prec == 16:
+        if alt:
+            return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {}
+        else:
+            return torch.randn(shape, requires_grad=False, dtype=torch.float16), {}
+    elif prec == 8:
+        sign = torch.randint(0, 2, shape,
+                             requires_grad=False, dtype=torch.uint8)  # -1 or 1
+        exponent = torch.randint(0, 16, shape,
+                                 requires_grad=False, dtype=torch.uint8)  # < 0b01111
+        mantissa = torch.randint(0, 4, shape,
+                                 requires_grad=False, dtype=torch.uint8)  # can be arbitrary
+        bits = {'sign': sign, 'exponent': exponent, 'mantissa': mantissa}
+        # TODO: not actually correct
+        sign_val = (-1.0)**sign.double()
+        exp_val = (2.0**(exponent.double()-15.0))
+        man_val = (1.0 + mantissa.double() / (2**2))
+        val = sign_val*exp_val*man_val
+        return val, bits
+
+
+def conv2d(ifmap, weights, padding=1, stride=1):
+    n, ci, ih, iw = ifmap.shape
+    co, _, fh, fw = weights.shape
+
+    conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh-1)//2, (fw-1)//2))
+    conv2d.weight = nn.Parameter(weights, requires_grad=False)
+    conv2d.bias = nn.Parameter(
torch.zeros_like(conv2d.bias, dtype=weights.dtype), + requires_grad=False) + ofmap = conv2d(ifmap) + + return ofmap + + +def max_pooling(ifmap, kernel): + n, ci, ih, iw = ifmap.shape + max_pool = nn.MaxPool2d(kernel_size=kernel) + ofmap = max_pool(ifmap) + + return ofmap + + +def batchnorm(ifmap): + n, ci, ih, iw = ifmap.shape + bn = torch.nn.BatchNorm2d(ci) + bn.weight.requires_grad = False + bn.bias.requires_grad = False + running_mean = torch.randn_like(bn.running_mean, requires_grad=False) + running_var = torch.rand_like(bn.running_var, requires_grad=False) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) + beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) + ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) + + return ofmap, gamma, beta + + +def fused_conv(ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise): + + ih, iw, ci = ifmap.shape + if not depthwise: + co, fh, fw, _ = weights.shape + else: + fh, fw, co = weights.shape + ci = co + + ifmap_padded = torch.zeros(ih + padding['padding_y_top'] + padding['padding_y_bottom'], iw + + padding['padding_x_left'] + padding['padding_x_right'], + ci, + requires_grad=False, dtype=ifmap.dtype) + ifmap_padded[padding['padding_y_top']:ih+padding['padding_y_top'], + padding['padding_x_left']:iw+padding['padding_x_left']] = ifmap + + # Don't cover undefined behaviour when there are steps without a complete kernel window + if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride['stride_y'] != 0: + print("Warning: rounding h output dimension") + if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride['stride_x'] != 0: + print("Warning: rounding w output dimension") + + ofmap = torch.zeros((ifmap_padded.shape[0] - (fh - 1) - 1) // stride['stride_y'] + 1, + (ifmap_padded.shape[1] - (fw - 1) - 1) // stride['stride_x'] + 1, co) + if accumulate: + ofmap_before = torch.randn_like(ofmap, requires_grad=False) + else: + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + + if verbose: + print(ifmap.shape, ifmap_padded.shape, ofmap.shape) + + if (depthwise): + # depthwise Conv2d + for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride['stride_y']): + for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride['stride_x']): + for c in range(co): + ofmap[h//stride['stride_y'], w//stride['stride_x'], + c] = torch.dot( + ifmap_padded[h:h+fh, w:w+fw, c].flatten(), + weights[:, :, c].flatten()) + else: + # Conv2d + for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride['stride_y']): + for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride['stride_x']): + for c in range(co): + ofmap[h//stride['stride_y'], w//stride['stride_x'], + c] = torch.dot( + ifmap_padded[h:h+fh, w:w+fw].flatten(), + weights[c].flatten()) + + ofmap += ofmap_before + + # BatchNorm + if bn: + ofmap = ofmap * bn_k + bn_l + + # ReLU + if relu: + ofmap = torch.nn.functional.relu(ofmap) + + return ofmap, ofmap_before, ifmap_padded + + +def linear(ifmap, weights, bias): + + ifmap = ifmap.flatten(1) + ofmap = torch.matmul(ifmap, weights.T) + bias + + return ofmap + + +def gelu(ifmap): + gelu = torch.nn.GELU() + ofmap = gelu(ifmap) + + return ofmap + + +def softmax(ifmap, axis): + softmax = torch.nn.Softmax(dim=axis) + ofmap = softmax(ifmap) + + # print the global max of the input + # print("max of input: ", torch.max(ifmap)) + + return ofmap + + +def layernorm(ifmap, eps, shape): + ln = torch.nn.LayerNorm(shape, eps=eps) + ofmap = ln(ifmap) + + return ofmap + + +def main(): + + parser = 
argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + if param['prec'] == 64: + dtype = torch.float64 + elif param['prec'] == 16: + dtype = torch.float16 + elif param['prec'] == 8: + dtype = None + else: + dtype = torch.float32 + + if param['kernel'] == 'Conv2d': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + weights = torch.randn(param['channels']['out'], + param['channels']['in'], + param['filter']['height'], + param['filter']['width'], requires_grad=False, dtype=dtype) + + ofmap = conv2d(ifmap, weights, + padding=param['filter']['padding'], + stride=param['filter']['stride']) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + weights = weights.permute(0, 2, 3, 1) + kwargs = {'ifmap': ifmap, 'weights': weights, 'ofmap': ofmap} + emit_header_file('Conv2d', **kwargs) + + elif param['kernel'] == 'GEMM': + mat_A, bits_A = rand_data_generator((param['M'], param['K']), param['prec']) + mat_B, bits_B = rand_data_generator((param['K'], param['N']), param['prec']) + mat_C, bits_C = rand_data_generator((param['M'], param['N']), param['prec']) + + result = param['alpha'] * mat_C + torch.matmul(mat_A, mat_B) + + if param['transpose_A']: + mat_A = mat_A.T + if param['transpose_B']: + mat_B = mat_B.T + + kwargs = { + 'A': mat_A, + 'B': mat_B, + 'C': mat_C, + 'result': result, + 'M': param['M'], + 'N': param['N'], + 'K': param['K'], + 'ta': param['transpose_A'], + 'tb': param['transpose_B'], + 'alpha': param['alpha'], + 'prec': param['prec'], + 'expand': param['expand'], + 'bits_A': bits_A, + 'bits_B': bits_B, + 'bits_C': bits_C + } + + emit_header_file('GEMM', **kwargs) + + elif param['kernel'] == 'BatchNorm': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + + ofmap, gamma, beta = batchnorm(ifmap) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + + kwargs = {'ifmap': ifmap, 'beta': beta, 'gamma': gamma, 'ofmap': ofmap} + emit_header_file('BatchNorm', **kwargs) + + elif param['kernel'] == 'MaxPool': + ifmap = torch.randn(1, param['channels']['in'], + param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + + ofmap = max_pooling(ifmap, param['kernel_size']) + + # convert from CHW to HWC format + ifmap = ifmap.permute(0, 2, 3, 1) + ofmap = ofmap.permute(0, 2, 3, 1) + + kwargs = {'ifmap': ifmap, 'ofmap': ofmap, 'kernel_size': param['kernel_size']} + emit_header_file('MaxPool', **kwargs) + + elif param['kernel'] == 'FusedConv': + ifmap = torch.randn(param['dim_in_y'], + param['dim_in_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + if not param['depthwise']: + kernel = torch.randn(param['ch_out'], param['dim_kernel_y'], param['dim_kernel_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + else: + kernel = torch.randn(param['dim_kernel_y'], param['dim_kernel_x'], + param['ch_in'], requires_grad=False, dtype=dtype) + + bn_k = torch.randn(param['ch_out'], 
requires_grad=False) + bn_l = torch.randn(param['ch_out'], requires_grad=False) + + flag_y_accumulate_start = param['flags']['flag_y_accumulate_start'] + ofmap, ofmap_before, ifmap_padded = fused_conv(ifmap, + kernel, + bn_k, + bn_l, + param['padding'], + param['stride'], + param['flags']['flag_batch_norm'], + param['flags']['flag_relu'], + not flag_y_accumulate_start, + param['depthwise']) + + if param['chw_layer']: + ifmap = ifmap.permute(2, 0, 1) + ifmap_padded = ifmap_padded.permute(2, 0, 1) + kernel = kernel.permute(0, 3, 1, 2) + + kwargs = { + 'ifmap': ifmap, + 'ifmap_padded': ifmap_padded, + 'ofmap': ofmap, + 'ofmap_before': ofmap_before, + 'kernel': kernel, + 'bn_k': bn_k, + 'bn_l': bn_l, + 'padding': param['padding'], + 'stride': param['stride'], + 'prec': param['prec'], + 'flags': param['flags'], + 'depthwise': param['depthwise'], + 'chw_layer': param['chw_layer'] + } + emit_header_file('FusedConv', **kwargs) + + elif param['kernel'] == 'Linear': + # in_features = param['input_dim']['width'] + # out_features = param['channels']['out'] + ifmap = torch.randn(param['input_dim']['height'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + weights = torch.randn(param['channels']['out'], + param['input_dim']['width'], requires_grad=False, dtype=dtype) + bias = torch.randn(param['channels']['out'], requires_grad=False, dtype=dtype) + ofmap = linear(ifmap, weights, bias) + + kwargs = { + 'ifmap': ifmap, + 'weights': weights, + 'bias': bias, + 'ofmap': ofmap, + 'prec': param['prec'], + } + emit_header_file('Linear', **kwargs) + + elif param['kernel'] == 'GELU': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['hidden_nodes'], requires_grad=False, dtype=dtype) + ofmap = gelu(ifmap) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'prec': param['prec'], + } + + emit_header_file('GELU', **kwargs) + + elif param['kernel'] == 'SoftMax': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['input_samples'], requires_grad=False, dtype=dtype) + ofmap = softmax(ifmap, param['reduce_dim']) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'reduce_dim': param['reduce_dim'], + 'prec': param['prec'], + } + + emit_header_file('SoftMax', **kwargs) + + elif param['kernel'] == 'LayerNorm': + ifmap = torch.randn(param['input_dim']['batch_size'], param['input_dim']['seq_len'], + param['input_dim']['embeddings'], requires_grad=False, dtype=dtype) + + eps = param['eps'] + + ofmap = layernorm(ifmap, eps, param['input_dim']['embeddings']) + + ofmap = ofmap.detach().numpy() + + # print("LayerNorm output shape: ", ofmap.shape) + # print("LayerNorm output: ", ofmap) + + kwargs = { + 'ifmap': ifmap, + 'ofmap': ofmap, + 'prec': param['prec'], + } + + emit_header_file('LayerNorm', **kwargs) + + else: + print("No valid kernel selected") + + +if __name__ == '__main__': + main() diff --git a/target/sim/sw/device/apps/sndnn/gemm/Makefile b/target/sim/sw/device/apps/sndnn/gemm/Makefile new file mode 100644 index 000000000..ddd45cbaf --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/gemm/Makefile @@ -0,0 +1,18 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Gianna Paulin + +# DNN_DIR = $(abspath ../../../../../../../sw/snDNN) +DNN_DIR = $(abspath ..) +# SNDNN_DIR = $(abspath ..) +APPS_DIR = $(abspath ../..) 
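+# APP_NAME selects the net_<name>.c source and the data config in the parent
+# directory; USE_SNDNN_LIBRARY tells common.mk to link the prebuilt libsnDNN.a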
+ +APP_NAME = gemm +USE_SNDNN_LIBRARY = true + +include $(DNN_DIR)/Makefile +include $(APPS_DIR)/common.mk + +$(DEP): $(DATA_DIR)/data_$(APP_NAME).h diff --git a/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c b/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c new file mode 100644 index 000000000..8bc0876f9 --- /dev/null +++ b/target/sim/sw/device/apps/sndnn/gemm/src/net_gemm.c @@ -0,0 +1,250 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// SW testbench for profiling GEMM kernels in different +// floating point precisions (fp64, fp32, fp16), as well as +// different memory layouts for matrices (transposed/not-transposed) +// Correctness of results are checked automatically + +#include "data_gemm.h" +#include "sndnn.h" +// #include "gemm.h" +#include "layer.h" +#include "math.h" +// #include "perf_cnt.h" +// #include "printf.h" +#include "snrt.h" +// #include "utils.h" + +// Other variables +__thread volatile comm_buffer_t* comm_buffer; + +// Padding of innermost dimension of a Matrix +// Useful for preventing banking conflicts between cores +// that are accessing different rows of the matrix +#define MAT_ROW_PADDING 0 + +// Padding in between matrices A, B for preventing +// banking conflicts in the beginning +#define MAT_PADDING 0 + +#define CHECK_RESULT + +void *share_ptr; + +int main() { + +/** OFFLOAD SECTION START **/ + + // Initialize pointers + comm_buffer = (volatile comm_buffer_t*)get_communication_buffer(); + + // Notify CVA6 when snRuntime initialization is done + post_wakeup_cl(); + return_to_cva6(SYNC_ALL); + snrt_wfi(); + + // Reset state after wakeup + // mcycle(); + post_wakeup_cl(); + +/** OFFLOAD SECTION END **/ + + gemm_l.A = (void *)gemm_A_dram; + gemm_l.B = (void *)gemm_B_dram; + gemm_l.C = (void *)gemm_C_dram; + + const gemm_layer l1_gemm_l = gemm_l; + + const uint32_t cluster_num = snrt_cluster_num(); + const uint32_t cluster_id = snrt_cluster_idx(); + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_global_core_idx(); + + void *mat_A, *mat_B, *mat_C; + + uint32_t mat_A_size = + (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) * + l1_gemm_l.dtype; + uint32_t mat_B_size = + (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype; + uint32_t mat_C_size = l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype; + + uint32_t total_size = mat_A_size + mat_B_size + mat_C_size; + + void *ptr; + + if (compute_id == 0) { + ptr = snrt_l1alloc(total_size); + share_ptr = ptr; + } + + snrt_cluster_hw_barrier(); + + ptr = share_ptr; + + mat_A = ptr; + ptr += (l1_gemm_l.M * (l1_gemm_l.K + MAT_ROW_PADDING) + MAT_PADDING) * + l1_gemm_l.dtype; + mat_B = ptr; + ptr += (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.N * l1_gemm_l.dtype; + mat_C = ptr; + ptr += l1_gemm_l.M * l1_gemm_l.N * l1_gemm_l.dtype; + + uint32_t errors = 0; + + snrt_global_barrier(); + + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_A = + snrt_dma_start_2d(mat_A, l1_gemm_l.A, l1_gemm_l.dtype * l1_gemm_l.K, + l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING), + l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.M); + snrt_dma_txid_t txid_B = + snrt_dma_start_2d(mat_B, l1_gemm_l.B, l1_gemm_l.dtype * l1_gemm_l.K, + l1_gemm_l.dtype * (l1_gemm_l.K + MAT_ROW_PADDING), + l1_gemm_l.dtype * l1_gemm_l.K, l1_gemm_l.N); + + snrt_dma_txid_t txid_C = snrt_dma_start_1d( + mat_C, l1_gemm_l.C, l1_gemm_l.dtype * l1_gemm_l.M * l1_gemm_l.N); + + 
snrt_dma_wait_all(); + } + + // snrt_cluster_hw_barrier(); + snrt_global_barrier(); + + // if (snrt_is_compute_core() && + // snrt_cluster_compute_core_num() < compute_num) { + if (snrt_is_compute_core()) { + const uint32_t setup_SSR = 1; + + if (!l1_gemm_l.TA && !l1_gemm_l.TB) { + volatile uint32_t A_offset = + compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype; + volatile uint32_t C_offset = + compute_id * l1_gemm_l.N * l1_gemm_l.dtype; + volatile uint32_t ldA = + compute_num * (l1_gemm_l.K + MAT_ROW_PADDING); + volatile uint32_t ldB = l1_gemm_l.N + MAT_ROW_PADDING; + volatile uint32_t ldC = l1_gemm_l.N * compute_num; + + benchmark_get_cycle(); + gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, l1_gemm_l.K, + &mat_A[A_offset], ldA, l1_gemm_l.TA, mat_B, ldB, + l1_gemm_l.TB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + benchmark_get_cycle(); + } else if (!l1_gemm_l.TA && l1_gemm_l.TB) { + volatile uint32_t A_offset = + compute_id * (l1_gemm_l.K + MAT_ROW_PADDING) * l1_gemm_l.dtype; + volatile uint32_t C_offset = + compute_id * l1_gemm_l.N * l1_gemm_l.dtype; + volatile uint32_t ldA = + compute_num * (l1_gemm_l.K + MAT_ROW_PADDING); + volatile uint32_t ldB = l1_gemm_l.K + MAT_ROW_PADDING; + volatile uint32_t ldC = l1_gemm_l.N * compute_num; + + benchmark_get_cycle(); + switch (l1_gemm_l.dtype) { + case FP64: + gemm_fp64_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, + l1_gemm_l.TA, mat_B, ldB, l1_gemm_l.TB, + &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + break; + case FP32: + gemm_fp32_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, &l1_gemm_l.ALPHA, + setup_SSR); + break; + case FP16: + if (l1_gemm_l.expand) { + gemm_fp16_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, + mat_B, ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + } else { + gemm_fp16_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + } + break; + case FP8: + gemm_fp8_ex_opt(l1_gemm_l.M / compute_num, l1_gemm_l.N, + l1_gemm_l.K, &mat_A[A_offset], ldA, mat_B, + ldB, &mat_C[C_offset], ldC, + &l1_gemm_l.ALPHA, setup_SSR); + break; + } + benchmark_get_cycle(); + } else if (l1_gemm_l.TA) { + // printf("transpose TA not supported\n"); + } + snrt_cluster_hw_barrier(); + } else { + snrt_cluster_hw_barrier(); + } + // snrt_cluster_hw_barrier(); + snrt_global_barrier(); + +#ifdef CHECK_RESULT + + if (compute_id == 0) { + if (l1_gemm_l.dtype == FP64) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + double checksum = gemm_checksum[m]; + double sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((double *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.001) { + errors += l1_gemm_l.N; + } + } + } else if (l1_gemm_l.dtype == FP32) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + float checksum = gemm_checksum[m]; + float sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((float *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.001) { + errors += l1_gemm_l.N; + } + } + } else if (l1_gemm_l.dtype == FP16) { + for (uint32_t m = 0; m < l1_gemm_l.M; m++) { + __fp16 checksum = gemm_checksum[m]; + float sum = 0.0; + for (uint32_t n = 0; n < l1_gemm_l.N; n++) { + sum += ((__fp16 *)mat_C)[m * l1_gemm_l.N + n]; + } + if (fabs(sum - checksum) > 0.05) { + errors += l1_gemm_l.N; + } + } + } else 
if (l1_gemm_l.dtype == FP8) { + // printf("No golden model yet for fp8!\n"); + } + // printf("%d/%d Errors\n", errors, l1_gemm_l.M * l1_gemm_l.N); + } + +/** OFFLOAD SECTION START **/ + comm_buffer->usr_data_ptr = errors; +/** OFFLOAD SECTION END **/ + +#endif + +/** OFFLOAD SECTION START **/ + snrt_global_barrier(); + return_to_cva6(SYNC_ALL); +/** OFFLOAD SECTION END **/ + + // TODO: change back!!! + return 0; +} diff --git a/target/sim/sw/device/libraries/snDNN/Makefile b/target/sim/sw/device/libraries/snDNN/Makefile new file mode 100644 index 000000000..34b201583 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/Makefile @@ -0,0 +1,127 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# Gianna Paulin + +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +include $(MK_DIR)/../../toolchain.mk + +SNITCH_ROOT := $(realpath $(MK_DIR)/../../../../../../deps/snitch_cluster/) + +################### +# Build variables # +################### + +############# +# snRuntime # +############# +# Directories + +CUR_DIR := $(realpath $(MK_DIR)/.) +SNRT_DIR := $(realpath $(SNITCH_ROOT)/sw/snRuntime) +SNRT_DIR := $(shell bender path snitch_cluster)/sw/snRuntime +SNDNN_DIR := $(shell bender path snitch_cluster)/sw/snDNN + +# SNRT_DIR := $(realpath $(MK_DIR)/../../../../../sw/snRuntime) +# ifeq (SELECT_RUNTIME, banshee) +RUNTIME_DIR := $(realpath $(MK_DIR)/../../runtime) +# else +# RUNTIME_DIR := $(realpath $(MK_DIR)/../../runtime/rtl) +# endif + +# Dependencies +INCDIRS += $(RUNTIME_DIR)/src +# INCDIRS += $(RUNTIME_DIR)/../../shared +INCDIRS += $(RUNTIME_DIR)/../../shared/platform +INCDIRS += $(RUNTIME_DIR)/../../shared/platform/generated +INCDIRS += $(RUNTIME_DIR)/../../shared/runtime +INCDIRS += $(SNRT_DIR)/api +INCDIRS += $(SNRT_DIR)/api/omp +INCDIRS += $(SNRT_DIR)/src +INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes + +# Directories +BUILDDIR = $(abspath $(CUR_DIR)/build/) +SRC_DIR = $(abspath $(CUR_DIR)/src/) +SW_DIR = $(abspath $(CUR_DIR)/../../) +# SNRT_DIR = $(shell bender path snitch_cluster)/sw/snRuntime +# SNDNN_DIR = $(shell bender path snitch_cluster)/sw/snDNN + +# Dependencies +# INCDIRS += $(SNRT_DIR)/src +# INCDIRS += $(SNRT_DIR)/src/omp +# INCDIRS += $(SNRT_DIR)/api +# INCDIRS += $(SNRT_DIR)/api/omp +# INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes +# INCDIRS += $(SW_DIR)/shared/platform +# INCDIRS += $(SW_DIR)/shared/platform/generated +# INCDIRS += $(SW_DIR)/shared/runtime +# SRCS += $(SRC_DIR)/occamy_start.S +# SRCS += $(SRC_DIR)/snrt.c + +######### +# snDNN # +######### +# Directories +BUILDDIR = $(abspath build/) +# SNDNN_DIR = $(realpath $(SNITCH_ROOT)/sw/snDNN) +SRC_DIR = $(SNDNN_DIR)/src +SRC_DIR_LAYER = $(CUR_DIR)/src + +# Dependencies +INCDIRS += $(SNDNN_DIR)/src +INCDIRS += $(SNDNN_DIR)/include +INCDIRS += $(CUR_DIR)/src/ + +SRCS += $(CUR_DIR)/src/sndnn.c + +########### +# Outputs # +########### + +OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) +DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) +LIB = $(BUILDDIR)/libsnDNN.a +DUMP = $(BUILDDIR)/libsnDNN.dump +ALL_OUTPUTS = $(LIB) $(DUMP) + +######### +# Rules # +######### + +.PHONY: all +all: $(ALL_OUTPUTS) + +.PHONY: clean +clean: + rm -rf $(BUILDDIR) + +$(BUILDDIR): + mkdir 
-p $@ + +$(BUILDDIR)/%.o: $(SRC_DIR_LAYER)/%.S | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.o: $(SRC_DIR_LAYER)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.d: $(SRC_DIR_LAYER)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ + +########### +# Library # +########### +$(LIB): $(OBJS) | $(BUILDDIR) + $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ + +$(DUMP): $(LIB) | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + +ifneq ($(MAKECMDGOALS),clean) +-include $(DEPS) +endif diff --git a/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c b/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c new file mode 100644 index 000000000..64c075377 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/batchnorm_layer.c @@ -0,0 +1,138 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "batchnorm_layer.h" + +#include "batchnorm.h" +#include "layer.h" +// #include "printf.h" +#include "snrt.h" + +void batchnorm_layer(const conv_layer *l) { + const uint32_t cluster_num = snrt_cluster_num(); + const uint32_t cluster_id = snrt_cluster_idx(); + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_cluster_core_idx(); + + // Each cluster loads one tile of a row + uint32_t ifmap_size = 2 * l->IW * l->TILE_CI; + uint32_t weights_size = l->CI; + uint32_t ofmap_size = 2 * l->IW * l->TILE_CI; + + double *ptr = (double *)snrt_l1_start_addr(); + double *ifmap = ptr; + ptr += ifmap_size; + double *gamma = ptr; + ptr += weights_size; + double *beta = ptr; + ptr += weights_size; + double *ofmap = ptr; + ptr += ofmap_size; + + uint32_t read_buf = 0; + uint32_t write_buf = 0; + + uint32_t prev_oh; + uint32_t prev_ow; + uint32_t prev_ci; + + for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) { + for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { + if (snrt_is_dm_core()) { + // Load weights once in the beginning + if (oh == cluster_id && ci == 0) { + snrt_dma_start_1d(gamma, l->gamma, sizeof(double) * l->CI); + snrt_dma_start_1d(beta, l->beta, sizeof(double) * l->CI); + snrt_dma_wait_all(); + } + + // Load some stuff + if (l->TILE_CI == l->CI) { + // data layout is consecutively in memory + snrt_dma_start_1d(&ifmap[write_buf * ifmap_size / 2], + &l->ifmap[oh * l->IW * l->CI], + sizeof(double) * l->IW * l->TILE_CI); + } else { + // data is interleaved + snrt_dma_start_2d( + &ifmap[write_buf * ifmap_size / 2], /* dst */ + &l->ifmap[oh * l->IW * l->CI + ci], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->TILE_CI, /* dst_stride */ + sizeof(double) * l->CI, /* src_stride */ + l->IW); /* repetitions */ + } + + snrt_dma_wait_all(); + + snrt_cluster_hw_barrier(); + + if (!(oh == cluster_id && ci == 0)) { + if (l->TILE_CI == l->CI) { + // data is stored consecutively + snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + &ofmap[!read_buf * (ofmap_size / 2)], + sizeof(double) * l->IW * l->CI); + } else { + // data is stored in interleaved layout + snrt_dma_start_2d( + &l->ofmap[prev_oh * l->OW * l->CI + + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* repetitions */ + } + } + + snrt_dma_wait_all(); + write_buf = !write_buf; + read_buf = !read_buf; + prev_ci = 
ci; + prev_oh = oh; + /* prev_ow = ow; */ + } + + if (snrt_is_compute_core()) { + // Wait for data + snrt_cluster_hw_barrier(); + // initially setup SSRs + uint32_t setup_SSR = (oh == cluster_id && ci == 0); + + // Start kernel + batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id], + &gamma[ci + compute_id], &beta[ci + compute_id], + &ofmap[write_buf * ofmap_size / 2 + compute_id], + l->OW, l->TILE_CI, compute_num, setup_SSR); + + write_buf = !write_buf; + read_buf = !read_buf; + } + } + } + + snrt_cluster_hw_barrier(); + + // Store last tile back + if (snrt_is_dm_core()) { + if (l->TILE_CI == l->CI) { + // data is stored consecutively + snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + &ofmap[!read_buf * (ofmap_size / 2)], + sizeof(double) * l->IW * l->CI); + } else { + // data is stored in interleaved layout + snrt_dma_start_2d( + &l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* repetitions */ + } + + snrt_dma_wait_all(); + } +} diff --git a/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c b/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c new file mode 100644 index 000000000..27ad83c84 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/conv2d_layer.c @@ -0,0 +1,396 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "conv2d_layer.h" + +#include "gemm.h" +#include "layer.h" +// #include "printf.h" +#include "snrt.h" +#include "utils.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b))
+
+void conv2d_layer(const conv_layer *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_cluster_core_idx();
+
+    const uint32_t cluster_per_quadrant = min(4, cluster_num);
+
+    // typedef struct cluster_mem_alloc_struct {
+    //     double im2col[2][compute_num][l->FW*l->FH*l->TILE_CI+1];
+    //     double ifmap[2][l->FH][compute_num + l->FW - 1][l->TILE_CI];
+    //     double weights[compute_num][l->FH*l->FW*l->TILE_CI+1];
+    //     double ofmap[2][compute_num][8];
+    //     volatile uint32_t synch_flag[2];
+    // } cluster_mem_alloc;
+
+    // im2col[2][compute_num][l->FW*l->FH*l->TILE_CI+1];
+    uint32_t im2col_row_stride = l->FW * l->FH * l->TILE_CI + 1;
+    uint32_t im2col_mat_stride = im2col_row_stride * compute_num;
+    uint32_t im2col_size = 2 * im2col_mat_stride;
+
+    // ifmap[2][l->FH][compute_num + l->FW - 1][l->TILE_CI];
+    uint32_t ifmap_col_stride = l->TILE_CI;
+    uint32_t ifmap_row_stride = ifmap_col_stride * (compute_num + l->FW - 1);
+    uint32_t ifmap_stride = ifmap_row_stride * l->FH;
+    uint32_t ifmap_size = 2 * ifmap_stride;
+
+    // weights[compute_num][l->FH*l->FW*l->TILE_CI+1];
+    uint32_t weights_co_stride = l->FH * l->FW * l->TILE_CI + 1;
+    uint32_t weights_size = compute_num * weights_co_stride;
+
+    // ofmap[2][compute_num][8];
+    uint32_t ofmap_co_stride = 8;
+    uint32_t ofmap_stride = compute_num * ofmap_co_stride;
+    uint32_t ofmap_size = 2 * ofmap_stride;
+
+    double *ptr = (double *)snrt_l1_next();
+    double *im2col = ptr;
+    ptr += im2col_size;
+    double *ifmap = ptr;
+    ptr += ifmap_size;
+    double *weights = ptr;
+    ptr += weights_size;
+    double *ofmap = ptr;
+    ptr += ofmap_size;
+    volatile uint32_t *synch_flag = (void *)ptr;
+
+    uint32_t write_buf = 0;
+    uint32_t read_buf = 0;
+
+    int32_t oh_prev = -1;
+    int32_t ow_prev = -1;
+
+    // snrt_global_barrier();
+
+    benchmark_get_cycle();
+
+    // Distribute output channels across clusters
+    for (uint32_t co = cluster_id * compute_num; co < l->CO;
+         co += cluster_num * compute_num) {
+        // Tile CI dimension
+        for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) {
+            benchmark_get_cycle();
+
+            // Load weights in the beginning
+            if (snrt_is_dm_core()) {
+                snrt_dma_start_tracking();
+
+                // Weights are stored in CO x FH x FW x CI format with
+                // additional padding (CI + 1) to prevent banking conflicts
+                for (uint32_t _co = 0; _co < 8; _co++) {
+                    if (l->TILE_CI == l->CI) {
+                        snrt_dma_txid_t weight_txid = snrt_dma_start_1d(
+                            &weights[_co * weights_co_stride], /* dst */
+                            &l->weights[(co + _co) * l->FH * l->FW *
+                                        l->CI], /* src */
+                            sizeof(double) * l->CI * l->FH * l->FW /* size */);
+                    } else {
+                        snrt_dma_txid_t weight_txid = snrt_dma_start_2d(
+                            &weights[_co * weights_co_stride], /* dst */
+                            &l->weights[(co + _co) * l->FH * l->FW * l->CI +
+                                        ci], /* src */
+                            sizeof(double) * l->TILE_CI, /* size */
+                            sizeof(double) * l->TILE_CI, /* dst_stride */
+                            sizeof(double) * l->CI, /* src_stride */
+                            l->FH * l->FW /* repetitions */);
+                    }
+                }
+                snrt_dma_wait_all();
+
+                snrt_dma_stop_tracking();
+            }
+            benchmark_get_cycle();
+
+            // Iterate over pixels, outer loop iterates over tiles of columns in
+            // feature map, inner loop iterates over rows. Each core processes
+            // one pixel at a time. In case of cluster2cluster communication,
+            // each cluster in a quadrant starts with a different row. The first
+            // time, all clusters load a different row from memory.
In each
+                // subsequent iteration the leading cluster loads a new row from
+                // main memory and the others load from the next cluster
+                for (uint32_t ow = 0; ow < l->OW; ow += compute_num) {
+                    if (l->cluster2cluster) {
+                        synch_flag[0] = 0;
+                        synch_flag[1] = 0;
+                    }
+
+                    for (uint32_t _oh = 0; _oh < l->OH; _oh++) {
+                        // If cluster2cluster is enabled, each cluster starts
+                        // with a different row; this requires OH to be at
+                        // least the number of clusters per quadrant
+                        uint32_t oh = ((cluster_per_quadrant - 1) -
+                                       (cluster_id % cluster_per_quadrant) +
+                                       _oh) %
+                                      l->OH;
+
+                        if (snrt_is_dm_core()) {
+                            uint32_t n_ifmap_pixel_read =
+                                min(compute_num + l->FW - 1,
+                                    l->IW - ow + (l->pad << 1));
+                            uint32_t n_ofmap_pixel_read =
+                                min(compute_num, l->OW - ow);
+                            uint32_t n_ofmap_pixel_write =
+                                min(compute_num, l->OW - ow_prev);
+
+                            // Load the intermediate outputs from memory
+                            if (ci != 0) {
+                                snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d(
+                                    &ofmap[write_buf * ofmap_stride], /* dst */
+                                    &l->ofmap[(oh * l->OW + ow) * l->CO +
+                                              co],             /* src */
+                                    sizeof(double) * 8,         /* size */
+                                    sizeof(double) * 8,         /* dst_stride */
+                                    sizeof(double) * l->CO,     /* src_stride */
+                                    n_ofmap_pixel_read);        /* repetitions */
+                                snrt_dma_wait_all();
+                            } else {
+                                dma_memset(&ofmap[write_buf * ofmap_stride], 0,
+                                           sizeof(double) * 8 *
+                                               n_ofmap_pixel_read);
+                            }
+
+                            if (l->cluster2cluster) {
+                                // All clusters except the last need to wait
+                                // until the cluster synch flag is cleared
+                                if (cluster_id % cluster_per_quadrant !=
+                                    cluster_per_quadrant - 1) {
+                                    while (synch_flag[write_buf])
+                                        ;
+                                }
+                            }
+
+                            snrt_dma_start_tracking();
+
+                            // The input feature map needs to be loaded from
+                            // main memory in the following cases:
+                            // 1) cluster2cluster communication is not enabled
+                            // 2) in the first iteration, every cluster loads
+                            //    a row from main memory
+                            // 3) the leading cluster always loads rows from
+                            //    main memory
+                            if (!l->cluster2cluster || _oh == 0 ||
+                                cluster_id % cluster_per_quadrant == 0) {
+                                // Transfer in FH * (compute_num + FW - 1)
+                                // pixels so that the im2col transformation
+                                // can be performed for every core
+
+                                for (uint32_t fh = 0; fh < l->FH; fh++) {
+                                    // Fill horizontal lines with zeros for
+                                    // padding
+                                    if (oh + fh < l->pad ||
+                                        oh + fh >= l->IH + ((l->FH - 1) >> 1)) {
+                                        dma_memset(
+                                            &ifmap[write_buf * ifmap_stride +
+                                                   fh * ifmap_row_stride],
+                                            0,
+                                            sizeof(double) * l->TILE_CI *
+                                                n_ifmap_pixel_read);
+                                    } else {
+                                        uint32_t padding_left =
+                                            (ow < l->pad) ? (l->pad - ow) : 0;
+                                        uint32_t padding_right =
+                                            (ow + compute_num + l->pad <= l->OW)
+                                                ? 0
+                                                : n_ifmap_pixel_read -
+                                                      ((l->FW - 1) >> 1) -
+                                                      (l->IW - ow);
+
+                                        // If padding is needed, zero the
+                                        // whole buffer first
+                                        if (padding_left || padding_right) {
+                                            dma_memset(
+                                                &ifmap[write_buf * ifmap_stride +
+                                                       fh * ifmap_row_stride],
+                                                0,
+                                                sizeof(double) *
+                                                    (compute_num + l->FW - 1) *
+                                                    l->TILE_CI);
+                                        }
+
+                                        // Then fill in the rest of the values
+                                        snrt_dma_txid_t ifmap_txid =
+                                            snrt_dma_start_2d(
+                                                &ifmap[write_buf * ifmap_stride +
+                                                       fh * ifmap_row_stride +
+                                                       padding_left *
+                                                           ifmap_col_stride], /* dst */
+                                                (double *)&l->ifmap
+                                                    [((oh + fh - l->pad) * l->IW +
+                                                      ow -
+                                                      (l->pad - padding_left)) *
+                                                         l->CI +
+                                                     ci],        /* src */
+                                                sizeof(double) *
+                                                    l->TILE_CI,  /* size */
+                                                sizeof(double) *
+                                                    l->TILE_CI,  /* dst_stride */
+                                                sizeof(double) *
+                                                    l->CI,       /* src_stride */
+                                                n_ifmap_pixel_read -
+                                                    padding_left -
+                                                    padding_right); /* repetitions */
+                                        snrt_dma_wait_all();
+                                    }
+                                }
+
+                            }
+
+                            // Otherwise transfer the tile from the previous
+                            // cluster
+                            else {
+                                // A cluster always loads from the previous
+                                // cluster (cluster_offset is defined earlier
+                                // in this function)
+                                volatile uint32_t *src_synch_flag =
+                                    (void *)synch_flag - cluster_offset;
+                                double *src_ifmap = (void *)ifmap - cluster_offset;
+
+                                // Wait until the previous cluster has
+                                // released its data
+                                if (l->cluster2cluster &&
+                                    (cluster_id % cluster_per_quadrant) != 0) {
+                                    while (src_synch_flag[!write_buf] == 0)
+                                        ;
+                                }
+
+                                // Transfer in FH * (compute_num + FW - 1)
+                                // pixels so that the im2col transformation
+                                // can be performed for every core
+                                snrt_dma_txid_t ifmap_txid = snrt_dma_start_1d(
+                                    &ifmap[write_buf * ifmap_stride],
+                                    &src_ifmap[!write_buf * ifmap_stride],
+                                    sizeof(double) * n_ifmap_pixel_read *
+                                        l->TILE_CI * l->FH);
+                                snrt_dma_wait_all();
+
+                                // Clear the synch flag of the source cluster
+                                if (l->cluster2cluster &&
+                                    (cluster_id % cluster_per_quadrant) != 0) {
+                                    src_synch_flag[!write_buf] = 0;
+                                }
+                            }
+
+                            snrt_dma_stop_tracking();
+
+                            // New data has been produced
+                            if (l->cluster2cluster) {
+                                synch_flag[write_buf] = 1;
+                            }
+
+                            snrt_dma_start_tracking();
+
+                            // Reshuffle and write data to the im2col buffer
+                            // via the DMA
+                            for (uint32_t n = 0; n < compute_num; n++) {
+                                // Only construct the im2col matrix for pixels
+                                // inside the output width (the last tile may
+                                // be partial)
+                                if (ow + n < l->OW) {
+                                    snrt_dma_txid_t im2col_txid = snrt_dma_start_2d(
+                                        &im2col[write_buf * im2col_mat_stride +
+                                                n * im2col_row_stride], /* dst */
+                                        &ifmap[read_buf * ifmap_stride +
+                                               n * ifmap_col_stride],   /* src */
+                                        sizeof(double) * l->FW *
+                                            l->TILE_CI,                 /* size */
+                                        sizeof(double) * l->FW *
+                                            l->TILE_CI,                 /* dst_stride */
+                                        sizeof(double) *
+                                            (compute_num + l->FW - 1) *
+                                            l->TILE_CI,                 /* src_stride */
+                                        l->FH);                         /* repetitions */
+                                }
+                            }
+
+                            // Wait for the im2col transform to end, and
+                            // synchronize with the compute cores
+                            snrt_dma_wait_all();
+                            snrt_dma_stop_tracking();
+                            snrt_cluster_hw_barrier();
+                            benchmark_get_cycle();
+
+                            // Transfer back the output feature maps
+                            if (oh_prev + ow_prev >= 0) {
+                                snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d(
+                                    &l->ofmap[(oh_prev * l->OW + ow_prev) *
+                                                  l->CO +
+                                              co],                   /* dst */
+                                    &ofmap[!read_buf * ofmap_stride], /* src */
+                                    sizeof(double) * 8,               /* size */
+                                    sizeof(double) * l->CO,           /* dst_stride */
+                                    sizeof(double) * 8,               /* src_stride */
+                                    n_ofmap_pixel_write);             /* repetitions */
+                                snrt_dma_wait_all();
+                            }
+                            oh_prev = oh;
+                            ow_prev = ow;
+
+                            // Toggle write and read buffer
+                            write_buf = !write_buf;
+                            read_buf = !read_buf;
+                        }
+
+                        if (snrt_is_compute_core()) {
+                            // Wait until the DMA core has finished the im2col
+                            // transform
+                            benchmark_get_cycle();
+                            snrt_cluster_hw_barrier();
+                            benchmark_get_cycle();
+
+                            // Each core performs a matrix multiplication on
+                            // the im2col buffer, of size
+                            // (1 x FHxFWxCI) x (FHxFWxCI x 8), where 8
+                            // corresponds to CO and is the unrolling factor
+                            // needed to prevent RAW conflicts.
+                            if (ow + compute_id < l->OW) {
+                                uint32_t setup_SSR =
+                                    (ci == 0 && ow == 0 && _oh == 0) ? 1 : 0;
+
+                                if (ci != 0 && l->TILE_CI != l->CI) {
+                                    const uint32_t alpha = 0;
+                                    gemm_fp64_opt(
+                                        1, 8, l->FH * l->FW * l->TILE_CI,
+                                        &im2col[read_buf * im2col_mat_stride +
+                                                compute_id * im2col_row_stride],
+                                        0, 0, weights,
+                                        l->FH * l->FW * l->TILE_CI + 1, 1,
+                                        &ofmap[write_buf * ofmap_stride +
+                                               compute_id * ofmap_co_stride],
+                                        0, &alpha, setup_SSR);
+
+                                } else {
+                                    const uint32_t alpha = 1;
+                                    gemm_fp64_opt(
+                                        1, 8, l->FH * l->FW * l->TILE_CI,
+                                        &im2col[read_buf * im2col_mat_stride +
+                                                compute_id * im2col_row_stride],
+                                        0, 0, weights,
+                                        l->FH * l->FW * l->TILE_CI + 1, 1,
+                                        &ofmap[write_buf * ofmap_stride +
+                                               compute_id * ofmap_co_stride],
+                                        0, &alpha, setup_SSR);
+                                }
+                            }
+                            // Toggle read and write buffer
+                            read_buf = !read_buf;
+                            write_buf = !write_buf;
+                        }
+                    }
+                }
+
+                snrt_cluster_hw_barrier();
+
+                // Transfer back the last output tile
+                if (snrt_is_dm_core()) {
+                    snrt_dma_txid_t ofmap_txid = snrt_dma_start_2d(
+                        &l->ofmap[(oh_prev * l->OW + ow_prev) * l->CO +
+                                  co],                        /* dst */
+                        &ofmap[!read_buf * ofmap_stride],     /* src */
+                        sizeof(double) * 8,                   /* size */
+                        sizeof(double) * l->CO,               /* dst_stride */
+                        sizeof(double) * 8,                   /* src_stride */
+                        min(compute_num, l->OW - ow_prev));   /* repetitions */
+                    snrt_dma_wait_all();
+                }
+            }
+        }
+
+        // snrt_global_barrier();
+}
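The double buffering that conv2d_layer relies on is easier to see in isolation. The listing below is a host-compilable sketch of the ping-pong idiom, not device code: plain arrays stand in for the TCDM tiles, sequential loop phases stand in for the DMA and compute cores, and the producer is started one buffer ahead. On the device, each core keeps its own copy of the two indices and a hardware barrier separates the phases, so the exact initialization differs from this simplified model.

    #include <stdint.h>
    #include <stdio.h>

    #define TILE_WORDS 8

    int main(void) {
        // Two buffers stand in for the double-buffered TCDM tiles.
        double tiles[2][TILE_WORDS];
        uint32_t write_buf = 0, read_buf = 1;

        // Prime the read buffer so the first compute phase has valid data.
        for (int i = 0; i < TILE_WORDS; i++) tiles[read_buf][i] = i;

        for (int iter = 0; iter < 4; iter++) {
            // "DMA" phase: fill the write buffer with the next tile.
            for (int i = 0; i < TILE_WORDS; i++)
                tiles[write_buf][i] = (iter + 1) * TILE_WORDS + i;

            // "Compute" phase: consume the read buffer. On the device these
            // two phases run concurrently on different cores, separated by
            // snrt_cluster_hw_barrier().
            double acc = 0;
            for (int i = 0; i < TILE_WORDS; i++) acc += tiles[read_buf][i];
            printf("iteration %d: consumed buffer %u, sum = %.1f\n", iter,
                   read_buf, acc);

            // Toggle both indices, as the layer does at the end of each row.
            write_buf = !write_buf;
            read_buf = !read_buf;
        }
        return 0;
    }

diff --git a/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c b/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c
new file mode 100644
index 000000000..f45c2849d
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/gelu_layer.c
@@ -0,0 +1,69 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.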
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gelu_layer.h"
+
+#include "gelu.h"
+#include "layer.h"
+// #include "printf.h"
+#include "snrt.h"
+
+void gelu_layer(const gelu_layer_t *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_cluster_core_idx();
+
+    uint32_t ifmap_size =
+        l->BATCH_SIZE * l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float);
+    uint32_t ofmap_size = ifmap_size;
+
+    void *ptr = (float *)snrt_l1_next();
+    float *ifmap = ptr;
+    ptr += ifmap_size;
+    float *ofmap = ptr;
+    ptr += ofmap_size;
+
+    // DMA transfer the ifmap into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
+            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
+            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
+            l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_compute_core() && compute_id < compute_num) {
+        // determine the row offset for each core
+        int32_t row_offset = compute_id * l->HIDDEN_NODES;
+
+        // determine the row stride of each matrix
+        int32_t ldI = compute_num * l->HIDDEN_NODES;
+
+        // determine the batch offset for each core
+        int32_t batch_offset = l->SEQ_LEN * l->HIDDEN_NODES;
+
+        for (int b = 0; b < l->BATCH_SIZE; b++) {
+            gelu_fp32(&ifmap[row_offset + b * batch_offset],
+                      &ofmap[row_offset + b * batch_offset], ldI,
+                      l->BATCH_SIZE, l->SEQ_LEN / 8, l->HIDDEN_NODES);
+        }
+
+        snrt_cluster_hw_barrier();
+
+    } else {
+        snrt_cluster_hw_barrier();
+    }
+
+    snrt_global_barrier();
+}
\ No newline at end of file
diff --git a/target/sim/sw/device/libraries/snDNN/src/gemm_layer.c b/target/sim/sw/device/libraries/snDNN/src/gemm_layer.c
new file mode 100644
index 000000000..e69de29bb
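gelu_layer, layernorm_layer, linear_layer and softmax_layer all carve their working set out of cluster TCDM with the same pointer-bumping pattern. Below is a minimal host-compilable model of that pattern, under the assumption that a static array can stand in for the L1 scratchpad returned by snrt_l1_next(); the dimensions are made up. The arithmetic on a void pointer is a GNU extension, mirroring the style of the layer code.

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t l1[1 << 16];  // stand-in for the cluster TCDM

    int main(void) {
        uint32_t batch_size = 2, seq_len = 8, hidden_nodes = 16;
        uint32_t ifmap_size =
            batch_size * seq_len * hidden_nodes * sizeof(float);
        uint32_t ofmap_size = ifmap_size;

        // Bump a byte pointer to lay the buffers out back to back.
        void *ptr = l1;
        float *ifmap = ptr;
        ptr += ifmap_size;
        float *ofmap = ptr;
        ptr += ofmap_size;

        printf("ifmap at offset %u, ofmap at offset %u, %u bytes used\n",
               (uint32_t)((uint8_t *)ifmap - l1),
               (uint32_t)((uint8_t *)ofmap - l1),
               (uint32_t)((uint8_t *)ptr - l1));
        return 0;
    }

diff --git a/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c b/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c
new file mode 100644
index 000000000..3a1c63df9
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/layernorm_layer.c
@@ -0,0 +1,60 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.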
+// SPDX-License-Identifier: Apache-2.0 + +#include "layernorm_layer.h" + +#include "layer.h" +#include "layernorm.h" +// #include "printf.h" +#include "snrt.h" + +void layernorm_layer(const layernorm_layer_t *l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + + uint32_t ifmap_size = + l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float); + uint32_t ofmap_size = ifmap_size; + + void *ptr = (float *)snrt_l1_next(); + float *ifmap = ptr; + ptr += ifmap_size; + float *ofmap = ptr; + ptr += ofmap_size; + + // DMA transfer the ifmap into the cluster TCDM + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( + ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), + l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), + l->SEQ_LEN * l->EMBEDDINGS * sizeof(float)); + + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_compute_core()) { + // determine the row offset for each core + int32_t row_offset = compute_id * l->EMBEDDINGS; + + // determine the row stride of each matrix + int32_t ldI = compute_num * l->EMBEDDINGS; + + // determine the batch offset for each core + int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS; + + // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); + layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, + batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, + l->EMBEDDINGS, l->EPS); + + } else { + snrt_cluster_hw_barrier(); + } + + snrt_global_barrier(); +} \ No newline at end of file diff --git a/target/sim/sw/device/libraries/snDNN/src/linear_layer.c b/target/sim/sw/device/libraries/snDNN/src/linear_layer.c new file mode 100644 index 000000000..f19a74bd6 --- /dev/null +++ b/target/sim/sw/device/libraries/snDNN/src/linear_layer.c @@ -0,0 +1,104 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "linear_layer.h"
+
+#include <math.h>
+
+#include "layer.h"
+#include "linear.h"
+// #include "printf.h"
+#include "snrt.h"
+
+void linear_layer(const linear_layer_t *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_cluster_core_idx();
+
+    uint32_t ifmap_size = l->CH * l->CW * sizeof(float);
+    uint32_t weights_size = l->CO * l->CI * sizeof(float);
+    uint32_t bias_size = l->CO * sizeof(float);
+    uint32_t ofmap_size = l->CH * l->CO * sizeof(float);
+
+    void *ptr = (float *)snrt_l1_next();
+    float *ifmap = ptr;
+    ptr += ifmap_size;
+    float *weights = ptr;
+    ptr += weights_size;
+    float *bias = ptr;
+    ptr += bias_size;
+    float *ofmap = ptr;
+    ptr += ofmap_size;
+    float *result = ptr;
+    ptr += ofmap_size;
+
+    // DMA transfer the weights and bias into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_bias = snrt_dma_start_1d(bias, l->bias, bias_size);
+        snrt_dma_txid_t txid_weights = snrt_dma_start_2d(
+            weights, l->weights, l->CO * sizeof(float), l->CO * sizeof(float),
+            l->CO * sizeof(float), l->CI * sizeof(float));
+
+        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
+            ifmap, l->ifmap, l->CH * sizeof(float), l->CH * sizeof(float),
+            l->CH * sizeof(float), l->CW * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_compute_core() && compute_id < compute_num) {
+        // determine the row stride of each matrix
+        int32_t ldI = l->CH * l->CW;
+        int32_t ldW = compute_num * l->CO;
+        int32_t ldB = compute_num;
+        int32_t ldO = ldB;
+
+        // determine the row offset of each matrix
+        int32_t offW = compute_id * l->CO;
+        int32_t offB = compute_id;
+        int32_t offO = compute_id;
+
+        linear_fp32(ifmap, ldI, &weights[offW], ldW, &bias[compute_id], ldB,
+                    ofmap, ldO, l->CI, l->CO / compute_num, l->CH);
+
+    } else {
+        snrt_cluster_hw_barrier();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_result = snrt_dma_start_2d(
+            result, l->result, l->CH * sizeof(float), l->CH * sizeof(float),
+            l->CH * sizeof(float), l->CO * sizeof(float));
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // TODO: fix this, wrong values for ofmap printed
+    if (compute_id == 0) {
+        // compare result with ofmap
+        float tolerance = 1e-6f;
+        int error = 0;
+        for (int i = 0; i < l->CH; i++) {
+            for (int j = 0; j < l->CO; j++) {
+                if (fabsf(result[i * l->CO + j] - ofmap[i * l->CO + j]) >
+                    tolerance) {
+                    printf(
+                        "MISMATCH: result[%d][%d] = %f, ofmap[%d][%d] = %f\n",
+                        i, j, result[i * l->CO + j], i, j,
+                        ofmap[i * l->CO + j]);
+                    error += 1;
+                }
+            }
+        }
+
+        printf("[%d/%d] mismatches\n", error, l->CH * l->CO);
+    }
+}
\ No newline at end of file
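The verification step at the end of linear_layer compares the computed ofmap against a golden result with an absolute tolerance. The helper below restates that check as a self-contained host program; check_fp32 is an illustrative name, not part of the snDNN API.

    #include <math.h>
    #include <stdio.h>

    // Count entries whose absolute difference exceeds the tolerance.
    static int check_fp32(const float *ref, const float *res, int n,
                          float tol) {
        int errors = 0;
        for (int i = 0; i < n; i++) {
            if (fabsf(ref[i] - res[i]) > tol) errors++;
        }
        return errors;
    }

    int main(void) {
        float ref[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float res[4] = {1.0f, 2.0f, 3.5f, 4.0f};
        printf("[%d/4] mismatches\n", check_fp32(ref, res, 4, 1e-6f));
        return 0;
    }

diff --git a/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c b/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c
new file mode 100644
index 000000000..561837416
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/maxpool_layer.c
@@ -0,0 +1,115 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.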
+// SPDX-License-Identifier: Apache-2.0
+
+#include "maxpool_layer.h"
+
+#include "layer.h"
+#include "maxpool.h"
+// #include "printf.h"
+#include "snrt.h"
+
+void maxpool_layer(const conv_layer *l) {
+    uint32_t cluster_num = snrt_cluster_num();
+    uint32_t cluster_id = snrt_cluster_idx();
+    uint32_t compute_num = snrt_cluster_compute_core_num();
+    uint32_t compute_id = snrt_global_core_idx();
+
+    // Each cluster loads one tile of kernel size
+    uint32_t ifmap_size = 2 * l->FH * l->FW * l->TILE_CI;
+    uint32_t ofmap_size = 2 * l->TILE_CI;
+
+    double *ptr = (double *)snrt_l1_next();
+    double *ifmap = ptr;
+    ptr += ifmap_size;
+    double *ofmap = ptr;
+    ptr += ofmap_size;
+
+    uint32_t read_buf = 0;
+    uint32_t write_buf = 0;
+
+    // Coordinates of the previous tile; the DM core assigns them at the end
+    // of its first iteration, before their first use
+    uint32_t prev_oh;
+    uint32_t prev_ow;
+    uint32_t prev_ci;
+
+    // Tiles are distributed across clusters in a cyclic fashion
+    for (uint32_t tile = cluster_id; tile < l->OH * l->OW;
+         tile += cluster_num) {
+        for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) {
+            uint32_t oh = tile / l->OW;
+            uint32_t ow = tile % l->OW;
+
+            if (snrt_is_dm_core()) {
+                for (uint32_t fh = 0; fh < l->FH; fh++) {
+                    if (l->TILE_CI == l->CI) {
+                        snrt_dma_start_1d(
+                            &ifmap[write_buf * (ifmap_size / 2) +
+                                   fh * l->FW * l->TILE_CI], /* dst */
+                            &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) *
+                                      l->CI], /* src */
+                            sizeof(double) * l->TILE_CI * l->FW /* size */);
+                    } else {
+                        snrt_dma_start_2d(
+                            &ifmap[write_buf * (ifmap_size / 2) +
+                                   fh * l->FW * l->TILE_CI], /* dst */
+                            &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) *
+                                          l->CI +
+                                      ci],                  /* src */
+                            sizeof(double) * l->TILE_CI,    /* size */
+                            sizeof(double) * l->TILE_CI,    /* dst_stride */
+                            sizeof(double) * l->CI,         /* src_stride */
+                            l->FW /* repetitions */);
+                    }
+                }
+                snrt_dma_wait_all();
+
+                // synchronize with compute cores after loading data
+                snrt_cluster_hw_barrier();
+
+                if (!(tile == cluster_id && ci == 0)) {
+                    snrt_dma_start_2d(
+                        &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI +
+                                  prev_ci],                   /* dst */
+                        &ofmap[!read_buf * (ofmap_size / 2)], /* src */
+                        sizeof(double) * l->TILE_CI,          /* size */
+                        sizeof(double) * l->CI,               /* dst_stride */
+                        sizeof(double) * l->TILE_CI,          /* src_stride */
+                        1 /* repetitions */);
+                }
+
+                snrt_dma_wait_all();
+                write_buf = !write_buf;
+                read_buf = !read_buf;
+                prev_ci = ci;
+                prev_oh = oh;
+                prev_ow = ow;
+            }
+
+            if (snrt_is_compute_core()) {
+                // wait for data to arrive
+                snrt_cluster_hw_barrier();
+
+                maxpool_fp64(&ifmap[read_buf * ifmap_size / 2 + compute_id],
+                             &ofmap[write_buf * ofmap_size / 2 + compute_id],
+                             l->TILE_CI, l->FH, l->FW, compute_num);
+
+                write_buf = !write_buf;
+                read_buf = !read_buf;
+            }
+        }
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Write back the output of the last tile
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_2d(
+            &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */
+            &ofmap[!read_buf * (ofmap_size / 2)],                     /* src */
+            sizeof(double) * l->TILE_CI,                              /* size */
+            sizeof(double) * l->CI,                              /* dst_stride */
+            sizeof(double) * l->TILE_CI,                         /* src_stride */
+            1 /* repetitions */);
+        snrt_dma_wait_all();
+    }
+}
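The outer loop of maxpool_layer deals the OH * OW output tiles to the clusters round-robin: every cluster starts at its own index and strides by the number of clusters, so no coordination is needed. The host-compilable sketch below prints the resulting assignment for made-up dimensions.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint32_t OH = 3, OW = 4;   // output feature map size (illustrative)
        uint32_t cluster_num = 4;  // number of clusters (illustrative)

        for (uint32_t cluster_id = 0; cluster_id < cluster_num;
             cluster_id++) {
            printf("cluster %u:", cluster_id);
            for (uint32_t tile = cluster_id; tile < OH * OW;
                 tile += cluster_num) {
                printf(" (oh=%u, ow=%u)", tile / OW, tile % OW);
            }
            printf("\n");
        }
        return 0;
    }

diff --git a/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c b/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c
new file mode 100644
index 000000000..86413de3f
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/nnlinear_backend_baseline.c
@@ -0,0 +1,275 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.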
+// SPDX-License-Identifier: Apache-2.0
+
+#include "nnlinear_backend_baseline.h"
+
+#include "network.h"
+#include "nnlinear_baseline.h"
+// #include "printf.h"
+#include "snrt.h"
+#include "utils.h"
+
+// define which parts of the network to run
+#define RUN_FEEDFORWARD 1
+#define RUN_GRADIENT_UPDATE 1
+#define RUN_TRAINING_STEP 1
+#define GET_ACCURACY 1
+#define GET_LOSS 1
+#define RUN_RTL 0
+#define NUM_EPOCHS 1
+#define BATCH_SIZE 1
+#define DATASET_SIZE 2  // 60000
+#define INFO 0
+
+void nnlinear_backend_baseline(const network_fp32_t *n) {
+    uint32_t cluster_num = snrt_cluster_num();  // Total number of clusters
+    uint32_t cluster_core_num =
+        snrt_cluster_core_num();  // Total cores per cluster
+    uint32_t cluster_id = snrt_cluster_idx();  // Cluster ID
+    uint32_t compute_num =
+        snrt_cluster_compute_core_num();  // Compute cores per cluster
+    uint32_t global_compute_num =
+        snrt_global_core_num();  // Total cores incl. DM cores in the system
+    uint32_t compute_id = snrt_cluster_core_idx();  // Cluster-local core ID
+    uint32_t dm_id = snrt_cluster_dm_core_idx();  // DM core ID of each cluster
+    uint32_t global_compute_id =
+        snrt_global_core_idx();  // System-wide core ID
+
+    if (INFO == 1) {
+        if (compute_id == 0 && cluster_id == 0) {
+            printf(
+                "======================== System Info "
+                "========================\n");
+            printf("Total number of clusters: %d\n", cluster_num);
+            printf("Total cores per cluster: %d\n", cluster_core_num);
+            printf("Number of compute cores per cluster: %d\n", compute_num);
+            printf("Total cores incl. DM cores: %d\n", global_compute_num);
+            printf(
+                "============================================================="
+                "\n");
+        }
+    }
+
+    snrt_cluster_hw_barrier();
+
+    uint32_t weights_size = NUM_CLASSES * IN_CH * n->dtype;
+    uint32_t biases_size = NUM_CLASSES * n->dtype;
+    uint32_t activations_size = NUM_CLASSES * n->dtype;
+    uint32_t image_size = IN_CH * n->dtype;
+    uint32_t loss_size = n->dtype;
+    uint32_t labels_size = sizeof(uint32_t);
+
+    // Cluster 0 variables:
+    float *weights;
+    float *weight_grads;
+    float *biases;
+    float *bias_grads;
+    float *images;
+    float *activations;
+    float *loss;
+    uint32_t *targets;
+
+    void *tcdm_ptr = (float *)snrt_l1_next();
+
+    // Cluster 0 memory map
+    weights = tcdm_ptr;
+    tcdm_ptr += weights_size;
+    weight_grads = tcdm_ptr;
+    tcdm_ptr += weights_size;
+    biases = tcdm_ptr;
+    tcdm_ptr += biases_size;
+    activations = tcdm_ptr;
+    tcdm_ptr += activations_size;
+    bias_grads = tcdm_ptr;
+    tcdm_ptr += biases_size;
+    images = tcdm_ptr;
+    tcdm_ptr += image_size;
+    loss = tcdm_ptr;
+    tcdm_ptr += loss_size;
+    targets = tcdm_ptr;
+    tcdm_ptr += labels_size;
+
+    // DRAM pointers to images and targets
+    uint32_t *images_dram = (void *)0x80040000;
+    uint32_t *targets_dram = (void *)0x80108000;
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_B = snrt_dma_start_1d(biases, n->b, biases_size);
+        snrt_dma_wait_all();
+        snrt_dma_txid_t txid_W =
+            snrt_dma_start_2d(weights, n->W, IN_CH * n->dtype, IN_CH * n->dtype,
+                              IN_CH * n->dtype, NUM_CLASSES);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    uint32_t number_of_images = 256;
+    int correct = 0;
+    int predict = 0;
+    int epoch_count = 0;
+    float epoch_loss = 0, epoch_acc = 0;
+    float mean_epoch_loss = 0, mean_epoch_acc = 0;
+    float batch_acc = 0;
+    float batch_loss = 0;
+    loss[0] = 0.0f;
+
+    int batches = DATASET_SIZE / BATCH_SIZE;
+
+    for (int epoch = 0; epoch < NUM_EPOCHS; epoch++) {
+        if (INFO == 1) {
+            if (compute_id == 0 && cluster_id == 0) {
+                printf(
+                    "======================== EPOCH [%d/%d] start. "
+                    "========================\n",
+                    (epoch + 1), NUM_EPOCHS);
+            }
+        }
+        for (int batch = 0; batch < batches; batch++) {
+            batch_loss = 0;
+            batch_acc = 0;
+            correct = 0;
+            if (snrt_is_compute_core()) {
+                if (INFO == 1) {
+                    if (compute_id == 0 && cluster_id == 0) {
+                        printf(
+                            "======================== BATCH [%d/%d] start. "
+                            "========================\n",
+                            (batch + 1), batches);
+                    }
+                }
+                /* Zero out the gradients
+                 * TODO: make this more efficient!
+                 */
+                for (int i = 0; i < NUM_CLASSES; i++) {
+                    bias_grads[i] = 0;
+                    for (int j = 0; j < IN_CH; j++) {
+                        weight_grads[i * IN_CH + j] = 0;
+                    }
+                }
+
+                if (INFO == 1) {
+                    if (compute_id == 0 && cluster_id == 0) {
+                        printf("INFO: Gradients have been zeroed out.\n");
+                    }
+                }
+
+                snrt_cluster_hw_barrier();
+
+            } else if (!snrt_is_compute_core()) {
+                snrt_cluster_hw_barrier();
+            }
+            for (uint32_t image = 0; image < BATCH_SIZE; image++) {
+                uint32_t volatile curr_img =
+                    image * IN_CH + batch * BATCH_SIZE * IN_CH;
+                uint32_t volatile curr_target = image + batch * BATCH_SIZE;
+                if (snrt_is_dm_core()) {
+                    snrt_dma_start_tracking();
+                    snrt_dma_txid_t txid_img =
+                        snrt_dma_start_1d(images,                  // destination
+                                          &images_dram[curr_img],  // source
+                                          n->dtype * IN_CH);       // size
+                    snrt_dma_wait_all();
+                    snrt_dma_txid_t txid_target =
+                        snrt_dma_start_1d(targets,  // destination
+                                          &targets_dram[curr_target],  // source
+                                          sizeof(uint32_t));           // size
+                    snrt_dma_wait_all();
+                }
+
+                snrt_cluster_hw_barrier();
+
+                if (snrt_is_compute_core() && compute_id < compute_num) {
+                    GradientUpdate_baseline(images, activations, biases,
+                                            weights, weight_grads, bias_grads,
+                                            targets[0], loss);
+                    snrt_cluster_hw_barrier();
+                    batch_loss += *loss;
+                    /* Accuracy calculation */
+                    float max_activation = activations[0];
+                    predict = 0;
+                    for (int i = 0; i < NUM_CLASSES; i++) {
+                        if (max_activation < activations[i]) {
+                            max_activation = activations[i];
+                            predict = i;
+                        }
+                    }
+
+                    if (predict == targets[0]) {
+                        correct++;
+                    }
+                    snrt_cluster_hw_barrier();
+
+                } else if (!snrt_is_compute_core()) {
+                    snrt_cluster_hw_barrier();
+                    snrt_cluster_hw_barrier();
+                    snrt_cluster_hw_barrier();
+                    snrt_cluster_hw_barrier();
+                }
+            }
+
+            snrt_cluster_hw_barrier();
+
+            // After each batch, update the weights
+            if (snrt_is_compute_core() && compute_id < compute_num) {
+                batch_acc = (float)correct / (float)BATCH_SIZE;
+                epoch_acc += batch_acc;
+                epoch_loss += batch_loss / BATCH_SIZE;
+                if (INFO == 1) {
+                    if (compute_id == 0 && cluster_id == 0) {
+                        printf(
+                            "A total of [%d/%d] images were predicted "
+                            "correctly in batch %d\n",
+                            correct, BATCH_SIZE, batch + 1);
+                        printf("batch acc = %.6f\n", batch_acc * 100);
+                        printf("batch loss = %.6f\n", batch_loss / BATCH_SIZE);
+                    }
+                }
+
+                TrainingStep_baseline(biases, weights, weight_grads, bias_grads,
+                                      n->learning_rate);
+
+                if (batch % (batches - 1) == 0 && batch != 0) {
+                    epoch_count++;
+                    mean_epoch_loss = epoch_loss / batches;
+                    mean_epoch_acc = epoch_acc / batches;
+                    if (INFO == 1) {
+                        if (compute_id == 0 && cluster_id == 0) {
+                            printf(
+                                "=========================== EPOCH %u done. "
+                                "===========================\n",
+                                epoch_count);
+                            printf(
+                                "=========================== Epoch Acc %.3f "
+                                "===========================\n",
+                                mean_epoch_acc * 100);
+                            printf(
+                                "=========================== Epoch Loss %.3f "
+                                "===========================\n",
+                                mean_epoch_loss);
+                        }
+                    }
+                    epoch_loss = 0;
+                    epoch_acc = 0;
+                }
+
+            } else if (!snrt_is_compute_core()) {
+                snrt_cluster_hw_barrier();
+            }
+
+            snrt_cluster_hw_barrier();
+        }
+    }
+    snrt_global_barrier();
+}
\ No newline at end of file
diff --git a/target/sim/sw/device/libraries/snDNN/src/sndnn.c b/target/sim/sw/device/libraries/snDNN/src/sndnn.c
new file mode 100644
index 000000000..c48d057c1
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/sndnn.c
@@ -0,0 +1,20 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sndnn.h"
+
+#include "batchnorm.c"
+#include "batchnorm_layer.c"
+#include "conv2d.c"
+#include "conv2d_layer.c"
+#include "gelu_layer.c"
+#include "gemm.c"
+#include "layernorm_layer.c"
+#include "linear_layer.c"
+#include "maxpool.c"
+#include "maxpool_layer.c"
+#include "snrt.h"
+#include "utils.c"
+// #include "nnlinear_backend_baseline.c"
+// #include "softmax_layer.c"
diff --git a/target/sim/sw/device/libraries/snDNN/src/sndnn.h b/target/sim/sw/device/libraries/snDNN/src/sndnn.h
new file mode 100644
index 000000000..1d0775b7b
--- /dev/null
+++ b/target/sim/sw/device/libraries/snDNN/src/sndnn.h
@@ -0,0 +1,29 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+// // Snitch cluster specific
+// #include "snitch_cluster_defs.h"
+
+#include "batchnorm.h"
+#include "batchnorm_layer.h"
+#include "conv2d.h"
+#include "conv2d_layer.h"
+#include "gelu.h"
+#include "gelu_layer.h"
+#include "gemm.h"
+#include "layer.h"
+#include "layernorm.h"
+#include "layernorm_layer.h"
+#include "linear.h"
+#include "linear_layer.h"
+#include "maxpool_layer.h"
+#include "network.h"
+#include "utils.h"
+// #include "nnlinear_backend_baseline.h"
+// #include "softmax_layer.h"
+// SPDX-License-Identifier: Apache-2.0 + +// #include "softmax_layer.h" + +#include "layer.h" +// #include "printf.h" +#include "sndnn.h" +#include "snrt.h" +#include "softmax.h" + +void softmax_layer(softmax_layer_t *const l) { + uint32_t cluster_num = snrt_cluster_num(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t compute_num = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + + uint32_t ifmap_size = + l->BATCH_SIZE * l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float); + uint32_t ofmap_size = ifmap_size; + + void *ptr = (float *)snrt_l1_next(); + float *ifmap = ptr; + ptr += ifmap_size; + float *ofmap = ptr; + ptr += ofmap_size; + + // DMA transfer the ifmap into the cluster TCDM + if (snrt_is_dm_core()) { + snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( + ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), + l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), + l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float)); + + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_is_compute_core()) { + // determine the row offset for each core + int32_t row_offset = compute_id * l->INPUT_SAMPLES; + + // determine the row stride of each matrix + int32_t ldI = compute_num * l->INPUT_SAMPLES; + + // determine the batch offset for each core + int32_t batch_offset = l->SEQ_LEN * l->INPUT_SAMPLES; + + // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); + softmax_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, batch_offset, + l->BATCH_SIZE, l->SEQ_LEN / 8, l->INPUT_SAMPLES); + + } else { + snrt_cluster_hw_barrier(); + } + + snrt_global_barrier(); +} \ No newline at end of file diff --git a/target/sim/sw/host/Makefile b/target/sim/sw/host/Makefile index 217287043..b4b177db1 100644 --- a/target/sim/sw/host/Makefile +++ b/target/sim/sw/host/Makefile @@ -7,6 +7,7 @@ # Add user applications to APPS variable APPS = hello_world APPS += offload +APPS += offload_general TARGET ?= all diff --git a/target/sim/sw/host/apps/common_general.mk b/target/sim/sw/host/apps/common_general.mk index cbd7136a6..a500bbe21 100644 --- a/target/sim/sw/host/apps/common_general.mk +++ b/target/sim/sw/host/apps/common_general.mk @@ -4,6 +4,10 @@ # # Luca Colagrande +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + ###################### # Invocation options # ###################### @@ -21,11 +25,16 @@ RISCV_OBJDUMP = riscv64-unknown-elf-objdump RISCV_READELF = riscv64-unknown-elf-readelf # Directories -BUILDDIR = $(abspath build) -HOST_DIR = $(abspath ../../) +BUILDDIRS = $(foreach LIB, $(LIBS), $(abspath $(MK_DIR)/$(APP)/build/$(LIB))) +BUILDDIR = $(abspath $(MK_DIR)/$(APP)/build) +HOST_DIR = $(abspath $(MK_DIR)/../) RUNTIME_DIR = $(abspath $(HOST_DIR)/runtime) DEVICE_DIR = $(abspath $(HOST_DIR)/../device) +# Library names +LIBS = sndnn +LIBS += blas + # Dependencies INCDIRS += $(RUNTIME_DIR) INCDIRS += $(HOST_DIR)/../shared/platform/generated @@ -61,17 +70,19 @@ RISCV_LDFLAGS += -T$(LINKER_SCRIPT) # Device binary DEVICE_BUILDDIR = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build) -DEVICE_BINARY = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build/$(DEVICE_APP).bin) +DEVICE_BINARY = $(foreach DEVICE_APP, $(DEVICE_APPS), $(DEVICE_DIR)/apps/$(DEVICE_APP)/build/$(basename $(notdir $(DEVICE_APP))).bin) ORIGIN_LD = $(foreach DEVICE_APP, $(DEVICE_APPS), 
$(DEVICE_DIR)/apps/$(DEVICE_APP)/build/origin.ld) ########### # Outputs # ########### -PARTIAL_ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(APP).part.elf)) +PARTIAL_ELF = $(abspath $(BUILDDIR)/$(APP).part.elf) +# ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(basename $(notdir $(DEVICE_APP))).elf)) +# DEP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(basename $(notdir $(DEVICE_APP))).d)) ELF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).elf)) DEP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).d)) -PARTIAL_DUMP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(APP).part.dump)) +PARTIAL_DUMP = $(abspath $(BUILDDIR)/$(APP).part.dump) DUMP = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).dump)) DWARF = $(foreach DEVICE_APP, $(DEVICE_APPS), $(abspath $(BUILDDIR)/$(DEVICE_APP).dwarf)) PARTIAL_OUTPUTS = $(PARTIAL_ELF) $(PARTIAL_DUMP) $(ORIGIN_LD) @@ -89,24 +100,24 @@ finalize-build: $(FINAL_OUTPUTS) .PHONY: clean clean: - rm -rf $(BUILDDIR) + rm -rf $(BUILDDIRS) rm -f $(OFFSET_LD) -$(BUILDDIR): +$(BUILDDIRS): mkdir -p $@ $(DEVICE_BUILDDIR): mkdir -p $@ -$(DEP): $(SRCS) | $(BUILDDIR) +$(DEP): $(SRCS) | $(BUILDDIRS) $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(PARTIAL_ELF)' $< > $@ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< >> $@ # Partially linked object -$(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR) +$(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIRS) $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ -$(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIR) +$(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIRS) $(RISCV_OBJDUMP) -D $< > $@ # Device object relocation address @@ -115,14 +126,17 @@ $(ORIGIN_LD): $(PARTIAL_ELF) | $(DEVICE_BUILDDIR) echo "Writing device object relocation address 0x$$RELOC_ADDR to $@"; \ echo "L3_ORIGIN = 0x$$RELOC_ADDR;" > $@ -$(ELF): $(DEP) $(LD_SRCS) $(DEVICE_BINARY) | $(BUILDDIR) - $(eval FINAL_CFLAGS := -DDEVICEBIN=\"$(DEVICE_DIR)/apps/$(CUR_APP_NAME)/build/$(CUR_APP_NAME).bin\") +$(ELF): $(DEP) $(LD_SRCS) $(DEVICE_BINARY) | $(BUILDDIRS) + $(eval CUR_APP_NAME:=$(basename $(notdir $@))) + $(eval CUR_DIR_NAME:=$(basename $(basename $(dir $@)))) + $(eval CUR_LIB=$(strip $(foreach LIB,$(LIBS),$(findstring $(LIB),$(CUR_DIR_NAME))))) + $(eval FINAL_CFLAGS := -DDEVICEBIN=\"$(DEVICE_DIR)/apps/$(CUR_LIB)/$(CUR_APP_NAME)/build/$(CUR_APP_NAME).bin\") $(RISCV_CC) $(RISCV_CFLAGS) $(FINAL_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ -$(DUMP): $(ELF) | $(BUILDDIR) +$(DUMP): $(ELF) | $(BUILDDIRS) $(RISCV_OBJDUMP) -D $< > $@ -$(DWARF): $(ELF) | $(BUILDDIR) +$(DWARF): $(ELF) | $(BUILDDIRS) $(RISCV_READELF) --debug-dump $< > $@ ifneq ($(MAKECMDGOALS),clean) diff --git a/target/sim/sw/host/apps/offload_general/Makefile b/target/sim/sw/host/apps/offload_general/Makefile new file mode 100644 index 000000000..671646049 --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/Makefile @@ -0,0 +1,16 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP = offload_general +SRCS = src/offload_general.c +# INCL_DEVICE_BINARY = true + +DEVICE_APPS = sndnn/gemm +# DEVICE_APPS = gemm +DEVICE_APPS += axpy +# DEVICE_APPS += blas/gemm + +include ../common_general.mk diff --git a/target/sim/sw/host/apps/offload_general/layout.csv b/target/sim/sw/host/apps/offload_general/layout.csv new file mode 100644 index 000000000..c3e3cedd5 --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/layout.csv @@ -0,0 +1,4 @@ +,prepare data,send interrupt,clr interrupt,get local job ptr,barrier,copy job in,copy data in,get args,barrier,compute,barrier,copy output,send interrupt,clr interrupt +0,1,2,,,,,,,,,,,,4 +"range(1,9)",,,1,2,3,,,4,5,6,7,,, +9,,,1,2,,3,4,,5,,6,7,8, diff --git a/target/sim/sw/host/apps/offload_general/src/offload_general.c b/target/sim/sw/host/apps/offload_general/src/offload_general.c new file mode 100644 index 000000000..0405426a5 --- /dev/null +++ b/target/sim/sw/host/apps/offload_general/src/offload_general.c @@ -0,0 +1,49 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// #include "offload_general.h" + +#include "host.c" + +// Other variables +// __thread volatile comm_buffer_t* comm_buffer; + +#define N_JOBS 1 + +int main() { + // Reset and ungate quadrant 0, deisolate + reset_and_ungate_quad(0); + deisolate_quad(0, ISO_MASK_ALL); + + // Enable interrupts to receive notice of job termination + enable_sw_interrupts(); + + // Program Snitch entry point and communication buffer + program_snitches(); + + // Wakeup Snitches for snRuntime initialization + wakeup_snitches_cl(); + + int32_t snitch_return_value = -1; + + // Wait for snRuntime initialization to be over + wait_snitches_done(); + + // Send jobs + // for (int i = 0; i < N_JOBS; i++) { + // Start Snitches + // mcycle(); + wakeup_snitches_cl(); + + // Wait for job done + wait_sw_interrupt(); + // Clear interrupt + clear_sw_interrupt(0); + // wait_snitches_done(); + snitch_return_value = ((int32_t)comm_buffer.usr_data_ptr); + // } + // Exit routine + // mcycle(); + return snitch_return_value; +}
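The body of main() in offload_general.c is the single-job version of the handshake that the commented-out N_JOBS loop hints at: wake the Snitches, block on the software interrupt they raise at job completion, acknowledge it, and read the result out of the communication buffer. Below is a sketch of how the restored loop could look; it reuses only symbols that host.c already provides (wakeup_snitches_cl, wait_sw_interrupt, clear_sw_interrupt, mcycle, comm_buffer) and is not compiled as part of this patch.

    // Hypothetical multi-job variant of the offload loop in main().
    int32_t run_jobs(void) {
        int32_t ret = -1;
        for (int i = 0; i < N_JOBS; i++) {
            mcycle();               // timestamp the job start
            wakeup_snitches_cl();   // start the Snitch clusters
            wait_sw_interrupt();    // block until the job signals completion
            clear_sw_interrupt(0);  // acknowledge the interrupt
            ret = (int32_t)comm_buffer.usr_data_ptr;  // collect the result
        }
        mcycle();  // timestamp the end of the last job
        return ret;
    }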