
Merge branch 'xpulp-dev-matheusd' into 'master'
XPulpIMG MAC and SIMD instructions

See merge request mempool/mempool!78
suehtamacv authored and SamuelRiedel committed Jun 24, 2021
2 parents ca90bbd + 9570e75 commit 763346c
Showing 246 changed files with 10,884 additions and 423 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.d/lint.sh
@@ -34,11 +34,11 @@ EXIT_STATUS=0

# Only check C and C++ files for clang-format compatibility
echo "Checking C/C++ files for clang-format compliance"
clang_files=$(echo $files | tr ' ' '\n' | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
clang_files=$(echo "$files" | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
# Remove files from dependencies
clang_files=$(echo $clang_files | grep -vP "hardware/deps/")
clang_files=$(echo $clang_files | grep -vP "toolchain/")
clang_files=$(echo $clang_files | grep -vP "be/")
clang_files=$(echo "$clang_files" | grep -vP "hardware/deps/")
clang_files=$(echo "$clang_files" | grep -vP "toolchain/")
clang_files=$(echo "$clang_files" | grep -vP "be/")
for file in $clang_files; do
echo $file
./.gitlab-ci.d/run_clang_format.py \
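The quoting change above is what keeps the file list newline-separated when it is piped to `grep`: an unquoted `$files` is word-split by the shell, so its newlines are collapsed into spaces. A quick illustration (not from the repository):

```bash
files=$'main.c\nscript.ld'
echo $files    # word splitting: prints "main.c script.ld" on a single line
echo "$files"  # quoted: prints one file per line, as the grep filters expect
```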
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## Unreleased

## 0.3.0 - 2021-03-31

### Added
- Toolchain and hardware support for Xpulp instructions:
  - Post-incrementing and register-register loads and stores (`pv.lb[u]`, `pv.lh[u]`, `pv.lw`)
  - 32-bit multiply-accumulate instructions (`pv.mac`, `pv.msu`)
  - Arithmetic SIMD instructions (`pv.{add, sub, abs, avg, avgu, min, minu, max, maxu, srl, sra, sll, or, xor, and, dotsp, dotup, dotusp, sdotsp, sdotup, sdotusp}.{h, b}`)
  - Sub-word manipulation SIMD instructions (`pv.{extract, extractu, insert, shuffle2}.{h, b}`)

### Fixed
- Disable the branch prediction if there are multiple early-hits
- Align end of `.text` section with the instruction cache
- Observe the code style guidelines in the matrix multiplication and convolution kernels

### Changed
- Clean-up the pedantic compilation warnings of the matrix multiplication and convolution kernels

## 0.2.0 - 2021-03-29

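As a rough sketch of how the Xpulpimg MAC and SIMD instructions listed above can be reached from C once `__XPULPIMG` is defined (see `runtime.mk` below): the helper names, operand order, and inline-assembly constraints are assumptions for illustration, not code from this commit.

```c
#include <stdint.h>

// Sketch: 32-bit multiply-accumulate via pv.mac, with a plain-C fallback.
// The operand order (acc += a * b) is assumed.
static inline int32_t mac32(int32_t acc, int32_t a, int32_t b) {
#ifdef __XPULPIMG
  asm volatile("pv.mac %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
  return acc;
#else
  return acc + a * b;
#endif
}

// Sketch: element-wise addition of two packed 16-bit halfwords via pv.add.h.
static inline uint32_t add2x16(uint32_t x, uint32_t y) {
#ifdef __XPULPIMG
  uint32_t z;
  asm volatile("pv.add.h %0, %1, %2" : "=r"(z) : "r"(x), "r"(y));
  return z;
#else
  return ((x + y) & 0xFFFFu) | (((x >> 16) + (y >> 16)) << 16);
#endif
}
```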
12 changes: 12 additions & 0 deletions README.md
@@ -111,6 +111,18 @@ app=hello_world make benchmark

You can set up the configuration of the system in the file `config/config.mk`, which controls the total number of cores, the number of cores per tile, and whether the Xpulpimg extension is enabled in the Snitch core; the `xpulpimg` parameter also controls the default core architecture used when compiling applications for MemPool.
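For orientation, a minimal sketch of what such a configuration can look like; only `num_cores` and `xpulpimg` are visible in this commit (in `runtime.mk`), so the remaining names and all values are assumptions:

```make
# Hypothetical excerpt of config/config.mk -- names and values are illustrative.
num_cores          ?= 256  # total number of cores
num_cores_per_tile ?= 4    # cores per tile
xpulpimg           ?= 1    # enable the Xpulpimg extension in the Snitch cores
```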

To simulate the MemPool system with Verilator, use the same format but with the target
```bash
make verilate
```
If you run out of disk space during the Verilator model compilation, use
```bash
export OBJCACHE=''
```
to disable `ccache`. Keep in mind that this will make subsequent compilations slower, since compiled object files will no longer be cached.

If the tracer is enabled, its output traces are found under `hardware/build`, for both ModelSim and Verilator simulations.
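Putting the pieces together, an end-to-end Verilator run could look like the sketch below; the `app=` convention is carried over from the `benchmark` example above, and the exact sequence is an assumption:

```bash
export OBJCACHE=''             # optional: disable ccache if disk space is tight
app=hello_world make verilate  # build the Verilator model and run the simulation
ls hardware/build              # traces appear here when the tracer is enabled
```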

## Common Problems

- If building the GCC toolchain fails because *makeinfo/texinfo* is missing, try the following command:
127 changes: 65 additions & 62 deletions apps/common/kernel/convolution.h
@@ -27,29 +27,30 @@ void conv2d_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t k_x, uint32_t k_y,
int32_t volatile *__restrict__ out, uint32_t id,
uint32_t numThreads) {
int boundary_x = k_x / 2;
int boundary_y = k_y / 2;
int boundary_x = (int)(k_x / 2);
int boundary_y = (int)(k_y / 2);
// Now we only care about valid entries
while (id < boundary_x) {
while (id < (unsigned int)boundary_x) {
id += numThreads;
}
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < k_x * k_y; ++i) {
for (unsigned int i = 0; i < k_x * k_y; ++i) {
weight += k[i];
}
// TODO implement boundary halo
// Start at the boundary_x
for (int i = id; i < in_x - boundary_x; i += numThreads) {
for (int j = boundary_y; j < in_y - boundary_y; j++) {
for (int i = (int)id; i < (int)in_x - boundary_x; i += (int)numThreads) {
for (int j = boundary_y; j < (int)in_y - boundary_y; j++) {
sum = 0;
for (int m = -boundary_y; m < (int)(k_y - boundary_y); m++) {
for (int n = -boundary_x; n < (int)(k_x - boundary_x); n++) {
sum += in[(j + m) * in_x + (i + n)] *
k[(m + boundary_y) * k_x + (n + boundary_x)];
for (int m = -boundary_y; m < (int)k_y - boundary_y; m++) {
for (int n = -boundary_x; n < (int)k_x - boundary_x; n++) {
sum += in[(unsigned int)(j + m) * in_x + (unsigned int)(i + n)] *
(int)k[(unsigned int)(m + boundary_y) * k_x +
(unsigned int)(n + boundary_x)];
}
}
out[j * in_x + i] = sum / weight;
out[(unsigned int)j * in_x + (unsigned int)i] = sum / (int)weight;
}
}
}
@@ -59,24 +60,26 @@ void conv2d_shifted_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t k_x, uint32_t k_y,
int32_t volatile *__restrict__ out, uint32_t id,
uint32_t numThreads) {
uint32_t boundary_x = k_x / 2;
uint32_t boundary_y = k_y / 2;
int boundary_x = (int)(k_x / 2);
int boundary_y = (int)(k_y / 2);
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < k_x * k_y; ++i) {
for (unsigned int i = 0; i < k_x * k_y; ++i) {
weight += k[i];
}
// TODO implement boundary halo
// Now we only care about valid entries
for (uint32_t i = id; i < in_x - (2 * boundary_x); i += numThreads) {
for (uint32_t j = 0; j < in_y - (2 * boundary_y); j++) {
for (unsigned int i = id; i < in_x - (unsigned int)(2 * boundary_x);
i += numThreads) {
for (unsigned int j = 0; j < in_y - (unsigned int)(2 * boundary_y); j++) {
sum = 0;
for (uint32_t m = 0; m < k_y; m++) {
for (uint32_t n = 0; n < k_x; n++) {
sum += in[(j + m) * in_x + (i + n)] * k[m * k_x + n];
for (unsigned int m = 0; m < k_y; m++) {
for (unsigned int n = 0; n < k_x; n++) {
sum += in[(j + m) * in_x + (i + n)] * (int)k[m * k_x + n];
}
}
out[(j + boundary_y) * in_x + (i + boundary_x)] = sum / weight;
out[(j + (unsigned int)boundary_y) * in_x +
(i + (unsigned int)boundary_x)] = sum / (int)weight;
}
}
}
@@ -87,7 +90,7 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t id, uint32_t numThreads) {
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < 9; ++i) {
for (unsigned int i = 0; i < 9; ++i) {
weight += k[i];
}
// TODO implement boundary halo
@@ -109,16 +112,16 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
for (uint32_t i = start; i < end; ++i) {
for (uint32_t j = 1; j < in_y - 1; j++) {
sum = 0;
sum += in[(j - 1) * in_x + (i - 1)] * k[0];
sum += in[(j - 1) * in_x + (i + 0)] * k[1];
sum += in[(j - 1) * in_x + (i + 1)] * k[2];
sum += in[(j + 0) * in_x + (i - 1)] * k[3];
sum += in[(j + 0) * in_x + (i + 0)] * k[4];
sum += in[(j + 0) * in_x + (i + 1)] * k[5];
sum += in[(j + 1) * in_x + (i - 1)] * k[6];
sum += in[(j + 1) * in_x + (i + 0)] * k[7];
sum += in[(j + 1) * in_x + (i + 1)] * k[8];
out[j * in_x + i] = sum / weight;
sum += in[(j - 1) * in_x + (i - 1)] * (int)k[0];
sum += in[(j - 1) * in_x + (i + 0)] * (int)k[1];
sum += in[(j - 1) * in_x + (i + 1)] * (int)k[2];
sum += in[(j + 0) * in_x + (i - 1)] * (int)k[3];
sum += in[(j + 0) * in_x + (i + 0)] * (int)k[4];
sum += in[(j + 0) * in_x + (i + 1)] * (int)k[5];
sum += in[(j + 1) * in_x + (i - 1)] * (int)k[6];
sum += in[(j + 1) * in_x + (i + 0)] * (int)k[7];
sum += in[(j + 1) * in_x + (i + 1)] * (int)k[8];
out[j * in_x + i] = sum / (int)weight;
}
}
}
@@ -135,19 +138,19 @@ void conv2d_3x3_shifted_unrolled_parallel(int32_t const *__restrict__ in,
}
// TODO implement boundary halo
// Now we only care about valid entries
for (int i = id; i < in_x - 2; i += numThreads) {
for (int j = 0; j < in_y - 2; j++) {
for (unsigned int i = id; i < in_x - 2; i += numThreads) {
for (unsigned int j = 0; j < in_y - 2; j++) {
sum = 0;
sum += in[(j + 0) * in_x + (i + 0)] * k[0];
sum += in[(j + 0) * in_x + (i + 1)] * k[1];
sum += in[(j + 0) * in_x + (i + 2)] * k[2];
sum += in[(j + 1) * in_x + (i + 0)] * k[3];
sum += in[(j + 1) * in_x + (i + 1)] * k[4];
sum += in[(j + 1) * in_x + (i + 2)] * k[5];
sum += in[(j + 2) * in_x + (i + 0)] * k[6];
sum += in[(j + 2) * in_x + (i + 1)] * k[7];
sum += in[(j + 2) * in_x + (i + 2)] * k[8];
out[(j + 1) * in_x + (i + 1)] = sum / weight;
sum += in[(j + 0) * in_x + (i + 0)] * (int)k[0];
sum += in[(j + 0) * in_x + (i + 1)] * (int)k[1];
sum += in[(j + 0) * in_x + (i + 2)] * (int)k[2];
sum += in[(j + 1) * in_x + (i + 0)] * (int)k[3];
sum += in[(j + 1) * in_x + (i + 1)] * (int)k[4];
sum += in[(j + 1) * in_x + (i + 2)] * (int)k[5];
sum += in[(j + 2) * in_x + (i + 0)] * (int)k[6];
sum += in[(j + 2) * in_x + (i + 1)] * (int)k[7];
sum += in[(j + 2) * in_x + (i + 2)] * (int)k[8];
out[(j + 1) * in_x + (i + 1)] = sum / (int)weight;
}
}
}
@@ -158,15 +161,15 @@ void init_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
if (img_y > img_x) {
for (int i = id; i < img_y; i += numThreads) {
for (int j = 0; j < img_x; ++j) {
img[i * img_x + j] = (i % 16) + (j % 4);
for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
for (int j = 0; j < (int)img_x; ++j) {
img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
}
}
} else {
for (int j = id; j < img_x; j += numThreads) {
for (int i = 0; i < img_y; ++i) {
img[i * img_x + j] = (i % 16) + (j % 4);
for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
for (int i = 0; i < (int)img_y; ++i) {
img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
}
}
}
@@ -177,15 +180,15 @@ void zero_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
if (img_y > img_x) {
for (int i = id; i < img_y; i += numThreads) {
for (int j = 0; j < img_x; ++j) {
img[i * img_x + j] = 0;
for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
for (int j = 0; j < (int)img_x; ++j) {
img[(unsigned int)i * img_x + (unsigned int)j] = 0;
}
}
} else {
for (int j = id; j < img_x; j += numThreads) {
for (int i = 0; i < img_y; ++i) {
img[i * img_x + j] = 0;
for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
for (int i = 0; i < (int)img_y; ++i) {
img[(unsigned int)i * img_x + (unsigned int)j] = 0;
}
}
}
@@ -197,18 +200,18 @@ extern uint32_t barrier_init;
int verify_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
for (int i = id + 1; i < img_y - 1; i += numThreads) {
int32_t y = i % 16;
for (int i = (int)id + 1; i < (int)img_y - 1; i += (int)numThreads) {
int y = i % 16;
if (i % 16 == 0)
y = 4;
if (i % 16 == 15)
y = 11;
for (int32_t j = 1; j < img_x - 1; ++j) {
int32_t x = ((j % 4) / 2) + 1;
if (img[i * img_x + j] != x + y) {
return (i + j) == 0 ? -1 : i * img_x + j;
for (int j = 1; j < (int)img_x - 1; ++j) {
int x = ((j % 4) / 2) + 1;
if ((int)img[i * (int)img_x + j] != x + y) {
return (i + j) == 0 ? -1 : i * (int)img_x + j;
}
img[i * img_x + j] = 0;
img[i * (int)img_x + j] = 0;
}
}
return 0;
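For reference, these kernels compute a normalized 2-D convolution: each output pixel is the weighted sum of its neighborhood divided by the sum of the kernel weights. A minimal single-threaded sketch of the 3x3 case (the function name and signature are illustrative, not the parallel MemPool kernels above):

```c
#include <stdint.h>

// Reference 3x3 normalized convolution over the valid interior pixels:
// out[j][i] = sum_{m,n} in[j+m-1][i+n-1] * k[m][n] / sum(k)
void conv2d_3x3_reference(const int32_t *in, uint32_t in_x, uint32_t in_y,
                          const uint32_t *k, int32_t *out) {
  int32_t weight = 0;
  for (uint32_t i = 0; i < 9; ++i)
    weight += (int32_t)k[i];
  for (uint32_t j = 1; j + 1 < in_y; ++j) {
    for (uint32_t i = 1; i + 1 < in_x; ++i) {
      int32_t sum = 0;
      for (uint32_t m = 0; m < 3; ++m)
        for (uint32_t n = 0; n < 3; ++n)
          sum += in[(j + m - 1) * in_x + (i + n - 1)] * (int32_t)k[m * 3 + n];
      out[j * in_x + i] = sum / weight;
    }
  }
}
```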
1 change: 1 addition & 0 deletions apps/common/link.ld
@@ -25,6 +25,7 @@ SECTIONS {
.text : {
*(.text.init)
*(.text)
. = ALIGN(0x40);
} > l2

/* Data on L2 */
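The added `. = ALIGN(0x40);` pads the end of `.text` to a 64-byte boundary, matching the instruction-cache alignment mentioned in the changelog above. One way to inspect the result (the ELF name and toolchain prefix are assumptions):

```bash
# Print the size and address of .text in the linked binary.
riscv32-unknown-elf-objdump -h hello_world | grep -F '.text'
```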
8 changes: 6 additions & 2 deletions apps/common/runtime.mk
@@ -28,7 +28,7 @@ GCC_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-gcc
LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/llvm
HALIDE_INSTALL_DIR ?= $(INSTALL_DIR)/halide

COMPILER ?= llvm
COMPILER ?= gcc
XPULPIMG ?= $(xpulpimg)

RISCV_XLEN ?= 32
@@ -68,14 +68,18 @@ RISCV_STRIP ?= $(RISCV_PREFIX)strip

# Defines
DEFINES := -DNUM_CORES=$(num_cores) -DBOOT_ADDR=0x$(boot_addr) -DL2_BASE=0x$(l2_base) -DL2_SIZE=0x$(l2_size)
# Define __XPULPIMG if the extension is active
ifeq ($(XPULPIMG),1)
DEFINES += -D__XPULPIMG
endif

# Specify cross compilation target. This can be omitted if LLVM is built with riscv as default target
RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR)

RISCV_WARNINGS += -Wunused-variable -Wconversion -Wall -Wextra # -Werror
RISCV_FLAGS_COMMON_TESTS ?= -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -I$(CURDIR)/common -static
RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -std=gnu99 -O3 -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS)
RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS)
RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS) -falign-loops=32 -falign-jumps=32

RISCV_FLAGS_LLVM ?= -mcmodel=small -mllvm -enable-misched
ifeq ($(COMPILER),gcc)
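With the new defaults (`COMPILER ?= gcc`, `XPULPIMG ?= $(xpulpimg)`), both settings can still be overridden per invocation. A sketch, reusing the `benchmark` target shown in the README (the target name is carried over, not defined here):

```bash
app=hello_world COMPILER=llvm make benchmark  # build with LLVM instead of the GCC default
app=hello_world XPULPIMG=1 make benchmark     # force Xpulpimg support (adds -D__XPULPIMG)
```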
19 changes: 0 additions & 19 deletions apps/common/synchronization.c
@@ -16,8 +16,6 @@

// Author: Samuel Riedel, ETH Zurich

static inline unsigned amo_add(void volatile *const address, unsigned value);

#include <stdbool.h>
#include <stdint.h>

@@ -71,20 +69,3 @@ void mempool_barrier(uint32_t num_cores, uint32_t cycles) {
mempool_wait(cycles);
}
}

/**
* Expose the atomic add instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to add to the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_add(void volatile *const address, unsigned value) {
unsigned ret;
__asm__ __volatile__("" : : : "memory");
asm volatile("amoadd.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
__asm__ __volatile__("" : : : "memory");
return ret;
}
41 changes: 41 additions & 0 deletions apps/common/synchronization.h
@@ -16,6 +16,47 @@

// Author: Samuel Riedel, ETH Zurich

#ifndef __SYNCHRONIZATION_H__
#define __SYNCHRONIZATION_H__

// Barrier functions
void mempool_barrier_init(uint32_t core_id, uint32_t num_cores);
void mempool_barrier(uint32_t num_cores, uint32_t cycles);

// Atomics

/**
* Expose the atomic add instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to add to the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_add(void volatile *const address, unsigned value) {
unsigned ret;
asm volatile("" : : : "memory");
asm volatile("amoadd.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
asm volatile("" : : : "memory");
return ret;
}

/**
* Expose the atomic or instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to OR into the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_or(void volatile *const address, unsigned value) {
unsigned ret;
asm volatile("" : : : "memory");
asm volatile("amoor.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
asm volatile("" : : : "memory");
return ret;
}

#endif // __SYNCHRONIZATION_H__
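A small usage sketch for the atomics declared above; the shared counters and the call site are hypothetical, and only `amo_add` and `amo_or` come from this header:

```c
#include <stdint.h>
#include "synchronization.h"

// Hypothetical shared counters; in MemPool these would live in L2 memory.
static volatile uint32_t arrival_count = 0;
static volatile uint32_t error_mask = 0;

void report_status(uint32_t core_error_bit) {
  // amo_add returns the value stored before the addition.
  unsigned ticket = amo_add(&arrival_count, 1);
  // Merge this core's error bit into the shared bitmask.
  amo_or(&error_mask, core_error_bit);
  (void)ticket;
}
```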