
Merge branch 'xpulp-dev-matheusd' into 'master'
XPulpIMG MAC and SIMD instructions

See merge request mempool/mempool!78
suehtamacv authored and SamuelRiedel committed Jun 24, 2021
2 parents ca90bbd + 9570e75 commit 763346c
Showing 246 changed files with 10,884 additions and 423 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.d/lint.sh
@@ -34,11 +34,11 @@ EXIT_STATUS=0

# Only check C and C++ files for clang-format compatibility
echo "Checking C/C++ files for clang-format compliance"
clang_files=$(echo $files | tr ' ' '\n' | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
clang_files=$(echo "$files" | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
# Remove files from dependencies
clang_files=$(echo $clang_files | grep -vP "hardware/deps/")
clang_files=$(echo $clang_files | grep -vP "toolchain/")
clang_files=$(echo $clang_files | grep -vP "be/")
clang_files=$(echo "$clang_files" | grep -vP "hardware/deps/")
clang_files=$(echo "$clang_files" | grep -vP "toolchain/")
clang_files=$(echo "$clang_files" | grep -vP "be/")
for file in $clang_files; do
echo $file
./.gitlab-ci.d/run_clang_format.py \
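The quoting change above is what keeps the file list newline-separated when it is piped to `grep`: an unquoted `$files` is word-split by the shell, so its newlines are collapsed into spaces. A quick illustration (not from the repository):

```bash
files=$'main.c\nscript.ld'
echo $files    # word splitting: prints "main.c script.ld" on a single line
echo "$files"  # quoted: prints one file per line, as the grep filters expect
```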
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## Unreleased

## 0.3.0 - 2021-03-31

### Added
- Toolchain and hardware support for Xpulp instructions:
  - Post-incrementing and register-register loads and stores (`pv.lb[u]`, `pv.lh[u]`, `pv.lw`)
  - 32-bit multiply-accumulate instructions (`pv.mac`, `pv.msu`)
  - Arithmetic SIMD instructions (`pv.{add, sub, abs, avg, avgu, min, minu, max, maxu, srl, sra, sll, or, xor, and, dotsp, dotup, dotusp, sdotsp, sdotup, sdotusp}.{h, b}`)
  - Sub-word manipulation SIMD instructions (`pv.{extract, extractu, insert, shuffle2}.{h, b}`)

### Fixed
- Disable the branch prediction if there are multiple early-hits
- Align end of `.text` section with the instruction cache
- Observe the code style guidelines in the matrix multiplication and convolution kernels

### Changed
- Clean-up the pedantic compilation warnings of the matrix multiplication and convolution kernels

## 0.2.0 - 2021-03-29

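As a rough sketch of how the Xpulpimg MAC and SIMD instructions listed above can be reached from C once `__XPULPIMG` is defined (see `runtime.mk` below): the helper names, operand order, and inline-assembly constraints are assumptions for illustration, not code from this commit.

```c
#include <stdint.h>

// Sketch: 32-bit multiply-accumulate via pv.mac, with a plain-C fallback.
// The operand order (acc += a * b) is assumed.
static inline int32_t mac32(int32_t acc, int32_t a, int32_t b) {
#ifdef __XPULPIMG
  asm volatile("pv.mac %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
  return acc;
#else
  return acc + a * b;
#endif
}

// Sketch: element-wise addition of two packed 16-bit halfwords via pv.add.h.
static inline uint32_t add2x16(uint32_t x, uint32_t y) {
#ifdef __XPULPIMG
  uint32_t z;
  asm volatile("pv.add.h %0, %1, %2" : "=r"(z) : "r"(x), "r"(y));
  return z;
#else
  return ((x + y) & 0xFFFFu) | (((x >> 16) + (y >> 16)) << 16);
#endif
}
```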
12 changes: 12 additions & 0 deletions README.md
@@ -111,6 +111,18 @@ app=hello_world make benchmark

You can set up the configuration of the system in the file `config/config.mk`, which controls the total number of cores, the number of cores per tile, and whether the Xpulpimg extension is enabled in the Snitch core; the `xpulpimg` parameter also controls the default core architecture used when compiling applications for MemPool.
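For orientation, a minimal sketch of what such a configuration can look like; only `num_cores` and `xpulpimg` are visible in this commit (in `runtime.mk`), so the remaining names and all values are assumptions:

```make
# Hypothetical excerpt of config/config.mk -- names and values are illustrative.
num_cores          ?= 256  # total number of cores
num_cores_per_tile ?= 4    # cores per tile
xpulpimg           ?= 1    # enable the Xpulpimg extension in the Snitch cores
```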

To simulate the MemPool system with Verilator, use the same format but with the target
```bash
make verilate
```
If you run out of disk space during the Verilator model compilation, use
```bash
export OBJCACHE=''
```
to disable `ccache`. Keep in mind that this will make subsequent compilations slower, since compiled object files will no longer be cached.

If the tracer is enabled, its output traces are found under `hardware/build`, for both ModelSim and Verilator simulations.
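Putting the pieces together, an end-to-end Verilator run could look like the sketch below; the `app=` convention is carried over from the `benchmark` example above, and the exact sequence is an assumption:

```bash
export OBJCACHE=''             # optional: disable ccache if disk space is tight
app=hello_world make verilate  # build the Verilator model and run the simulation
ls hardware/build              # traces appear here when the tracer is enabled
```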

## Common Problems

- If building the GCC toolchain fails because *makeinfo/texinfo* is missing, try the following command:
127 changes: 65 additions & 62 deletions apps/common/kernel/convolution.h
@@ -27,29 +27,30 @@ void conv2d_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t k_x, uint32_t k_y,
int32_t volatile *__restrict__ out, uint32_t id,
uint32_t numThreads) {
int boundary_x = k_x / 2;
int boundary_y = k_y / 2;
int boundary_x = (int)(k_x / 2);
int boundary_y = (int)(k_y / 2);
// Now we only care about valid entries
while (id < boundary_x) {
while (id < (unsigned int)boundary_x) {
id += numThreads;
}
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < k_x * k_y; ++i) {
for (unsigned int i = 0; i < k_x * k_y; ++i) {
weight += k[i];
}
// TODO implement boundary halo
// Start at the boundary_x
for (int i = id; i < in_x - boundary_x; i += numThreads) {
for (int j = boundary_y; j < in_y - boundary_y; j++) {
for (int i = (int)id; i < (int)in_x - boundary_x; i += (int)numThreads) {
for (int j = boundary_y; j < (int)in_y - boundary_y; j++) {
sum = 0;
for (int m = -boundary_y; m < (int)(k_y - boundary_y); m++) {
for (int n = -boundary_x; n < (int)(k_x - boundary_x); n++) {
sum += in[(j + m) * in_x + (i + n)] *
k[(m + boundary_y) * k_x + (n + boundary_x)];
for (int m = -boundary_y; m < (int)k_y - boundary_y; m++) {
for (int n = -boundary_x; n < (int)k_x - boundary_x; n++) {
sum += in[(unsigned int)(j + m) * in_x + (unsigned int)(i + n)] *
(int)k[(unsigned int)(m + boundary_y) * k_x +
(unsigned int)(n + boundary_x)];
}
}
out[j * in_x + i] = sum / weight;
out[(unsigned int)j * in_x + (unsigned int)i] = sum / (int)weight;
}
}
}
@@ -59,24 +60,26 @@ void conv2d_shifted_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t k_x, uint32_t k_y,
int32_t volatile *__restrict__ out, uint32_t id,
uint32_t numThreads) {
uint32_t boundary_x = k_x / 2;
uint32_t boundary_y = k_y / 2;
int boundary_x = (int)(k_x / 2);
int boundary_y = (int)(k_y / 2);
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < k_x * k_y; ++i) {
for (unsigned int i = 0; i < k_x * k_y; ++i) {
weight += k[i];
}
// TODO implement boundary halo
// Now we only care about valid entries
for (uint32_t i = id; i < in_x - (2 * boundary_x); i += numThreads) {
for (uint32_t j = 0; j < in_y - (2 * boundary_y); j++) {
for (unsigned int i = id; i < in_x - (unsigned int)(2 * boundary_x);
i += numThreads) {
for (unsigned int j = 0; j < in_y - (unsigned int)(2 * boundary_y); j++) {
sum = 0;
for (uint32_t m = 0; m < k_y; m++) {
for (uint32_t n = 0; n < k_x; n++) {
sum += in[(j + m) * in_x + (i + n)] * k[m * k_x + n];
for (unsigned int m = 0; m < k_y; m++) {
for (unsigned int n = 0; n < k_x; n++) {
sum += in[(j + m) * in_x + (i + n)] * (int)k[m * k_x + n];
}
}
out[(j + boundary_y) * in_x + (i + boundary_x)] = sum / weight;
out[(j + (unsigned int)boundary_y) * in_x +
(i + (unsigned int)boundary_x)] = sum / (int)weight;
}
}
}
@@ -87,7 +90,7 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
uint32_t id, uint32_t numThreads) {
int32_t sum;
uint32_t weight = 0;
for (int i = 0; i < 9; ++i) {
for (unsigned int i = 0; i < 9; ++i) {
weight += k[i];
}
// TODO implement boundary halo
@@ -109,16 +112,16 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
for (uint32_t i = start; i < end; ++i) {
for (uint32_t j = 1; j < in_y - 1; j++) {
sum = 0;
sum += in[(j - 1) * in_x + (i - 1)] * k[0];
sum += in[(j - 1) * in_x + (i + 0)] * k[1];
sum += in[(j - 1) * in_x + (i + 1)] * k[2];
sum += in[(j + 0) * in_x + (i - 1)] * k[3];
sum += in[(j + 0) * in_x + (i + 0)] * k[4];
sum += in[(j + 0) * in_x + (i + 1)] * k[5];
sum += in[(j + 1) * in_x + (i - 1)] * k[6];
sum += in[(j + 1) * in_x + (i + 0)] * k[7];
sum += in[(j + 1) * in_x + (i + 1)] * k[8];
out[j * in_x + i] = sum / weight;
sum += in[(j - 1) * in_x + (i - 1)] * (int)k[0];
sum += in[(j - 1) * in_x + (i + 0)] * (int)k[1];
sum += in[(j - 1) * in_x + (i + 1)] * (int)k[2];
sum += in[(j + 0) * in_x + (i - 1)] * (int)k[3];
sum += in[(j + 0) * in_x + (i + 0)] * (int)k[4];
sum += in[(j + 0) * in_x + (i + 1)] * (int)k[5];
sum += in[(j + 1) * in_x + (i - 1)] * (int)k[6];
sum += in[(j + 1) * in_x + (i + 0)] * (int)k[7];
sum += in[(j + 1) * in_x + (i + 1)] * (int)k[8];
out[j * in_x + i] = sum / (int)weight;
}
}
}
@@ -135,19 +138,19 @@ void conv2d_3x3_shifted_unrolled_parallel(int32_t const *__restrict__ in,
}
// TODO implement boundary halo
// Now we only care about valid entries
for (int i = id; i < in_x - 2; i += numThreads) {
for (int j = 0; j < in_y - 2; j++) {
for (unsigned int i = id; i < in_x - 2; i += numThreads) {
for (unsigned int j = 0; j < in_y - 2; j++) {
sum = 0;
sum += in[(j + 0) * in_x + (i + 0)] * k[0];
sum += in[(j + 0) * in_x + (i + 1)] * k[1];
sum += in[(j + 0) * in_x + (i + 2)] * k[2];
sum += in[(j + 1) * in_x + (i + 0)] * k[3];
sum += in[(j + 1) * in_x + (i + 1)] * k[4];
sum += in[(j + 1) * in_x + (i + 2)] * k[5];
sum += in[(j + 2) * in_x + (i + 0)] * k[6];
sum += in[(j + 2) * in_x + (i + 1)] * k[7];
sum += in[(j + 2) * in_x + (i + 2)] * k[8];
out[(j + 1) * in_x + (i + 1)] = sum / weight;
sum += in[(j + 0) * in_x + (i + 0)] * (int)k[0];
sum += in[(j + 0) * in_x + (i + 1)] * (int)k[1];
sum += in[(j + 0) * in_x + (i + 2)] * (int)k[2];
sum += in[(j + 1) * in_x + (i + 0)] * (int)k[3];
sum += in[(j + 1) * in_x + (i + 1)] * (int)k[4];
sum += in[(j + 1) * in_x + (i + 2)] * (int)k[5];
sum += in[(j + 2) * in_x + (i + 0)] * (int)k[6];
sum += in[(j + 2) * in_x + (i + 1)] * (int)k[7];
sum += in[(j + 2) * in_x + (i + 2)] * (int)k[8];
out[(j + 1) * in_x + (i + 1)] = sum / (int)weight;
}
}
}
@@ -158,15 +161,15 @@ void init_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
if (img_y > img_x) {
for (int i = id; i < img_y; i += numThreads) {
for (int j = 0; j < img_x; ++j) {
img[i * img_x + j] = (i % 16) + (j % 4);
for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
for (int j = 0; j < (int)img_x; ++j) {
img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
}
}
} else {
for (int j = id; j < img_x; j += numThreads) {
for (int i = 0; i < img_y; ++i) {
img[i * img_x + j] = (i % 16) + (j % 4);
for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
for (int i = 0; i < (int)img_y; ++i) {
img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
}
}
}
@@ -177,15 +180,15 @@ void zero_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
if (img_y > img_x) {
for (int i = id; i < img_y; i += numThreads) {
for (int j = 0; j < img_x; ++j) {
img[i * img_x + j] = 0;
for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
for (int j = 0; j < (int)img_x; ++j) {
img[(unsigned int)i * img_x + (unsigned int)j] = 0;
}
}
} else {
for (int j = id; j < img_x; j += numThreads) {
for (int i = 0; i < img_y; ++i) {
img[i * img_x + j] = 0;
for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
for (int i = 0; i < (int)img_y; ++i) {
img[(unsigned int)i * img_x + (unsigned int)j] = 0;
}
}
}
@@ -197,18 +200,18 @@ extern uint32_t barrier_init;
int verify_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
uint32_t id, uint32_t numThreads) {
// Parallelize over rows
for (int i = id + 1; i < img_y - 1; i += numThreads) {
int32_t y = i % 16;
for (int i = (int)id + 1; i < (int)img_y - 1; i += (int)numThreads) {
int y = i % 16;
if (i % 16 == 0)
y = 4;
if (i % 16 == 15)
y = 11;
for (int32_t j = 1; j < img_x - 1; ++j) {
int32_t x = ((j % 4) / 2) + 1;
if (img[i * img_x + j] != x + y) {
return (i + j) == 0 ? -1 : i * img_x + j;
for (int j = 1; j < (int)img_x - 1; ++j) {
int x = ((j % 4) / 2) + 1;
if ((int)img[i * (int)img_x + j] != x + y) {
return (i + j) == 0 ? -1 : i * (int)img_x + j;
}
img[i * img_x + j] = 0;
img[i * (int)img_x + j] = 0;
}
}
return 0;
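For reference, these kernels compute a normalized 2-D convolution: each output pixel is the weighted sum of its neighborhood divided by the sum of the kernel weights. A minimal single-threaded sketch of the 3x3 case (the function name and signature are illustrative, not the parallel MemPool kernels above):

```c
#include <stdint.h>

// Reference 3x3 normalized convolution over the valid interior pixels:
// out[j][i] = sum_{m,n} in[j+m-1][i+n-1] * k[m][n] / sum(k)
void conv2d_3x3_reference(const int32_t *in, uint32_t in_x, uint32_t in_y,
                          const uint32_t *k, int32_t *out) {
  int32_t weight = 0;
  for (uint32_t i = 0; i < 9; ++i)
    weight += (int32_t)k[i];
  for (uint32_t j = 1; j + 1 < in_y; ++j) {
    for (uint32_t i = 1; i + 1 < in_x; ++i) {
      int32_t sum = 0;
      for (uint32_t m = 0; m < 3; ++m)
        for (uint32_t n = 0; n < 3; ++n)
          sum += in[(j + m - 1) * in_x + (i + n - 1)] * (int32_t)k[m * 3 + n];
      out[j * in_x + i] = sum / weight;
    }
  }
}
```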
1 change: 1 addition & 0 deletions apps/common/link.ld
@@ -25,6 +25,7 @@ SECTIONS {
.text : {
*(.text.init)
*(.text)
. = ALIGN(0x40);
} > l2

/* Data on L2 */
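The added `. = ALIGN(0x40);` pads the end of `.text` to a 64-byte boundary, matching the instruction-cache alignment mentioned in the changelog above. One way to inspect the result (the ELF name and toolchain prefix are assumptions):

```bash
# Print the size and address of .text in the linked binary.
riscv32-unknown-elf-objdump -h hello_world | grep -F '.text'
```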
8 changes: 6 additions & 2 deletions apps/common/runtime.mk
@@ -28,7 +28,7 @@ GCC_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-gcc
LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/llvm
HALIDE_INSTALL_DIR ?= $(INSTALL_DIR)/halide

COMPILER ?= llvm
COMPILER ?= gcc
XPULPIMG ?= $(xpulpimg)

RISCV_XLEN ?= 32
@@ -68,14 +68,18 @@ RISCV_STRIP ?= $(RISCV_PREFIX)strip

# Defines
DEFINES := -DNUM_CORES=$(num_cores) -DBOOT_ADDR=0x$(boot_addr) -DL2_BASE=0x$(l2_base) -DL2_SIZE=0x$(l2_size)
# Define __XPULPIMG if the extension is active
ifeq ($(XPULPIMG),1)
DEFINES += -D__XPULPIMG
endif

# Specify cross compilation target. This can be omitted if LLVM is built with riscv as default target
RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR)

RISCV_WARNINGS += -Wunused-variable -Wconversion -Wall -Wextra # -Werror
RISCV_FLAGS_COMMON_TESTS ?= -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -I$(CURDIR)/common -static
RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -std=gnu99 -O3 -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS)
RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS)
RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS) -falign-loops=32 -falign-jumps=32

RISCV_FLAGS_LLVM ?= -mcmodel=small -mllvm -enable-misched
ifeq ($(COMPILER),gcc)
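With the new defaults (`COMPILER ?= gcc`, `XPULPIMG ?= $(xpulpimg)`), both settings can still be overridden per invocation. A sketch, reusing the `benchmark` target shown in the README (the target name is carried over, not defined here):

```bash
app=hello_world COMPILER=llvm make benchmark  # build with LLVM instead of the GCC default
app=hello_world XPULPIMG=1 make benchmark     # force Xpulpimg support (adds -D__XPULPIMG)
```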
19 changes: 0 additions & 19 deletions apps/common/synchronization.c
@@ -16,8 +16,6 @@

// Author: Samuel Riedel, ETH Zurich

static inline unsigned amo_add(void volatile *const address, unsigned value);

#include <stdbool.h>
#include <stdint.h>

@@ -71,20 +69,3 @@ void mempool_barrier(uint32_t num_cores, uint32_t cycles) {
mempool_wait(cycles);
}
}

/**
* Expose the atomic add instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to add to the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_add(void volatile *const address, unsigned value) {
unsigned ret;
__asm__ __volatile__("" : : : "memory");
asm volatile("amoadd.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
__asm__ __volatile__("" : : : "memory");
return ret;
}
41 changes: 41 additions & 0 deletions apps/common/synchronization.h
@@ -16,6 +16,47 @@

// Author: Samuel Riedel, ETH Zurich

#ifndef __SYNCHRONIZATION_H__
#define __SYNCHRONIZATION_H__

// Barrier functions
void mempool_barrier_init(uint32_t core_id, uint32_t num_cores);
void mempool_barrier(uint32_t num_cores, uint32_t cycles);

// Atomics

/**
* Expose the atomic add instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to add to the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_add(void volatile *const address, unsigned value) {
unsigned ret;
asm volatile("" : : : "memory");
asm volatile("amoadd.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
asm volatile("" : : : "memory");
return ret;
}

/**
* Expose the atomic or instruction.
*
* @param address A pointer to an address on L2 memory to store the value.
* @param value Value to OR into the specified memory location.
*
* @return Value previously stored in memory.
*/
static inline unsigned amo_or(void volatile *const address, unsigned value) {
unsigned ret;
asm volatile("" : : : "memory");
asm volatile("amoor.w %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
asm volatile("" : : : "memory");
return ret;
}

#endif // __SYNCHRONIZATION_H__
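A small usage sketch for the atomics declared above; the shared counters and the call site are hypothetical, and only `amo_add` and `amo_or` come from this header:

```c
#include <stdint.h>
#include "synchronization.h"

// Hypothetical shared counters; in MemPool these would live in L2 memory.
static volatile uint32_t arrival_count = 0;
static volatile uint32_t error_mask = 0;

void report_status(uint32_t core_error_bit) {
  // amo_add returns the value stored before the addition.
  unsigned ticket = amo_add(&arrival_count, 1);
  // Merge this core's error bit into the shared bitmask.
  amo_or(&error_mask, core_error_bit);
  (void)ticket;
}
```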