diff --git a/.gitlab-ci.d/lint.sh b/.gitlab-ci.d/lint.sh
index 17b2d35a8..5202e33cd 100755
--- a/.gitlab-ci.d/lint.sh
+++ b/.gitlab-ci.d/lint.sh
@@ -34,11 +34,11 @@ EXIT_STATUS=0
 
 # Only check C and C++ files for clang-format compatibility
 echo "Checking C/C++ files for clang-format compliance"
-clang_files=$(echo $files | tr ' ' '\n' | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
+clang_files=$(echo "$files" | grep -P "(?<!\.ld)\.(h|c|cpp)\b")
 # Remove files from dependencies
-clang_files=$(echo $clang_files | grep -vP "hardware/deps/")
-clang_files=$(echo $clang_files | grep -vP "toolchain/")
-clang_files=$(echo $clang_files | grep -vP "be/")
+clang_files=$(echo "$clang_files" | grep -vP "hardware/deps/")
+clang_files=$(echo "$clang_files" | grep -vP "toolchain/")
+clang_files=$(echo "$clang_files" | grep -vP "be/")
 for file in $clang_files; do
   echo $file
   ./.gitlab-ci.d/run_clang_format.py \
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1095a1e30..adffc5144 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## Unreleased
 
+## 0.3.0 - 2021-03-31
+
+### Added
+- Toolchain and hardware support for Xpulp instructions:
+  - Post-incrementing and register-register loads and stores (`pv.lb[u]`, `pv.lh[u]`, `pv.lw`)
+  - 32-bit multiply-accumulate instructions (`pv.mac`, `pv.msu`)
+  - Arithmetic SIMD instructions (`pv.{add, sub, abs, avg, avgu, min, minu, max, maxu, srl, sra, sll, or, xor, and, dotsp, dotup, dotusp, sdotsp, sdotup, sdotusp}.{h, b}`)
+  - Sub-word manipulation SIMD instructions (`pv.{extract, extractu, insert, shuffle2}.{h, b}`)
+
+### Fixed
+- Disable the branch prediction if there are multiple early-hits
+- Align end of `.text` section with the instruction cache
+- Observe the code style guidelines in the matrix multiplication and convolution kernels
+
+### Changed
+- Clean up the pedantic compilation warnings of the matrix multiplication and convolution kernels
 
 ## 0.2.0 - 2021-03-29
 
diff --git a/README.md b/README.md
index ca4a8416b..fde0f29e0 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,18 @@ app=hello_world make benchmark
 
 You can set up the configuration of the system in the file `config/config.mk`, controlling the total number of cores, the number of cores per tile and whether the Xpulpimg extension is enabled or not in the Snitch core; the `xpulpimg` parameter also control the default core architecture considered when compiling applications for MemPool.
 
+To simulate the MemPool system with Verilator, use the same format but with the `verilate` target:
+```bash
+make verilate
+```
+If, during the Verilator model compilation, you run out of space on your disk, use
+```bash
+export OBJCACHE=''
+```
+to disable the use of `ccache`. Keep in mind that this will make the following compilations slower, since compiled object files will no longer be cached.
+
+If the tracer is enabled, its output traces are found under `hardware/build`, for both ModelSim and Verilator simulations.
+
 ## Common Problems
 
 - If building the GCC toolchain fails because *makeinfo/texinfo* is missing, try the following command:
diff --git a/apps/common/kernel/convolution.h b/apps/common/kernel/convolution.h
index d8ec60969..743ff3387 100644
--- a/apps/common/kernel/convolution.h
+++ b/apps/common/kernel/convolution.h
@@ -27,29 +27,30 @@ void conv2d_parallel(int32_t const *__restrict__ in, uint32_t in_x,
                      uint32_t k_x, uint32_t k_y,
                      int32_t volatile *__restrict__ out, uint32_t id,
                      uint32_t numThreads) {
-  int boundary_x = k_x / 2;
-  int boundary_y = k_y / 2;
+  int boundary_x = (int)(k_x / 2);
+  int boundary_y = (int)(k_y / 2);
   // Now we only care about valid entries
-  while (id < boundary_x) {
+  while (id < (unsigned int)boundary_x) {
     id += numThreads;
   }
   int32_t sum;
   uint32_t weight = 0;
-  for (int i = 0; i < k_x * k_y; ++i) {
+  for (unsigned int i = 0; i < k_x * k_y; ++i) {
     weight += k[i];
   }
   // TODO implement boundary halo
   // Start at the boundary_x
-  for (int i = id; i < in_x - boundary_x; i += numThreads) {
-    for (int j = boundary_y; j < in_y - boundary_y; j++) {
+  for (int i = (int)id; i < (int)in_x - boundary_x; i += (int)numThreads) {
+    for (int j = boundary_y; j < (int)in_y - boundary_y; j++) {
       sum = 0;
-      for (int m = -boundary_y; m < (int)(k_y - boundary_y); m++) {
-        for (int n = -boundary_x; n < (int)(k_x - boundary_x); n++) {
-          sum += in[(j + m) * in_x + (i + n)] *
-                 k[(m + boundary_y) * k_x + (n + boundary_x)];
+      for (int m = -boundary_y; m < (int)k_y - boundary_y; m++) {
+        for (int n = -boundary_x; n < (int)k_x - boundary_x; n++) {
+          sum += in[(unsigned int)(j + m) * in_x + (unsigned int)(i + n)] *
+                 (int)k[(unsigned int)(m + boundary_y) * k_x +
+                        (unsigned int)(n + boundary_x)];
         }
       }
-      out[j * in_x + i] = sum / weight;
+      out[(unsigned int)j * in_x + (unsigned int)i] = sum / (int)weight;
     }
   }
 }
@@ -59,24 +60,26 @@ void conv2d_shifted_parallel(int32_t const *__restrict__ in, uint32_t in_x,
                              uint32_t k_x, uint32_t k_y,
                              int32_t volatile *__restrict__ out, uint32_t id,
                              uint32_t numThreads) {
-  uint32_t boundary_x = k_x / 2;
-  uint32_t boundary_y = k_y / 2;
+  int boundary_x = (int)(k_x / 2);
+  int boundary_y = (int)(k_y / 2);
   int32_t sum;
   uint32_t weight = 0;
-  for (int i = 0; i < k_x * k_y; ++i) {
+  for (unsigned int i = 0; i < k_x * k_y; ++i) {
     weight += k[i];
   }
   // TODO implement boundary halo
   // Now we only care about valid entries
-  for (uint32_t i = id; i < in_x - (2 * boundary_x); i += numThreads) {
-    for (uint32_t j = 0; j < in_y - (2 * boundary_y); j++) {
+  for (unsigned int i = id; i < in_x - (unsigned int)(2 * boundary_x);
+       i += numThreads) {
+    for (unsigned int j = 0; j < in_y - (unsigned int)(2 * boundary_y); j++) {
       sum = 0;
-      for (uint32_t m = 0; m < k_y; m++) {
-        for (uint32_t n = 0; n < k_x; n++) {
-          sum += in[(j + m) * in_x + (i + n)] * k[m * k_x + n];
+      for (unsigned int m = 0; m < k_y; m++) {
+        for (unsigned int n = 0; n < k_x; n++) {
+          sum += in[(j + m) * in_x + (i + n)] * (int)k[m * k_x + n];
         }
       }
-      out[(j + boundary_y) * in_x + (i + boundary_x)] = sum / weight;
+      out[(j + (unsigned int)boundary_y) * in_x +
+          (i + (unsigned int)boundary_x)] = sum / (int)weight;
     }
   }
 }
@@ -87,7 +90,7 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
                                   uint32_t id, uint32_t numThreads) {
   int32_t sum;
   uint32_t weight = 0;
-  for (int i = 0; i < 9; ++i) {
+  for (unsigned int i = 0; i < 9; ++i) {
     weight += k[i];
   }
   // TODO implement boundary halo
@@ -109,16 +112,16 @@ void conv2d_3x3_unrolled_parallel(int32_t const *__restrict__ in, uint32_t in_x,
   for (uint32_t i = start; i < end; ++i) {
     for (uint32_t j = 1; j < in_y - 1; j++) {
       sum = 0;
-      sum += in[(j - 1) * in_x + (i - 1)] * k[0];
-      sum += in[(j - 1) * in_x + (i + 0)] * k[1];
-      sum += in[(j - 1) * in_x + (i + 1)] * k[2];
-      sum += in[(j + 0) * in_x + (i - 1)] * k[3];
-      sum += in[(j + 0) * in_x + (i + 0)] * k[4];
-      sum += in[(j + 0) * in_x + (i + 1)] * k[5];
-      sum += in[(j + 1) * in_x + (i - 1)] * k[6];
-      sum += in[(j + 1) * in_x + (i + 0)] * k[7];
-      sum += in[(j + 1) * in_x + (i + 1)] * k[8];
-      out[j * in_x + i] = sum / weight;
+      sum += in[(j - 1) * in_x + (i - 1)] * (int)k[0];
+      sum += in[(j - 1) * in_x + (i + 0)] * (int)k[1];
+      sum += in[(j - 1) * in_x + (i + 1)] * (int)k[2];
+      sum += in[(j + 0) * in_x + (i - 1)] * (int)k[3];
+      sum += in[(j + 0) * in_x + (i + 0)] * (int)k[4];
+      sum += in[(j + 0) * in_x + (i + 1)] * (int)k[5];
+      sum += in[(j + 1) * in_x + (i - 1)] * (int)k[6];
+      sum += in[(j + 1) * in_x + (i + 0)] * (int)k[7];
+      sum += in[(j + 1) * in_x + (i + 1)] * (int)k[8];
+      out[j * in_x + i] = sum / (int)weight;
     }
   }
 }
@@ -135,19 +138,19 @@ void conv2d_3x3_shifted_unrolled_parallel(int32_t const *__restrict__ in,
   }
   // TODO implement boundary halo
   // Now we only care about valid entries
-  for (int i = id; i < in_x - 2; i += numThreads) {
-    for (int j = 0; j < in_y - 2; j++) {
+  for (unsigned int i = id; i < in_x - 2; i += numThreads) {
+    for (unsigned int j = 0; j < in_y - 2; j++) {
       sum = 0;
-      sum += in[(j + 0) * in_x + (i + 0)] * k[0];
-      sum += in[(j + 0) * in_x + (i + 1)] * k[1];
-      sum += in[(j + 0) * in_x + (i + 2)] * k[2];
-      sum += in[(j + 1) * in_x + (i + 0)] * k[3];
-      sum += in[(j + 1) * in_x + (i + 1)] * k[4];
-      sum += in[(j + 1) * in_x + (i + 2)] * k[5];
-      sum += in[(j + 2) * in_x + (i + 0)] * k[6];
-      sum += in[(j + 2) * in_x + (i + 1)] * k[7];
-      sum += in[(j + 2) * in_x + (i + 2)] * k[8];
-      out[(j + 1) * in_x + (i + 1)] = sum / weight;
+      sum += in[(j + 0) * in_x + (i + 0)] * (int)k[0];
+      sum += in[(j + 0) * in_x + (i + 1)] * (int)k[1];
+      sum += in[(j + 0) * in_x + (i + 2)] * (int)k[2];
+      sum += in[(j + 1) * in_x + (i + 0)] * (int)k[3];
+      sum += in[(j + 1) * in_x + (i + 1)] * (int)k[4];
+      sum += in[(j + 1) * in_x + (i + 2)] * (int)k[5];
+      sum += in[(j + 2) * in_x + (i + 0)] * (int)k[6];
+      sum += in[(j + 2) * in_x + (i + 1)] * (int)k[7];
+      sum += in[(j + 2) * in_x + (i + 2)] * (int)k[8];
+      out[(j + 1) * in_x + (i + 1)] = sum / (int)weight;
     }
   }
 }
@@ -158,15 +161,15 @@ void init_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
                        uint32_t id, uint32_t numThreads) {
   // Parallelize over rows
   if (img_y > img_x) {
-    for (int i = id; i < img_y; i += numThreads) {
-      for (int j = 0; j < img_x; ++j) {
-        img[i * img_x + j] = (i % 16) + (j % 4);
+    for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
+      for (int j = 0; j < (int)img_x; ++j) {
+        img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
       }
     }
   } else {
-    for (int j = id; j < img_x; j += numThreads) {
-      for (int i = 0; i < img_y; ++i) {
-        img[i * img_x + j] = (i % 16) + (j % 4);
+    for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
+      for (int i = 0; i < (int)img_y; ++i) {
+        img[(unsigned int)i * img_x + (unsigned int)j] = (i % 16) + (j % 4);
       }
     }
   }
@@ -177,15 +180,15 @@ void zero_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
                        uint32_t id, uint32_t numThreads) {
   // Parallelize over rows
   if (img_y > img_x) {
-    for (int i = id; i < img_y; i += numThreads) {
-      for (int j = 0; j < img_x; ++j) {
-        img[i * img_x + j] = 0;
+    for (int i = (int)id; i < (int)img_y; i += (int)numThreads) {
+      for (int j = 0; j < (int)img_x; ++j) {
+        img[(unsigned int)i * img_x + (unsigned int)j] = 0;
       }
     }
   } else {
-    for (int j = id; j < img_x; j += numThreads) {
-      for (int i = 0; i < img_y; ++i) {
-        img[i * img_x + j] = 0;
+    for (int j = (int)id; j < (int)img_x; j += (int)numThreads) {
+      for (int i = 0; i < (int)img_y; ++i) {
+        img[(unsigned int)i * img_x + (unsigned int)j] = 0;
       }
     }
   }
@@ -197,18 +200,18 @@ extern uint32_t barrier_init;
 int verify_conv2d_image(volatile int32_t *img, uint32_t img_x, uint32_t img_y,
                         uint32_t id, uint32_t numThreads) {
   // Parallelize over rows
-  for (int i = id + 1; i < img_y - 1; i += numThreads) {
-    int32_t y = i % 16;
+  for (int i = (int)id + 1; i < (int)img_y - 1; i += (int)numThreads) {
+    int y = i % 16;
     if (i % 16 == 0)
       y = 4;
     if (i % 16 == 15)
       y = 11;
-    for (int32_t j = 1; j < img_x - 1; ++j) {
-      int32_t x = ((j % 4) / 2) + 1;
-      if (img[i * img_x + j] != x + y) {
-        return (i + j) == 0 ? -1 : i * img_x + j;
+    for (int j = 1; j < (int)img_x - 1; ++j) {
+      int x = ((j % 4) / 2) + 1;
+      if ((int)img[i * (int)img_x + j] != x + y) {
+        return (i + j) == 0 ? -1 : i * (int)img_x + j;
       }
-      img[i * img_x + j] = 0;
+      img[i * (int)img_x + j] = 0;
     }
   }
   return 0;
diff --git a/apps/common/link.ld b/apps/common/link.ld
index 169194115..b51601744 100644
--- a/apps/common/link.ld
+++ b/apps/common/link.ld
@@ -25,6 +25,7 @@ SECTIONS {
   .text : {
     *(.text.init)
     *(.text)
+    . = ALIGN(0x40);
   } > l2
 
   /* Data on L2 */
diff --git a/apps/common/runtime.mk b/apps/common/runtime.mk
index 07e8b96a5..00697da4c 100644
--- a/apps/common/runtime.mk
+++ b/apps/common/runtime.mk
@@ -28,7 +28,7 @@ GCC_INSTALL_DIR    ?= $(INSTALL_DIR)/riscv-gcc
 LLVM_INSTALL_DIR   ?= $(INSTALL_DIR)/llvm
 HALIDE_INSTALL_DIR ?= $(INSTALL_DIR)/halide
 
-COMPILER      ?= llvm
+COMPILER      ?= gcc
 XPULPIMG      ?= $(xpulpimg)
 
 RISCV_XLEN    ?= 32
@@ -68,6 +68,10 @@ RISCV_STRIP   ?= $(RISCV_PREFIX)strip
 
 # Defines
 DEFINES := -DNUM_CORES=$(num_cores) -DBOOT_ADDR=0x$(boot_addr) -DL2_BASE=0x$(l2_base) -DL2_SIZE=0x$(l2_size)
+# Define __XPULPIMG if the extension is active
+ifeq ($(XPULPIMG),1)
+	DEFINES += -D__XPULPIMG
+endif
 
 # Specify cross compilation target. This can be omitted if LLVM is built with riscv as default target
 RISCV_LLVM_TARGET  ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR)
@@ -75,7 +79,7 @@ RISCV_LLVM_TARGET  ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RI
 RISCV_WARNINGS += -Wunused-variable -Wconversion -Wall -Wextra # -Werror
 RISCV_FLAGS_COMMON_TESTS ?= -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -I$(CURDIR)/common -static
 RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -std=gnu99 -O3 -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS)
-RISCV_FLAGS_GCC    ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS)
+RISCV_FLAGS_GCC    ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS) -falign-loops=32 -falign-jumps=32
 
 RISCV_FLAGS_LLVM   ?= -mcmodel=small -mllvm -enable-misched
 ifeq ($(COMPILER),gcc)
diff --git a/apps/common/synchronization.c b/apps/common/synchronization.c
index 8d03d754f..eae5189be 100644
--- a/apps/common/synchronization.c
+++ b/apps/common/synchronization.c
@@ -16,8 +16,6 @@
 
 // Author: Samuel Riedel, ETH Zurich
 
-static inline unsigned amo_add(void volatile *const address, unsigned value);
-
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -71,20 +69,3 @@ void mempool_barrier(uint32_t num_cores, uint32_t cycles) {
       mempool_wait(cycles);
   }
 }
-
-/**
-
- * Expose the atomic add instruction.
- *
- * @param   address     A pointer to an address on L2 memory to store the value.
- * @param   value       Value to add to the specified memory location.
- *
- * @return  Value previously stored in memory.
- */
-static inline unsigned amo_add(void volatile *const address, unsigned value) {
-  unsigned ret;
-  __asm__ __volatile__("" : : : "memory");
-  asm volatile("amoadd.w  %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
-  __asm__ __volatile__("" : : : "memory");
-  return ret;
-}
diff --git a/apps/common/synchronization.h b/apps/common/synchronization.h
index b23bb834b..0df50af8b 100644
--- a/apps/common/synchronization.h
+++ b/apps/common/synchronization.h
@@ -16,6 +16,47 @@
 
 // Author: Samuel Riedel, ETH Zurich
 
+#ifndef __SYNCHRONIZATION_H__
+#define __SYNCHRONIZATION_H__
+
 // Barrier functions
 void mempool_barrier_init(uint32_t core_id, uint32_t num_cores);
 void mempool_barrier(uint32_t num_cores, uint32_t cycles);
+
+// Atomics
+
+/**

+ * Expose the atomic add instruction.
+ *
+ * @param   address     A pointer to an address on L2 memory to store the value.
+ * @param   value       Value to add to the specified memory location.
+ *
+ * @return  Value previously stored in memory.
+ */
+static inline unsigned amo_add(void volatile *const address, unsigned value) {
+  unsigned ret;
+  asm volatile("" : : : "memory"); // compiler barrier before the AMO
+  asm volatile("amoadd.w  %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
+  asm volatile("" : : : "memory"); // compiler barrier after the AMO
+  return ret;
+}
+
+/**

+ * Expose the atomic or instruction.
+ *
+ * @param   address     A pointer to an address on L2 memory to store the value.
+ * @param   value       Value to OR into the specified memory location.
+ *
+ * @return  Value previously stored in memory.
+ */
+static inline unsigned amo_or(void volatile *const address, unsigned value) {
+  unsigned ret;
+  asm volatile("" : : : "memory"); // compiler barrier before the AMO
+  asm volatile("amoor.w  %0, %1, (%2)" : "=r"(ret) : "r"(value), "r"(address));
+  asm volatile("" : : : "memory"); // compiler barrier after the AMO
+  return ret;
+}
+
+#endif // __SYNCHRONIZATION_H__
diff --git a/apps/common/xpulp/builtins_v2.h b/apps/common/xpulp/builtins_v2.h
new file mode 100644
index 000000000..60923b321
--- /dev/null
+++ b/apps/common/xpulp/builtins_v2.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright (C) 2019 ETH Zurich, University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __HAL_RISCV_BUILTINS_V2_H__
+#define __HAL_RISCV_BUILTINS_V2_H__
+
+/* ARITHMETIC SECTION */
+typedef signed short v2s __attribute__((vector_size(4)));
+typedef unsigned short v2u __attribute__((vector_size(4)));
+
+typedef signed char v4s __attribute__((vector_size(4)));
+typedef unsigned char v4u __attribute__((vector_size(4)));
+
+/* Packing of scalars into vectors */
+#define __PACK2(x, y) __builtin_pulp_pack2((signed short)(x), (signed short)(y))
+#define __PACKU2(x, y)                                                         \
+  __builtin_pulp_pack2((unsigned short)(x), (unsigned short)(y))
+
+#define __PACK4(x, y, z, t)                                                    \
+  __builtin_pulp_pack4((signed char)(x), (signed char)(y), (signed char)(z),   \
+                       (signed char)(t))
+#define __PACKU4(x, y, z, t)                                                   \
+  __builtin_pulp_pack4((unsigned char)(x), (unsigned char)(y),                 \
+                       (unsigned char)(z), (unsigned char)(t))
+
+/* Max */
+#define __MAX(a, b) __builtin_pulp_maxsi((a), (b))
+
+#define __MAX2(x, y) __builtin_pulp_max2((x), (y))
+#define __MAX4(x, y) __builtin_pulp_max4((x), (y))
+
+#define __MAXU2(x, y) __builtin_pulp_maxu2((x), (y))
+#define __MAXU4(x, y) __builtin_pulp_maxu4((x), (y))
+
+/* Min */
+#define __MIN2(x, y) __builtin_pulp_min2((x), (y))
+#define __MIN4(x, y) __builtin_pulp_min4((x), (y))
+
+#define __MINU2(x, y) __builtin_pulp_minu2((x), (y))
+#define __MINU4(x, y) __builtin_pulp_minu4((x), (y))
+
+/* Clip (precision is parenthesized so expression arguments expand safely) */
+#define __CLIP(x, precision)                                                   \
+  __builtin_pulp_clip((x), -(1 << (precision)), (1 << (precision)) - 1)
+#define __CLIP_R(x, bound) __builtin_pulp_clip_r((x), (bound))
+#define __CLIPU(x, precision)                                                  \
+  __builtin_pulp_clipu((x), 0, (1 << (precision)) - 1)
+#define __CLIPU_R(x, bound) __builtin_pulp_clipu_r((x), (bound))
+
+/* Abs */
+#define __ABS2(x) __builtin_pulp_abs2((x))
+#define __ABS4(x) __builtin_pulp_abs4((x))
+
+/* Unary minus */
+#define __NEG2(x) __builtin_pulp_neg2((x))
+#define __NEG4(x) __builtin_pulp_neg4((x))
+
+/* Addition */
+#define __ADD2(x, y) __builtin_pulp_add2((x), (y))
+#define __ADD4(x, y) __builtin_pulp_add4((x), (y))
+
+/* Subtraction */
+#define __SUB2(x, y) __builtin_pulp_sub2((x), (y))
+#define __SUB4(x, y) __builtin_pulp_sub4((x), (y))
+
+/* Average */
+#define __AVG2(x, y) __builtin_pulp_avg2((x), (y))
+#define __AVG4(x, y) __builtin_pulp_avg4((x), (y))
+
+/* Average unsigned */
+#define __AVGU2(x, y) __builtin_pulp_avgu2((x), (y))
+#define __AVGU4(x, y) __builtin_pulp_avgu4((x), (y))
+
+/* Bitwise and */
+#define __AND2(x, y) __builtin_pulp_and2((x), (y))
+#define __AND4(x, y) __builtin_pulp_and4((x), (y))
+
+/* Bitwise or */
+#define __OR2(x, y) __builtin_pulp_or2((x), (y))
+#define __OR4(x, y) __builtin_pulp_or4((x), (y))
+
+/* Bitwise exor */
+#define __EXOR2(x, y) __builtin_pulp_exor2(x, y)
+#define __EXOR4(x, y) __builtin_pulp_exor4(x, y)
+
+/* Logical shift right */
+#define __SRL2(x, y) __builtin_pulp_srl2(x, y)
+#define __SRL4(x, y) __builtin_pulp_srl4(x, y)
+
+/* Arithmetic shift right */
+#define __SRA2(x, y) __builtin_pulp_sra2(x, y)
+#define __SRA4(x, y) __builtin_pulp_sra4(x, y)
+
+/* Logical shift left */
+#define __SLL2(x, y) __builtin_pulp_sll2(x, y)
+#define __SLL4(x, y) __builtin_pulp_sll4(x, y)
+
+/* Mac */
+#define __MAC(Acc, x, y) __builtin_pulp_mac((x), (y), (Acc))
+#define __MSU(Acc, x, y) __builtin_pulp_msu((x), (y), (Acc))
+
+#define __MACS(Acc, x, y) __builtin_pulp_macs((x), (y), (Acc))
+#define __MACHHS(Acc, x, y) __builtin_pulp_machhs((x), (y), (Acc))
+#define __MACU(Acc, x, y) __builtin_pulp_macu((x), (y), (Acc))
+#define __MACHHU(Acc, x, y) __builtin_pulp_machhu((x), (y), (Acc))
+
+#define __MACSN(Acc, x, y, n) __builtin_pulp_macsN((x), (y), (Acc), (n))
+#define __MACUN(Acc, x, y, n) __builtin_pulp_macuN((x), (y), (Acc), (n))
+#define __MACSRN(Acc, x, y, n)                                                 \
+  __builtin_pulp_macsRN((x), (y), (Acc), (n), (1 << ((n)-1)))
+#define __MACURN(Acc, x, y, n)                                                 \
+  __builtin_pulp_macuRN((x), (y), (Acc), (n), (1 << ((n)-1)))
+/* Mac hh: N = normalization, RN = normalization and rounding */
+#define __MACHHSN(Acc, x, y, n) __builtin_pulp_machhsN((x), (y), (Acc), (n))
+#define __MACHHUN(Acc, x, y, n) __builtin_pulp_machhuN((x), (y), (Acc), (n))
+#define __MACHHSRN(Acc, x, y, n)                                               \
+  __builtin_pulp_machhsRN((x), (y), (Acc), (n), (1 << ((n)-1)))
+#define __MACHHURN(Acc, x, y, n)                                               \
+  __builtin_pulp_machhuRN((x), (y), (Acc), (n), (1 << ((n)-1)))
+
+/* Multiplications */
+#define __MULS(x, y) __builtin_pulp_muls((x), (y))
+#define __MULU(x, y) __builtin_pulp_mulu((x), (y))
+#define __MULHHS(x, y) __builtin_pulp_mulhhs((x), (y))
+#define __MULHHU(x, y) __builtin_pulp_mulhhu((x), (y))
+
+#define __MULSN(x, y, n) __builtin_pulp_mulsN((x), (y), (n))
+#define __MULSRN(x, y, n) __builtin_pulp_mulsRN((x), (y), (n), (1 << ((n)-1)))
+#define __MULUN(x, y, n) __builtin_pulp_muluN((x), (y), (n))
+#define __MULURN(x, y, n) __builtin_pulp_muluRN((x), (y), (n), (1 << ((n)-1)))
+
+#define __MULHHSN(x, y, n) __builtin_pulp_mulhhsN((x), (y), (n))
+#define __MULHHSRN(x, y, n)                                                    \
+  __builtin_pulp_mulhhsRN((x), (y), (n), (1 << ((n)-1)))
+#define __MULHHUN(x, y, n) __builtin_pulp_mulhhuN((x), (y), (n))
+#define __MULHHURN(x, y, n)                                                    \
+  __builtin_pulp_mulhhuRN((x), (y), (n), (1 << ((n)-1)))
+
+/* Vectorial product and sum of products */
+#define __DOTP2(x, y) __builtin_pulp_dotsp2((x), (y))
+#define __DOTPU2(x, y) __builtin_pulp_dotup2((x), (y))
+#define __DOTPUS2(x, y) __builtin_pulp_dotusp2((x), (y))
+
+#define __DOTPSC2(x, y) __builtin_pulp_dotspsc2((x), (y))
+#define __DOTPUSC2(x, y) __builtin_pulp_dotupsc2((x), (y))
+#define __DOTPUSSC2(x, y) __builtin_pulp_dotuspsc2((x), (y))
+
+#define __SUMDOTP2(x, y, z) __builtin_pulp_sdotsp2((x), (y), (z))
+#define __SUMDOTPU2(x, y, z) __builtin_pulp_sdotup2((x), (y), (z))
+#define __SUMDOTPUS2(x, y, z) __builtin_pulp_sdotusp2((x), (y), (z))
+
+#define __SUMDOTPSC2(x, y, z) __builtin_pulp_sdotspsc2((x), (y), (z))
+#define __SUMDOTPUSC2(x, y, z) __builtin_pulp_sdotupsc2((x), (y), (z))
+#define __SUMDOTPUSSC2(x, y, z) __builtin_pulp_sdotuspsc2((x), (y), (z))
+
+#define __DOTP4(x, y) __builtin_pulp_dotsp4((x), (y))
+#define __DOTPU4(x, y) __builtin_pulp_dotup4((x), (y))
+#define __DOTPUS4(x, y) __builtin_pulp_dotusp4((x), (y))
+
+#define __DOTPSC4(x, y) __builtin_pulp_dotspsc4((x), (y))
+#define __DOTPUSC4(x, y) __builtin_pulp_dotupsc4((x), (y))
+#define __DOTPUSSC4(x, y) __builtin_pulp_dotuspsc4((x), (y))
+
+#define __SUMDOTP4(x, y, z) __builtin_pulp_sdotsp4((x), (y), (z))
+#define __SUMDOTPU4(x, y, z) __builtin_pulp_sdotup4((x), (y), (z))
+#define __SUMDOTPUS4(x, y, z) __builtin_pulp_sdotusp4((x), (y), (z))
+
+#define __SUMDOTPSC4(x, y, z) __builtin_pulp_sdotspsc4((x), (y), (z))
+#define __SUMDOTPUSC4(x, y, z) __builtin_pulp_sdotupsc4((x), (y), (z))
+#define __SUMDOTPUSSC4(x, y, z) __builtin_pulp_sdotuspsc4((x), (y), (z))
+
+#ifdef ARCHI_CORE_HAS_CPLX
+
+/* Complex Multiplication, Q15x15 into Q15, with optional post scaling by 1 or 2
+ */
+#define __CPLXMULS(x, y) __builtin_pulp_cplxmuls((x), (y))
+#define __CPLXMULSDIV2(x, y) __builtin_pulp_cplxmulsdiv2((x), (y))
+#define __CPLXMULSDIV4(x, y) __builtin_pulp_cplxmulsdiv4((x), (y))
+
+/* Complex conjugate */
+#define __CPLXCONJ(x) __builtin_pulp_cplx_conj((x))
+
+/* Complex subtraction, result rotated by -pi/2 */
+#define __SUB2ROTMJ(x, y) __builtin_pulp_sub2rotmj((x), (y))
+
+/* Complex addition with post scaling by 1 or 2 */
+#define __ADD2DIV2(x, y) __builtin_pulp_add2div2((x), (y))
+#define __ADD2DIV4(x, y) __builtin_pulp_add2div4((x), (y))
+
+#define __ADD4DIV2(x, y) __builtin_pulp_add4div2((x), (y))
+#define __ADD4DIV4(x, y) __builtin_pulp_add4div4((x), (y))
+
+/* Complex subtraction with post scaling by 1 or 2 */
+#define __SUB2DIV2(x, y) __builtin_pulp_sub2div2((x), (y))
+#define __SUB2DIV4(x, y) __builtin_pulp_sub2div4((x), (y))
+
+#define __SUB4DIV2(x, y) __builtin_pulp_sub4div2((x), (y))
+#define __SUB4DIV4(x, y) __builtin_pulp_sub4div4((x), (y))
+
+/* Viterbi Max and Viterbi Select, pair of Q15 */
+#define __VITMAX(x, y) __builtin_pulp_vitmax2((x), (y))
+#define __VITSEL(x, y) __builtin_pulp_vitsel2((x), (y))
+
+#endif
+
+/* Position of the most significant bit of x */
+#define __FF1(x) __builtin_pulp_ff1((x))
+#define __FL1(x) __builtin_pulp_fl1((x))
+
+/* Number of sign bits */
+#define __CLB(x) __builtin_pulp_clb((x))
+
+static inline unsigned int __attribute__((always_inline))
+__ExtInsMaskFast(unsigned int Size, unsigned int Offset) {
+  return ((((Size - 1)) << 5) | (Offset)); // fields used as-is, no masking
+}
+static inline unsigned int __attribute__((always_inline))
+__ExtInsMaskSafe(unsigned int Size, unsigned int Offset) {
+  return ((((Size - 1) & 0x1F) << 5) | (Offset & 0x1F)); // 5-bit masked fields
+}
+
+/* Bit set */
+#define __BITSET(x, size, off)                                                 \
+  __builtin_pulp_bset((x), (((1 << (size)) - 1) << (off)))
+#define __BITSET_R(x, size, off)                                               \
+  __builtin_pulp_bset_r((x), __ExtInsMaskFast((size), (off)))
+#define __BITSET_R_SAFE(x, size, off)                                          \
+  __builtin_pulp_bset_r((x), __ExtInsMaskSafe((size), (off)))
+
+/* Bit clr */
+#define __BITCLR(x, size, off)                                                 \
+  __builtin_pulp_bclr((x), ~(((1 << (size)) - 1) << (off)))
+#define __BITCLR_R(x, size, off)                                               \
+  __builtin_pulp_bclr_r((x), __ExtInsMaskFast((size), (off)))
+#define __BITCLR_R_SAFE(x, size, off)                                          \
+  __builtin_pulp_bclr_r((x), __ExtInsMaskSafe((size), (off)))
+
+/* Bit Extraction */
+#define __BITEXTRACT(x, size, off) __builtin_pulp_bextract((x), (size), (off))
+#define __BITEXTRACTU(x, size, off) __builtin_pulp_bextractu((x), (size), (off))
+
+#define __BITEXTRACT_R(x, size, off)                                           \
+  __builtin_pulp_bextract_r((x), __ExtInsMaskFast((size), (off)))
+#define __BITEXTRACTU_R(x, size, off)                                          \
+  __builtin_pulp_bextractu_r((x), __ExtInsMaskFast((size), (off)))
+
+#define __BITEXTRACT_R_SAFE(x, size, off)                                      \
+  __builtin_pulp_bextract_r((x), __ExtInsMaskSafe((size), (off)))
+#define __BITEXTRACTU_R_SAFE(x, size, off)                                     \
+  __builtin_pulp_bextractu_r((x), __ExtInsMaskSafe((size), (off)))
+
+/* Bit insertion */
+#define __BITINSERT(dst, src, size, off)                                       \
+  __builtin_pulp_binsert((dst), ~(((1 << (size)) - 1) << (off)), (src),        \
+                         (((1 << (size)) - 1) << (off)), (off))
+#define __BITINSERT_R(dst, src, size, off)                                     \
+  __builtin_pulp_binsert_r((dst), (src), __ExtInsMaskFast((size), (off)))
+#define __BITINSERT_R_SAFE(dst, src, size, off)                                \
+  __builtin_pulp_binsert_r((dst), (src), __ExtInsMaskSafe((size), (off)))
+
+/* 1 bit rotation to the right, 32 bits input */
+#define __ROTR(x) __builtin_pulp_rotr((x))
+
+/* Add with normalization */
+#define __ADDNORMU(x, y, scale) __builtin_pulp_adduN((x), (y), (scale))
+#define __ADDNORMU_REG(x, y, scale) __builtin_pulp_adduN_r((x), (y), (scale))
+#define __ADDNORM(x, y, scale) __builtin_pulp_addN((x), (y), (scale))
+#define __ADDNORM_REG(x, y, scale) __builtin_pulp_addN_r((x), (y), (scale))
+
+/* Add with normalization and rounding */
+#define __ADDROUNDNORMU(x, y, scale)                                           \
+  __builtin_pulp_adduRN((x), (y), (scale), (1 << ((scale)-1)))
+#define __ADDROUNDNORMU_REG(x, y, scale)                                       \
+  __builtin_pulp_adduRN_r((x), (y), (scale))
+#define __ADDROUNDNORM(x, y, scale)                                            \
+  __builtin_pulp_addRN((x), (y), (scale), (1 << ((scale)-1)))
+#define __ADDROUNDNORM_REG(x, y, scale)                                        \
+  __builtin_pulp_addRN_r((x), (y), (scale))
+
+/* Sub with normalization */
+#define __SUBNORMU(x, y, scale) __builtin_pulp_subuN((x), (y), (scale))
+#define __SUBNORMU_REG(x, y, scale) __builtin_pulp_subuN_r((x), (y), (scale))
+#define __SUBNORM(x, y, scale) __builtin_pulp_subN((x), (y), (scale))
+#define __SUBNORM_REG(x, y, scale) __builtin_pulp_subN_r((x), (y), (scale))
+
+/* Sub with normalization and rounding */
+#define __SUBROUNDNORMU(x, y, scale)                                           \
+  __builtin_pulp_subuRN((x), (y), (scale), (1 << ((scale)-1)))
+#define __SUBROUNDNORMU_REG(x, y, scale)                                       \
+  __builtin_pulp_subuRN_r((x), (y), (scale))
+#define __SUBROUNDNORM(x, y, scale)                                            \
+  __builtin_pulp_subRN((x), (y), (scale), (1 << ((scale)-1)))
+#define __SUBROUNDNORM_REG(x, y, scale)                                        \
+  __builtin_pulp_subRN_r((x), (y), (scale))
+
+/* Normalization and rounding */
+#define __ROUNDNORMU(x, scale)                                                 \
+  __builtin_pulp_adduRN((x), 0, (scale), (1 << ((scale)-1)))
+#define __ROUNDNORMU_REG(x, scale) __builtin_pulp_adduRN_r((x), 0, (scale))
+#define __ROUNDNORM(x, scale)                                                  \
+  __builtin_pulp_addRN((x), 0, (scale), (1 << ((scale)-1)))
+#define __ROUNDNORM_REG(x, scale) __builtin_pulp_addRN_r((x), 0, (scale))
+
+#define __COREID() __builtin_pulp_CoreId()
+#define __CLUSTERID() __builtin_pulp_ClusterId()
+#define __NCORE() __builtin_pulp_CoreCount()
+#define __ISFC() __builtin_pulp_IsFc()
+
+#define __SPRWRITE(x, y) __builtin_pulp_spr_write(x, y)
+#define __SPRREAD(x) __builtin_pulp_spr_read(x)
+#define __SPRREAD_VOL(x) __builtin_pulp_spr_read_vol(x)
+
+#define __SPRBITSET(spr, off) __builtin_pulp_spr_bit_set((spr), (off))
+#define __SPRBITCLR(spr, off) __builtin_pulp_spr_bit_clr((spr), (off))
+
+#define __SPRREADTHENWRITE(spr, x)                                             \
+  __builtin_pulp_read_then_spr_write((spr), (x))
+#define __SPRREADTHENBITSET(spr, off)                                          \
+  __builtin_pulp_read_then_spr_bit_set((spr), (off))
+#define __SPRREADTHENBITCLR(spr, off)                                          \
+  __builtin_pulp_read_then_spr_bit_clr((spr), (off))
+
+#define __READ_BASE_OFF(base, off) __builtin_pulp_read_base_off((base), (off))
+#define __WRITE_BASE_OFF(base, off, val)                                       \
+  __builtin_pulp_write_base_off((base), (off), (val))
+
+#define __READ_BASE_OFF_VOL(base, off)                                         \
+  __builtin_pulp_OffsetedRead((base), (off))
+#define __READ_BASE_OFF_HALF_VOL(base, off)                                    \
+  __builtin_pulp_OffsetedReadHalf((base), (off))
+#define __READ_BASE_OFF_BYTE_VOL(base, off)                                    \
+  __builtin_pulp_OffsetedReadByte((base), (off))
+
+#define __WRITE_BASE_OFF_VOL(x, base, off)                                     \
+  __builtin_pulp_OffsetedWrite((x), (base), (off))
+#define __WRITE_BASE_OFF_HALF_VOL(x, base, off)                                \
+  __builtin_pulp_OffsetedWriteHalf((x), (base), (off))
+#define __WRITE_BASE_OFF_BYTE_VOL(x, base, off)                                \
+  __builtin_pulp_OffsetedWriteByte((x), (base), (off))
+/* Utilities, Target independent */
+#define FIX2FP(Val, Precision) ((float)(Val) / (float)(1 << (Precision)))
+#define FP2FIXR(Val, Precision) ((int)((Val) * ((1 << (Precision)) - 1) + 0.5))
+#define FP2FIX(Val, Precision) ((int)((Val) * ((1 << (Precision)) - 1)))
+
+#endif
diff --git a/apps/common/xpulp/conv_2d.h b/apps/common/xpulp/conv_2d.h
new file mode 100644
index 000000000..31b0ad167
--- /dev/null
+++ b/apps/common/xpulp/conv_2d.h
@@ -0,0 +1,399 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+//         Davide Schiavone, ETH Zurich
+//         Sergio Mazzola, ETH Zurich
+
+#include "xpulp/builtins_v2.h"
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+/*
+ * 2D Convolution 3x3 ----------------------------------
+ * kernel     = conv2d_3x3_unrolled_i8_rv32im
+ * data type  = 8-bit integer
+ * multi-core = no
+ * unrolling  = whole 3x3 kernel
+ * simd       = no
+ */
+void conv2d_3x3_unrolled_i8_rv32im(int8_t const volatile *__restrict__ in,
+                                   uint32_t in_x, uint32_t in_y,
+                                   uint8_t const volatile *__restrict__ k,
+                                   int32_t volatile *__restrict__ out) {
+  // Normalisation divisor: sum of the nine coefficients (must not be zero)
+  uint32_t weight = 0;
+  for (int i = 0; i < 9; ++i) {
+    weight += k[i];
+  }
+  // `i + 1 < in_x` rather than `i < in_x - 1`: no unsigned wrap for in_x == 0
+  for (uint32_t i = 1; i + 1 < in_x; ++i) {
+    for (uint32_t j = 1; j + 1 < in_y; j++) {
+      int32_t sum = 0;
+      sum += in[(j - 1) * in_x + (i - 1)] * k[0];
+      sum += in[(j - 1) * in_x + (i + 0)] * k[1];
+      sum += in[(j - 1) * in_x + (i + 1)] * k[2];
+      sum += in[(j + 0) * in_x + (i - 1)] * k[3];
+      sum += in[(j + 0) * in_x + (i + 0)] * k[4];
+      sum += in[(j + 0) * in_x + (i + 1)] * k[5];
+      sum += in[(j + 1) * in_x + (i - 1)] * k[6];
+      sum += in[(j + 1) * in_x + (i + 0)] * k[7];
+      sum += in[(j + 1) * in_x + (i + 1)] * k[8];
+      out[j * in_x + i] = sum / (int)weight;
+    }
+  }
+}
+
+/*
+ * 2D Convolution 3x3 ----------------------------------
+ * kernel     = conv2d_3x3_unrolled2_i8_rv32im
+ * data type  = 8-bit integer
+ * multi-core = no
+ * unrolling  = whole 3x3 kernel, window held in locals and shifted down
+ * simd       = no
+ */
+void conv2d_3x3_unrolled2_i8_rv32im(int8_t const volatile *__restrict__ in,
+                                    uint32_t in_x, uint32_t in_y,
+                                    uint8_t const volatile *__restrict__ k,
+                                    int32_t volatile *__restrict__ out) {
+  int32_t sum;
+  uint32_t weight = 0;
+
+  int8_t elem_00, elem_01, elem_02;
+  int8_t elem_10, elem_11, elem_12;
+  int8_t elem_20, elem_21, elem_22;
+
+  uint32_t j;
+
+  for (int i = 0; i < 9; ++i) {
+    weight += k[i];
+  }
+
+  // `x + 1 < n` rather than `x < n - 1`: no unsigned wrap-around when n == 0
+  for (uint32_t i = 1; i + 1 < in_x; ++i) {
+    // Preload image rows 0 and 1 as the two lower window rows. Each iteration
+    // first shifts the window and fetches row j + 1, so the last pass reads
+    // row in_y - 1 instead of prefetching the non-existent row in_y.
+    elem_10 = in[i - 1];
+    elem_11 = in[i + 0];
+    elem_12 = in[i + 1];
+    elem_20 = in[in_x + (i - 1)];
+    elem_21 = in[in_x + (i + 0)];
+    elem_22 = in[in_x + (i + 1)];
+    for (j = 1; j + 1 < in_y; j++) {
+      elem_00 = elem_10;
+      elem_01 = elem_11;
+      elem_02 = elem_12;
+      elem_10 = elem_20;
+      elem_11 = elem_21;
+      elem_12 = elem_22;
+      elem_20 = in[(j + 1) * in_x + (i - 1)];
+      elem_21 = in[(j + 1) * in_x + (i + 0)];
+      elem_22 = in[(j + 1) * in_x + (i + 1)];
+
+      sum = 0;
+      sum += elem_00 * k[0];
+      sum += elem_01 * k[1];
+      sum += elem_02 * k[2];
+      sum += elem_10 * k[3];
+      sum += elem_11 * k[4];
+      sum += elem_12 * k[5];
+      sum += elem_20 * k[6];
+      sum += elem_21 * k[7];
+      sum += elem_22 * k[8];
+      out[j * in_x + i] = sum / (int)weight;
+    }
+  }
+}
+
+/*
+ * 2D Convolution 3x3 ----------------------------------
+ * kernel     = conv2d_3x3_unrolled_i8_xpulpv2
+ * data type  = 8-bit integer
+ * multi-core = no
+ * unrolling  = whole 3x3 kernel
+ * simd       = yes, Xpulpv2 intrinsics
+ *
+ * Inspired by Conv3x3_Vector from pulp-training
+ */
+#ifdef __XPULPIMG
+void conv2d_3x3_unrolled_i8_xpulpv2(
+    int8_t const volatile *__restrict__ In_Img,
+    int32_t volatile *__restrict__ Out_Img, uint32_t R, uint32_t C,
+    uint8_t const volatile *__restrict__ Kernel) {
+  v4u coeff_0, coeff_1, coeff_2;
+  v4s Img_0, Img_1, Img_2;
+  v4s new_data;
+  uint32_t r, c, t;
+  int32_t S;
+
+  uint32_t weight = 0; // sum of the 9 coefficients, divides every output
+  for (int i = 0; i < 9; ++i) {
+    weight += Kernel[i];
+  }
+  // One kernel row per vector; the fourth lane is zero padding
+  coeff_0 = (v4u){Kernel[0], Kernel[1], Kernel[2], 0};
+  coeff_1 = (v4u){Kernel[3], Kernel[4], Kernel[5], 0};
+  coeff_2 = (v4u){Kernel[6], Kernel[7], Kernel[8], 0};
+  // Border pixels of Out_Img are never written (loops cover 1..C-2, 1..R-2).
+  // NOTE(review): row stride is R but columns run to C; fine only if R == C?
+  for (c = 1; c < C - 1; c++) {
+    // Image rows 0..2 around column c; lanes = {c - 1, c, c + 1, zero padding}
+    Img_0 = (v4s){In_Img[c - 1], In_Img[c], In_Img[c + 1], 0};
+    Img_1 = (v4s){In_Img[c - 1 + R], In_Img[c + R], In_Img[c + 1 + R], 0};
+    Img_2 = (v4s){In_Img[c - 1 + R * 2], In_Img[c + R * 2],
+                  In_Img[c + 1 + R * 2], 0};
+    // Slide the 3-row window down one image row per iteration
+    for (r = 1; r < R - 1; r++) {
+      t = r * R + c;
+      S = __builtin_pulp_dotsp4(Img_0, coeff_0);
+      S = __builtin_pulp_sdotsp4(Img_1, coeff_1, S);
+      S = __builtin_pulp_sdotsp4(Img_2, coeff_2, S);
+
+      Out_Img[t] = S / (int)weight;
+      // Fetch row r + 2 for the next iteration. NOTE(review): on the last
+      // pass that is row R, i.e. past the image if it has R rows - confirm
+      new_data = (v4s){In_Img[(r + 2) * R + c - 1], In_Img[(r + 2) * R + c],
+                       In_Img[(r + 2) * R + c + 1], 0};
+      // move the window: move each vector one line down
+      Img_0 = Img_1;
+      Img_1 = Img_2;
+      Img_2 = new_data;
+    }
+  }
+}
+#endif
+
+/*
+ * 2D Convolution 3x3 ----------------------------------
+ * kernel     = conv2d_3x3_unrolled2_i8_xpulpv2
+ * data type  = 8-bit integer
+ * multi-core = no
+ * unrolling  = whole 3x3 kernel, 2 kernels per iteration
+ * simd       = yes, Xpulpv2 intrinsics
+ *
+ * Inspired by Conv3x3_Vector from pulp-training
+ */
+#ifdef __XPULPIMG
+void conv2d_3x3_unrolled2_i8_xpulpv2(
+    int8_t const volatile *__restrict__ In_Img,
+    int32_t volatile *__restrict__ Out_Img, uint32_t R, uint32_t C,
+    uint8_t const volatile *__restrict__ Kernel) {
+  v4u coeff_0, coeff_1, coeff_2;
+  v4s Img_00, Img_10, Img_20;
+  v4s Img_01, Img_11, Img_21;
+  v4s new_data_0, new_data_1;
+  uint32_t r, c;
+  int32_t S_0, S_1;
+
+  uint32_t weight = 0; // sum of the 9 coefficients, divides every output
+  for (int i = 0; i < 9; ++i) {
+    weight += Kernel[i];
+  }
+  // One kernel row per vector; the fourth lane is zero padding
+  coeff_0 = (v4u){Kernel[0], Kernel[1], Kernel[2], 0};
+  coeff_1 = (v4u){Kernel[3], Kernel[4], Kernel[5], 0};
+  coeff_2 = (v4u){Kernel[6], Kernel[7], Kernel[8], 0};
+  // Columns are handled in pairs (2c - 1, 2c). NOTE(review): for odd C the
+  // interior column C - 2 is never produced, and rows use stride R - confirm
+  for (c = 1; c < C / 2; c++) {
+    // Window x0 is centred on column 2c - 1 (lanes 2c - 2 .. 2c)
+    Img_00 = (v4s){In_Img[2 * c - 2], In_Img[2 * c - 1], In_Img[2 * c], 0};
+    Img_10 = (v4s){In_Img[2 * c - 2 + R], In_Img[2 * c - 1 + R],
+                   In_Img[2 * c + R], 0};
+    Img_20 = (v4s){In_Img[2 * c - 2 + R * 2], In_Img[2 * c - 1 + R * 2],
+                   In_Img[2 * c + R * 2], 0};
+    // Window x1 is centred on column 2c (lanes 2c - 1 .. 2c + 1)
+    Img_01 = (v4s){In_Img[2 * c - 1], In_Img[2 * c], In_Img[2 * c + 1], 0};
+    Img_11 = (v4s){In_Img[2 * c - 1 + R], In_Img[2 * c + R],
+                   In_Img[2 * c + 1 + R], 0};
+    Img_21 = (v4s){In_Img[2 * c - 1 + R * 2], In_Img[2 * c + R * 2],
+                   In_Img[2 * c + 1 + R * 2], 0};
+
+    for (r = 1; r < R - 1; r++) {
+      S_0 = __builtin_pulp_dotsp4(Img_00, coeff_0);
+      S_1 = __builtin_pulp_dotsp4(Img_01, coeff_0);
+
+      S_0 = __builtin_pulp_sdotsp4(Img_10, coeff_1, S_0);
+      S_1 = __builtin_pulp_sdotsp4(Img_11, coeff_1, S_1);
+
+      S_0 = __builtin_pulp_sdotsp4(Img_20, coeff_2, S_0);
+      S_1 = __builtin_pulp_sdotsp4(Img_21, coeff_2, S_1);
+
+      int32_t res_0 = S_0 / (int)weight;
+      int32_t res_1 = S_1 / (int)weight;
+      // Fetch row r + 2 for the next iteration. NOTE(review): on the last
+      // pass that is row R, i.e. past the image if it has R rows - confirm
+      new_data_0 = (v4s){In_Img[(r + 2) * R + (2 * c - 1) - 1],
+                         In_Img[(r + 2) * R + (2 * c - 1)],
+                         In_Img[(r + 2) * R + (2 * c - 1) + 1], 0};
+      new_data_1 =
+          (v4s){In_Img[(r + 2) * R + 2 * c - 1], In_Img[(r + 2) * R + 2 * c],
+                In_Img[(r + 2) * R + 2 * c + 1], 0};
+      // move the window: move each vector one line down
+      Img_00 = Img_10;
+      Img_10 = Img_20;
+      Img_20 = new_data_0;
+      Img_01 = Img_11;
+      Img_11 = Img_21;
+      Img_21 = new_data_1;
+      // Store both results of row r (columns 2c - 1 and 2c)
+      Out_Img[r * R + (2 * c - 1)] = res_0;
+      Out_Img[r * R + 2 * c] = res_1;
+    }
+  }
+}
+#endif
+
+// Testing
+// Fill img (img_y rows of img_x pixels) with the pattern (row % 16) + (col % 4)
+void init_conv2d_image_i8(volatile int8_t *img, uint32_t img_x,
+                          uint32_t img_y) {
+  int const width = (int)img_x;
+  int const height = (int)img_y;
+  // The longer dimension is always walked by the outer loop
+  if (img_y <= img_x) {
+    for (int col = 0; col < width; ++col)
+      for (int row = 0; row < height; ++row)
+        img[row * width + col] = (int8_t)((row % 16) + (col % 4));
+    return;
+  }
+  // More rows than columns: rows outermost
+  for (int row = 0; row < height; ++row)
+    for (int col = 0; col < width; ++col)
+      img[row * width + col] = (int8_t)((row % 16) + (col % 4));
+}
+
+// Verify the image interior against the expected x + y pattern, zeroing every
+// matching pixel. Returns 0 on success, else the index of the first mismatch.
+int verify_conv2d_image_i8(volatile int32_t *img, uint32_t img_x,
+                           uint32_t img_y) {
+  for (int i = 1; i < (int)img_y - 1; ++i) {
+    // Expected row term; rows 0 and 15 of each 16-row period are special-cased
+    int32_t const phase = i % 16;
+    int32_t const y = (phase == 0) ? 4 : (phase == 15) ? 11 : phase;
+    for (int32_t j = 1; j < (int)img_x - 1; ++j) {
+      int32_t x = ((j % 4) / 2) + 1;
+      if ((int32_t)img[i * (int)img_x + j] != (int32_t)(x + y)) {
+        // i, j >= 1, so this index is never 0 and cannot be taken for success
+        return i * (int)img_x + j;
+      }
+      img[i * (int)img_x + j] = 0;
+    }
+  }
+  return 0;
+}
+
+// Print "actual - expected" for every interior pixel, one image row per line.
+// Nothing is compared or reset here; the function always returns 0.
+int verify_conv2d_image_i8_verbose(int32_t *img, uint32_t img_x,
+                                   uint32_t img_y) {
+  for (int i = 1; i < (int)img_y - 1; ++i) {
+    // Expected row term; rows 0 and 15 of each 16-row period are special-cased
+    int32_t const phase = i % 16;
+    int32_t const y = (phase == 0) ? 4 : (phase == 15) ? 11 : phase;
+    printf("|");
+    for (int32_t j = 1; j < (int)img_x - 1; ++j) {
+      int32_t x = ((j % 4) / 2) + 1;
+      // Both values are signed, hence %d together with explicit int casts
+      printf(" %2d - %2d |", (int)img[i * (int)img_x + j], (int)(x + y));
+    }
+    printf("\n");
+  }
+  return 0;
+}
+
+#ifdef __XPULPIMG
+void conv2d_3x3_unrolled_i8_xpulpv2_verbose(
+    int8_t const *__restrict__ In_Img, int32_t volatile *__restrict__ Out_Img,
+    uint32_t R, uint32_t C, uint8_t const volatile *__restrict__ Kernel) {
+  v4u coeff_0, coeff_1, coeff_2;
+  v4s Img_0, Img_1, Img_2;
+  v4s new_data;
+  uint32_t r, c, t;
+  volatile int32_t S;
+  // Debug twin of conv2d_3x3_unrolled_i8_xpulpv2: same flow, prints each step
+  uint32_t weight = 0;
+  for (int i = 0; i < 9; ++i) {
+    weight += Kernel[i];
+  }
+
+  coeff_0 = (v4u){Kernel[0], Kernel[1], Kernel[2], 0};
+  coeff_1 = (v4u){Kernel[3], Kernel[4], Kernel[5], 0};
+  coeff_2 = (v4u){Kernel[6], Kernel[7], Kernel[8], 0};
+
+  // Only interior pixels are computed; the border of Out_Img is not written
+  for (c = 1; c < C - 1; c++) {
+
+    Img_0 = (v4s){In_Img[c - 1], In_Img[c], In_Img[c + 1], 0};
+    Img_1 = (v4s){In_Img[c - 1 + R], In_Img[c + R], In_Img[c + 1 + R], 0};
+    Img_2 = (v4s){In_Img[c - 1 + R * 2], In_Img[c + R * 2],
+                  In_Img[c + 1 + R * 2], 0};
+
+    for (r = 1; r < R - 1; r++) {
+      printf("-------------\n");
+      // Lanes are signed 8-bit values (promoted to int): print them with %d
+      printf("[ %d, %d, %d]\n", Img_0[0], Img_0[1], Img_0[2]);
+      printf("[ %d, %d, %d]\n", Img_1[0], Img_1[1], Img_1[2]);
+      printf("[ %d, %d, %d]\n", Img_2[0], Img_2[1], Img_2[2]);
+
+      t = r * R + c;
+      S = __builtin_pulp_dotsp4(Img_0, coeff_0);
+      S = __builtin_pulp_sdotsp4(Img_1, coeff_1, S);
+      S = __builtin_pulp_sdotsp4(Img_2, coeff_2, S);
+
+      printf("S = %d\n", S);
+      printf("S/weight = %d\n", S / (int)weight);
+
+      Out_Img[t] = S / (int)weight;
+      printf("Out_Img[%u] = %d\n", (unsigned)t, (int)Out_Img[t]);
+
+      new_data = (v4s){In_Img[(r + 2) * R + c - 1], In_Img[(r + 2) * R + c],
+                       In_Img[(r + 2) * R + c + 1], 0};
+
+      // Move the window
+      /*
+        Three vectors:
+        Img_0 = {A0, A1, A2, 0}
+        Img_1 = {B0, B1, B2, 0}
+        Img_2 = {C0, C1, C2, 0}
+        Current Window:
+        XX XX XX
+        A0 A1 A2
+        B0 B1 B2
+        C0 C1 C2
+        D0 D1 D2
+        XX XX XX
+        We want to load next line (D0, D1, D2) in vector new_data
+        new_data = {D0, D1, D2, 0}
+        Move each vector one line down
+        Img_0 = Img_1
+        Img_1 = Img_2
+        Img_2 = new_data
+      */
+
+      Img_0 = Img_1;
+      Img_1 = Img_2;
+      Img_2 = new_data;
+    }
+  }
+}
+#endif
diff --git a/apps/common/xpulp/mat_mul.h b/apps/common/xpulp/mat_mul.h
new file mode 100644
index 000000000..78297c999
--- /dev/null
+++ b/apps/common/xpulp/mat_mul.h
@@ -0,0 +1,746 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+//         Sergio Mazzola, ETH Zurich
+
+#include "xpulp/builtins_v2.h"
+
+/* This library implements the matrix multiplication for several data widths
+ * in multiple different ways. The functions all follow the following format:
+ *
+ * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
+ * C = AB
+ *
+ * Note that all the matrix dimensions must be multiples of 4; these
+ * kernels do not have clean-up code and remaining elements would not be
+ * considered, leading to wrong results
+ */
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x2_parallel_i8_rv32im
+ * data type  = 8-bit integer
+ * multi-core = yes (c column groups, row pairs interleaved within a group)
+ * unrolling  = 4 elements of C per iteration (2x2 chunks)
+ * simd       = no
+ */
+void matmul_unrolled_2x2_parallel_i8_rv32im(int8_t const *__restrict__ A,
+                                            int8_t const *__restrict__ B,
+                                            int32_t *__restrict__ C, uint32_t M,
+                                            uint32_t N, uint32_t P, uint32_t id,
+                                            uint32_t numThreads) {
+  // Column groups: 8, or numThreads when fewer, so the row stride is never 0
+  uint32_t const c = (numThreads < 8) ? numThreads : 8;
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 2) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+      for (uint32_t k = 0; k < N; k += 2) {
+        // Explicitly load the values first to help with scheduling
+        int8_t val_a00 = A[(i + 0) * N + k + 0];
+        int8_t val_a01 = A[(i + 0) * N + k + 1];
+        int8_t val_a10 = A[(i + 1) * N + k + 0];
+        int8_t val_a11 = A[(i + 1) * N + k + 1];
+        int8_t val_b00 = B[(k + 0) * P + j + 0];
+        int8_t val_b01 = B[(k + 0) * P + j + 1];
+        int8_t val_b10 = B[(k + 1) * P + j + 0];
+        int8_t val_b11 = B[(k + 1) * P + j + 1];
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+      }
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+    }
+  }
+}
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x2_parallel_i16_rv32im
+ * data type  = 16-bit integer
+ * multi-core = yes (c column groups, row pairs interleaved within a group)
+ * unrolling  = 4 elements of C per iteration (2x2 chunks)
+ * simd       = no
+ */
+void matmul_unrolled_2x2_parallel_i16_rv32im(int16_t const *__restrict__ A,
+                                             int16_t const *__restrict__ B,
+                                             int32_t *__restrict__ C,
+                                             uint32_t M, uint32_t N, uint32_t P,
+                                             uint32_t id, uint32_t numThreads) {
+  // Column groups: 8, or numThreads when fewer, so the row stride is never 0
+  uint32_t const c = (numThreads < 8) ? numThreads : 8;
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 2) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+      for (uint32_t k = 0; k < N; k += 2) {
+        // Explicitly load the values first to help with scheduling
+        int16_t val_a00 = A[(i + 0) * N + k + 0];
+        int16_t val_a01 = A[(i + 0) * N + k + 1];
+        int16_t val_a10 = A[(i + 1) * N + k + 0];
+        int16_t val_a11 = A[(i + 1) * N + k + 1];
+        int16_t val_b00 = B[(k + 0) * P + j + 0];
+        int16_t val_b01 = B[(k + 0) * P + j + 1];
+        int16_t val_b10 = B[(k + 1) * P + j + 0];
+        int16_t val_b11 = B[(k + 1) * P + j + 1];
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+      }
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+    }
+  }
+}
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x4_i8_xpulpv2
+ * data type  = 8-bit integer
+ * multi-core = no
+ * unrolling  = 8 elements of C per iteration (2x4 chunks)
+ * simd       = yes, Xpulpv2 intrinsics
+ *
+ * Original plp_mat_mult_i8s_xpulpv2 from pulp-dsp
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_2x4_i8_xpulpv2(const int8_t *__restrict__ pSrcA,
+                                    const int8_t *__restrict__ pSrcB,
+                                    int32_t *__restrict__ pDstC, uint32_t M,
+                                    uint32_t N, uint32_t P) {
+  static v4s mask0 = {0, 1, 4, 5};
+  static v4s mask1 = {2, 3, 6, 7};
+  static v4s mask2 = {0, 2, 4, 6};
+  static v4s mask3 = {1, 3, 5, 7};
+  // Shuffle indices 0-3 select from the first operand, 4-7 from the second
+  uint32_t i = 0; // loop counter for M
+  uint32_t j = 0; // loop counter for N
+  uint32_t k = 0; // loop counter for P
+  // Only full tiles are processed: M / 2, N / 4 and P / 4 round down
+  for (i = 0; i < M / 2; i++) {
+    for (k = 0; k < P / 4; k++) {
+      // Accumulators for one 2x4 tile of C (sumRC: row R, column C of tile)
+      int32_t sum00 = 0;
+      int32_t sum01 = 0;
+      int32_t sum02 = 0;
+      int32_t sum03 = 0;
+      int32_t sum10 = 0;
+      int32_t sum11 = 0;
+      int32_t sum12 = 0;
+      int32_t sum13 = 0;
+
+      for (j = 0; j < N / 4; j++) {
+        // Four consecutive elements from rows 2i and 2i + 1 of A
+        v4s aVec0 = *((v4s *)&(pSrcA[(i * 2) * N + (j * 4)]));
+        v4s aVec1 = *((v4s *)&(pSrcA[(i * 2 + 1) * N + (j * 4)]));
+        // 4x4 chunk of B, one row per vector
+        v4s temp0 = *((v4s *)&(pSrcB[(j * 4) * P + (k * 4)]));
+        v4s temp1 = *((v4s *)&(pSrcB[(j * 4 + 1) * P + (k * 4)]));
+        v4s temp2 = *((v4s *)&(pSrcB[(j * 4 + 2) * P + (k * 4)]));
+        v4s temp3 = *((v4s *)&(pSrcB[(j * 4 + 3) * P + (k * 4)]));
+        // Transpose the chunk; comments give row-major indices within it
+        v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5
+        v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13
+        v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7
+        v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 10,11,14,15
+
+        v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12
+        v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13
+        v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14
+        v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15
+        // bVecX now holds column X of the chunk
+        sum00 = __SUMDOTP4(aVec0, bVec0, sum00);
+        sum01 = __SUMDOTP4(aVec0, bVec1, sum01);
+        sum02 = __SUMDOTP4(aVec0, bVec2, sum02);
+        sum03 = __SUMDOTP4(aVec0, bVec3, sum03);
+        sum10 = __SUMDOTP4(aVec1, bVec0, sum10);
+        sum11 = __SUMDOTP4(aVec1, bVec1, sum11);
+        sum12 = __SUMDOTP4(aVec1, bVec2, sum12);
+        sum13 = __SUMDOTP4(aVec1, bVec3, sum13);
+      }
+      // Write back the finished 2x4 tile of C
+      pDstC[(i * 2) * P + (k * 4)] = sum00;
+      pDstC[(i * 2) * P + (k * 4 + 1)] = sum01;
+      pDstC[(i * 2) * P + (k * 4 + 2)] = sum02;
+      pDstC[(i * 2) * P + (k * 4 + 3)] = sum03;
+      pDstC[(i * 2 + 1) * P + (k * 4)] = sum10;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 1)] = sum11;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 2)] = sum12;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 3)] = sum13;
+    }
+  }
+}
+#endif
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x4_parallel_i8_xpulpv2
+ * data type  = 8-bit integer
+ * multi-core = yes
+ * unrolling  = 8 elements of C per iteration (2x4 chunks)
+ * simd       = yes, Xpulpv2 intrinsics
+ *
+ * Original plp_mat_mult_i8p_xpulpv2 from pulp-dsp
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_2x4_parallel_i8_xpulpv2(const int8_t *__restrict__ pSrcA,
+                                             const int8_t *__restrict__ pSrcB,
+                                             int32_t *__restrict__ pDstC,
+                                             uint32_t M, uint32_t N, uint32_t P,
+                                             uint32_t core_id,
+                                             uint32_t numThreads) {
+  static v4s mask0 = {0, 1, 4, 5};
+  static v4s mask1 = {2, 3, 6, 7};
+  static v4s mask2 = {0, 2, 4, 6};
+  static v4s mask3 = {1, 3, 5, 7};
+  // Shuffle indices 0-3 select from the first operand, 4-7 from the second
+  uint32_t i = 0; // loop counter for M
+  uint32_t j = 0; // loop counter for N
+  uint32_t k = 0; // loop counter for P
+  // Work split: core_id takes every numThreads-th group of 4 columns of C
+  for (k = core_id; k < P / 4; k += numThreads) {
+    for (i = 0; i < M / 2; i++) {
+      // Accumulators for one 2x4 tile of C (sumRC: row R, column C of tile)
+      int32_t sum00 = 0;
+      int32_t sum01 = 0;
+      int32_t sum02 = 0;
+      int32_t sum03 = 0;
+      int32_t sum10 = 0;
+      int32_t sum11 = 0;
+      int32_t sum12 = 0;
+      int32_t sum13 = 0;
+
+      for (j = 0; j < N / 4; j++) {
+        // Four consecutive elements from rows 2i and 2i + 1 of A
+        v4s aVec0 = *((v4s *)&(pSrcA[(i * 2) * N + (j * 4)]));
+        v4s aVec1 = *((v4s *)&(pSrcA[(i * 2 + 1) * N + (j * 4)]));
+        // 4x4 chunk of B, one row per vector
+        v4s temp0 = *((v4s *)&(pSrcB[(j * 4) * P + (k * 4)]));
+        v4s temp1 = *((v4s *)&(pSrcB[(j * 4 + 1) * P + (k * 4)]));
+        v4s temp2 = *((v4s *)&(pSrcB[(j * 4 + 2) * P + (k * 4)]));
+        v4s temp3 = *((v4s *)&(pSrcB[(j * 4 + 3) * P + (k * 4)]));
+        // Transpose the chunk; comments give row-major indices within it
+        v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5
+        v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13
+        v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7
+        v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 10,11,14,15
+
+        v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12
+        v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13
+        v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14
+        v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15
+        // bVecX now holds column X of the chunk
+        sum00 = __SUMDOTP4(aVec0, bVec0, sum00);
+        sum01 = __SUMDOTP4(aVec0, bVec1, sum01);
+        sum02 = __SUMDOTP4(aVec0, bVec2, sum02);
+        sum03 = __SUMDOTP4(aVec0, bVec3, sum03);
+        sum10 = __SUMDOTP4(aVec1, bVec0, sum10);
+        sum11 = __SUMDOTP4(aVec1, bVec1, sum11);
+        sum12 = __SUMDOTP4(aVec1, bVec2, sum12);
+        sum13 = __SUMDOTP4(aVec1, bVec3, sum13);
+      }
+      // Write back the finished 2x4 tile of C
+      pDstC[(i * 2) * P + (k * 4)] = sum00;
+      pDstC[(i * 2) * P + (k * 4 + 1)] = sum01;
+      pDstC[(i * 2) * P + (k * 4 + 2)] = sum02;
+      pDstC[(i * 2) * P + (k * 4 + 3)] = sum03;
+      pDstC[(i * 2 + 1) * P + (k * 4)] = sum10;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 1)] = sum11;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 2)] = sum12;
+      pDstC[(i * 2 + 1) * P + (k * 4 + 3)] = sum13;
+    }
+  }
+}
+#endif
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2
+ * data type  = 8-bit integer
+ * multi-core = yes
+ * unrolling  = 8 elements of C per iteration (2x4 chunks)
+ * simd       = yes, Xpulpv2 intrinsics
+ * other      = using pointer incrementing instead of array
+ *              indexing and loads/stores explicitly written
+ *              in asm, for optimal register utilization
+ *
+ * Inspired by plp_mat_mult_i8p_xpulpv2 from pulp-dsp
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(
+    const int8_t *__restrict__ pSrcA, const int8_t *__restrict__ pSrcB,
+    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    uint32_t core_id, uint32_t numThreads) {
+  // Masks for shuffles
+  static v4s mask0 = {0, 1, 4, 5};
+  static v4s mask1 = {2, 3, 6, 7};
+  static v4s mask2 = {0, 2, 4, 6};
+  static v4s mask3 = {1, 3, 5, 7};
+
+  // Loop counter for P
+  uint32_t k = 0;
+  // A pointer step after the second load: one row back, 4 bytes forward
+  int32_t const N_decr = -(int)N + 4;
+  // C pointer step after the 4th store of a row: next row, 3 words back (bytes)
+  uint32_t const P_incr = (P * 4) - 12;
+
+  for (k = core_id; k < P / 4; k += numThreads) {
+    const int8_t *idx_a = &pSrcA[0];      // start_a
+    int32_t *idx_c = &pDstC[k * 4];       // start_c
+    int32_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 4)
+    while (idx_c < end_c) {
+      // Accumulators for one 2x4 tile of C (sumRC: row R, column C of tile)
+      int32_t sum00 = 0;
+      int32_t sum01 = 0;
+      int32_t sum02 = 0;
+      int32_t sum03 = 0;
+      int32_t sum10 = 0;
+      int32_t sum11 = 0;
+      int32_t sum12 = 0;
+      int32_t sum13 = 0;
+
+      int8_t const *end_a = idx_a + N;
+      const int8_t *idx_b = &pSrcB[k * 4]; // start_b
+      while (idx_a < end_a) {
+
+        v4s aVec0, aVec1;
+        v4s temp0, temp1, temp2, temp3;
+
+        __asm__ volatile(
+            "p.lw %[a0], %[a_incr](%[addr_a]!) \n\t"
+            "p.lw %[a1], %[a_decr](%[addr_a]!) \n\t"
+            "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t"
+            "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t"
+            "p.lw %[t2], %[b_incr](%[addr_b]!) \n\t"
+            "p.lw %[t3], %[b_incr](%[addr_b]!) \n\t"
+            : [ a0 ] "=&r"(aVec0), [ a1 ] "=&r"(aVec1), [ t0 ] "=&r"(temp0),
+              [ t1 ] "=&r"(temp1), [ t2 ] "=&r"(temp2), [ t3 ] "=&r"(temp3),
+              [ addr_a ] "+&r"(idx_a), [ addr_b ] "+&r"(idx_b)
+            : [ a_incr ] "r"(N), [ a_decr ] "r"(N_decr), [ b_incr ] "r"(P)
+            : "memory");
+        /* The asm code above implements the following commented C code */
+        // go to next row, same column
+        // v4s aVec0 = *((v4s *)idx_a); idx_a += N;
+        // go to previous row, one column forward
+        // v4s aVec1 = *((v4s *)idx_a); idx_a -= N - 4;
+        // v4s temp0 = *((v4s *)idx_b); idx_b += P;
+        // v4s temp1 = *((v4s *)idx_b); idx_b += P;
+        // v4s temp2 = *((v4s *)idx_b); idx_b += P;
+        // v4s temp3 = *((v4s *)idx_b); idx_b += P;
+
+        // Shuffles to transpose at runtime the chunk extracted from B before
+        // multiplying with A chunk temp0-3 variables needed because shuffles
+        // use rD as source, but also modify it, thus we need a copy of their
+        // content to use it twice in their original form
+        v4s temp4 = __builtin_shuffle(temp0, temp1, mask0); // 0,1,4,5
+        v4s temp5 = __builtin_shuffle(temp2, temp3, mask0); // 8,9,12,13
+        v4s temp6 = __builtin_shuffle(temp0, temp1, mask1); // 2,3,6,7
+        v4s temp7 = __builtin_shuffle(temp2, temp3, mask1); // 10,11,14,15
+
+        v4s bVec0 = __builtin_shuffle(temp4, temp5, mask2); // 0,4,8,12
+        v4s bVec1 = __builtin_shuffle(temp4, temp5, mask3); // 1,5,9,13
+        v4s bVec2 = __builtin_shuffle(temp6, temp7, mask2); // 2,6,10,14
+        v4s bVec3 = __builtin_shuffle(temp6, temp7, mask3); // 3,7,11,15
+        // bVecX now holds column X of the 4x4 chunk of B
+        sum00 = __SUMDOTP4(aVec0, bVec0, sum00);
+        sum01 = __SUMDOTP4(aVec0, bVec1, sum01);
+        sum02 = __SUMDOTP4(aVec0, bVec2, sum02);
+        sum03 = __SUMDOTP4(aVec0, bVec3, sum03);
+        sum10 = __SUMDOTP4(aVec1, bVec0, sum10);
+        sum11 = __SUMDOTP4(aVec1, bVec1, sum11);
+        sum12 = __SUMDOTP4(aVec1, bVec2, sum12);
+        sum13 = __SUMDOTP4(aVec1, bVec3, sum13);
+      }
+      // Write back the finished 2x4 tile of C, leaving idx_c on the next tile
+      __asm__ volatile(
+          "p.sw %[s00], 4(%[addr_c]!) \n\t"
+          "p.sw %[s01], 4(%[addr_c]!) \n\t"
+          "p.sw %[s02], 4(%[addr_c]!) \n\t"
+          "p.sw %[s03], %[c_incr](%[addr_c]!) \n\t"
+          "p.sw %[s10], 4(%[addr_c]!) \n\t"
+          "p.sw %[s11], 4(%[addr_c]!) \n\t"
+          "p.sw %[s12], 4(%[addr_c]!) \n\t"
+          "p.sw %[s13], %[c_incr](%[addr_c]!) \n\t"
+          : [ addr_c ] "+&r"(idx_c)
+          : [ s00 ] "r"(sum00), [ s01 ] "r"(sum01), [ s02 ] "r"(sum02),
+            [ s03 ] "r"(sum03), [ s10 ] "r"(sum10), [ s11 ] "r"(sum11),
+            [ s12 ] "r"(sum12), [ s13 ] "r"(sum13), [ c_incr ] "r"(P_incr)
+          : "memory");
+      /* The asm code above implements the following commented C code */
+      // *(idx_c++) = sum00;
+      // *(idx_c++) = sum01;
+      // *(idx_c++) = sum02;
+      // *(idx_c) = sum03; idx_c += P - 3;
+      // *(idx_c++) = sum10;
+      // *(idx_c++) = sum11;
+      // *(idx_c++) = sum12;
+      // *(idx_c) = sum13; idx_c += P - 3;
+
+      idx_a += N; // adjust A matrix pointer
+    }
+  }
+}
+#endif
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_4x2_parallel_i16_xpulpv2
+ * data type  = 16-bit integer
+ * multi-core = yes
+ * unrolling  = 8 elements of C per iteration (4x2 chunks)
+ * simd       = yes, Xpulpv2 intrinsics
+ *
+ * Original plp_mat_mult_i16p_xpulpv2 from pulp-dsp
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_4x2_parallel_i16_xpulpv2(const int16_t *__restrict__ pSrcA,
+                                              const int16_t *__restrict__ pSrcB,
+                                              int32_t *__restrict__ pDstC,
+                                              uint32_t M, uint32_t N,
+                                              uint32_t P, uint32_t core_id,
+                                              uint32_t numThreads) {
+  uint32_t i = 0; // loop counter for M
+  uint32_t j = 0; // loop counter for N
+  uint32_t k = 0; // loop counter for P
+  // Work split: core_id takes every numThreads-th pair of columns of C
+  for (k = core_id; k < P / 2; k += numThreads) {
+    for (i = 0; i < M / 4; i++) {
+      // Accumulators for one 4x2 tile of C (sumRC: row R, column C of tile)
+      int32_t sum00 = 0;
+      int32_t sum01 = 0;
+      int32_t sum10 = 0;
+      int32_t sum11 = 0;
+      int32_t sum20 = 0;
+      int32_t sum21 = 0;
+      int32_t sum30 = 0;
+      int32_t sum31 = 0;
+
+      for (j = 0; j < N / 2; j++) {
+        // Two consecutive elements from each of rows 4i .. 4i + 3 of A
+        v2s aVec0 = *((v2s *)&(pSrcA[(i * 4) * N + (j * 2)]));
+        v2s aVec1 = *((v2s *)&(pSrcA[(i * 4 + 1) * N + (j * 2)]));
+        v2s aVec2 = *((v2s *)&(pSrcA[(i * 4 + 2) * N + (j * 2)]));
+        v2s aVec3 = *((v2s *)&(pSrcA[(i * 4 + 3) * N + (j * 2)]));
+        // 2x2 chunk of B, one row per vector
+        v2s bTemp0 = *((v2s *)&(pSrcB[(j * 2) * P + (k * 2)]));
+        v2s bTemp1 = *((v2s *)&(pSrcB[(j * 2 + 1) * P + (k * 2)]));
+        // Transpose the chunk: bVecX holds column X (lane X of both rows)
+        v2s bVec0 = __builtin_shuffle(bTemp0, bTemp1, (v2s){0, 2});
+        v2s bVec1 = __builtin_shuffle(bTemp0, bTemp1, (v2s){1, 3});
+
+        sum00 = __SUMDOTP2(aVec0, bVec0, sum00);
+        sum01 = __SUMDOTP2(aVec0, bVec1, sum01);
+        sum10 = __SUMDOTP2(aVec1, bVec0, sum10);
+        sum11 = __SUMDOTP2(aVec1, bVec1, sum11);
+        sum20 = __SUMDOTP2(aVec2, bVec0, sum20);
+        sum21 = __SUMDOTP2(aVec2, bVec1, sum21);
+        sum30 = __SUMDOTP2(aVec3, bVec0, sum30);
+        sum31 = __SUMDOTP2(aVec3, bVec1, sum31);
+      }
+      // Write back the finished 4x2 tile of C
+      pDstC[(i * 4) * P + (k * 2)] = sum00;
+      pDstC[(i * 4) * P + (k * 2 + 1)] = sum01;
+      pDstC[(i * 4 + 1) * P + (k * 2)] = sum10;
+      pDstC[(i * 4 + 1) * P + (k * 2 + 1)] = sum11;
+      pDstC[(i * 4 + 2) * P + (k * 2)] = sum20;
+      pDstC[(i * 4 + 2) * P + (k * 2 + 1)] = sum21;
+      pDstC[(i * 4 + 3) * P + (k * 2)] = sum30;
+      pDstC[(i * 4 + 3) * P + (k * 2 + 1)] = sum31;
+    }
+  }
+}
+#endif
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2
+ * data type  = 16-bit integer
+ * multi-core = yes
+ * unrolling  = 8 elements of C per iteration (4x2 chunks)
+ * simd       = yes, Xpulpv2 intrinsics
+ * other      = using pointer incrementing instead of array
+ *              indexing and loads/stores explicitly written
+ *              in asm, for optimal register utilization
+ *
+ * Inspired from plp_mat_mult_i16p_xpulpv2 from pulp-dsp
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2(
+    const int16_t *__restrict__ pSrcA, const int16_t *__restrict__ pSrcB,
+    int32_t *__restrict__ pDstC, uint32_t M, uint32_t N, uint32_t P,
+    uint32_t core_id, uint32_t numThreads) {
+  // Loop counter for P
+  uint32_t k = 0;
+  // Increment for A matrix = 1 row forward
+  uint32_t const A_incr = N * sizeof(int16_t);
+  // Decrement for A matrix = 3 rows backward and 2 elements forward
+  int32_t const A_decr =
+      -(int)(N * 3 * sizeof(int16_t)) + 2 * (int)sizeof(int16_t);
+  // Increment for B matrix = 1 row forward
+  uint32_t const B_incr = P * sizeof(int16_t); // bytes in 1 row
+  // Increment for C matrix = 1 row forward and 1 word backward
+  uint32_t const C_incr = (P * sizeof(int32_t)) - sizeof(int32_t);
+
+  for (k = core_id; k < P / 2; k += numThreads) {
+    const int16_t *idx_a = &pSrcA[0];     // start_a
+    int32_t *idx_c = &pDstC[k * 2];       // start_c
+    int32_t const *end_c = &pDstC[P * M]; // actually (P * M) + (k * 2)
+
+    while (idx_c < end_c) {
+      // Each pass computes one 4x2 tile of C
+      int32_t sum00 = 0;
+      int32_t sum01 = 0;
+      int32_t sum10 = 0;
+      int32_t sum11 = 0;
+      int32_t sum20 = 0;
+      int32_t sum21 = 0;
+      int32_t sum30 = 0;
+      int32_t sum31 = 0;
+
+      int16_t const *end_a = idx_a + N;
+      const int16_t *idx_b = &pSrcB[k * 2]; // start_b
+
+      while (idx_a < end_a) {
+        // Each pass reads 2 elements from 4 rows of A and from 2 rows of B
+        v2s aVec0, aVec1, aVec2, aVec3;
+        v2s bTemp0, bTemp1;
+
+        __asm__ volatile("p.lw %[a0], %[a_incr](%[addr_a]!) \n\t"
+                         "p.lw %[a1], %[a_incr](%[addr_a]!) \n\t"
+                         "p.lw %[a2], %[a_incr](%[addr_a]!) \n\t"
+                         "p.lw %[a3], %[a_decr](%[addr_a]!) \n\t"
+                         "p.lw %[t0], %[b_incr](%[addr_b]!) \n\t"
+                         "p.lw %[t1], %[b_incr](%[addr_b]!) \n\t"
+                         : [ a0 ] "=&r"(aVec0), [ a1 ] "=&r"(aVec1),
+                           [ a2 ] "=&r"(aVec2), [ a3 ] "=&r"(aVec3),
+                           [ t0 ] "=&r"(bTemp0), [ t1 ] "=&r"(bTemp1),
+                           [ addr_a ] "+&r"(idx_a), [ addr_b ] "+&r"(idx_b)
+                         : [ a_incr ] "r"(A_incr), [ a_decr ] "r"(A_decr),
+                           [ b_incr ] "r"(B_incr)
+                         : "memory");
+        /* The asm code above implements the following commented C code */
+        // v2s aVec0 = *((v2s *)&(pSrcA[(i * 4) * N + (j * 2)]));
+        // v2s aVec1 = *((v2s *)&(pSrcA[(i * 4 + 1) * N + (j * 2)]));
+        // v2s aVec2 = *((v2s *)&(pSrcA[(i * 4 + 2) * N + (j * 2)]));
+        // v2s aVec3 = *((v2s *)&(pSrcA[(i * 4 + 3) * N + (j * 2)]));
+        // v2s bTemp0 = *((v2s *)&(pSrcB[(j * 2) * P + (k * 2)]));
+        // v2s bTemp1 = *((v2s *)&(pSrcB[(j * 2 + 1) * P + (k * 2)]));
+
+        v2s bVec0 = __builtin_shuffle(bTemp0, bTemp1, (v2s){0, 2});
+        v2s bVec1 = __builtin_shuffle(bTemp0, bTemp1, (v2s){1, 3});
+
+        sum00 = __SUMDOTP2(aVec0, bVec0, sum00);
+        sum01 = __SUMDOTP2(aVec0, bVec1, sum01);
+        sum10 = __SUMDOTP2(aVec1, bVec0, sum10);
+        sum11 = __SUMDOTP2(aVec1, bVec1, sum11);
+        sum20 = __SUMDOTP2(aVec2, bVec0, sum20);
+        sum21 = __SUMDOTP2(aVec2, bVec1, sum21);
+        sum30 = __SUMDOTP2(aVec3, bVec0, sum30);
+        sum31 = __SUMDOTP2(aVec3, bVec1, sum31);
+      }
+
+      __asm__ volatile(
+          "p.sw %[s00], 4(%[addr_c]!) \n\t"
+          "p.sw %[s01], %[c_incr](%[addr_c]!) \n\t"
+          "p.sw %[s10], 4(%[addr_c]!) \n\t"
+          "p.sw %[s11], %[c_incr](%[addr_c]!) \n\t"
+          "p.sw %[s20], 4(%[addr_c]!) \n\t"
+          "p.sw %[s21], %[c_incr](%[addr_c]!) \n\t"
+          "p.sw %[s30], 4(%[addr_c]!) \n\t"
+          "p.sw %[s31], %[c_incr](%[addr_c]!) \n\t"
+          : [ addr_c ] "+&r"(idx_c)
+          : [ s00 ] "r"(sum00), [ s01 ] "r"(sum01), [ s10 ] "r"(sum10),
+            [ s11 ] "r"(sum11), [ s20 ] "r"(sum20), [ s21 ] "r"(sum21),
+            [ s30 ] "r"(sum30), [ s31 ] "r"(sum31), [ c_incr ] "r"(C_incr)
+          : "memory");
+      /* The asm code above implements the following commented C code */
+      // pDstC[(i * 4) * P + (k * 2)] = sum00;
+      // pDstC[(i * 4) * P + (k * 2 + 1)] = sum01;
+      // pDstC[(i * 4 + 1) * P + (k * 2)] = sum10;
+      // pDstC[(i * 4 + 1) * P + (k * 2 + 1)] = sum11;
+      // pDstC[(i * 4 + 2) * P + (k * 2)] = sum20;
+      // pDstC[(i * 4 + 2) * P + (k * 2 + 1)] = sum21;
+      // pDstC[(i * 4 + 3) * P + (k * 2)] = sum30;
+      // pDstC[(i * 4 + 3) * P + (k * 2 + 1)] = sum31;
+
+      idx_a += N * 3; // inner loop advanced 1 row; skip the other 3 rows
+    }
+  }
+}
+#endif
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x2_parallel_i32_rv32im
+ * data type  = 32-bit integer
+ * multi-core = yes
+ * unrolling  = 4 elements of C per iteration (2x2 chunks)
+ * simd       = no
+ */
+void matmul_unrolled_2x2_parallel_i32_rv32im(int32_t const *__restrict__ A,
+                                             int32_t const *__restrict__ B,
+                                             int32_t *__restrict__ C,
+                                             uint32_t M, uint32_t N, uint32_t P,
+                                             uint32_t id, uint32_t numThreads) {
+  // id % c = column slice, id / c = row pair; row stride needs numThreads >= c
+  uint32_t const c = 8; // How many columns to split the matrix into
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 2) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+      for (uint32_t k = 0; k < N; k += 2) {
+        // Explicitly load the values first to help with scheduling
+        int32_t val_a00 = A[(i + 0) * N + k + 0];
+        int32_t val_a01 = A[(i + 0) * N + k + 1];
+        int32_t val_a10 = A[(i + 1) * N + k + 0];
+        int32_t val_a11 = A[(i + 1) * N + k + 1];
+        int32_t val_b00 = B[(k + 0) * P + j + 0];
+        int32_t val_b01 = B[(k + 0) * P + j + 1];
+        int32_t val_b10 = B[(k + 1) * P + j + 0];
+        int32_t val_b11 = B[(k + 1) * P + j + 1];
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+      }
+      C[(i + 0) * P + j + 0] = c00;
+      C[(i + 0) * P + j + 1] = c01;
+      C[(i + 1) * P + j + 0] = c10;
+      C[(i + 1) * P + j + 1] = c11;
+    }
+  }
+}
+
+/*
+ * Matrix multiplication ----------------------------------
+ * kernel     = matmul_unrolled_2x2_parallel_i32_xpulpv2
+ * data type  = 32-bit integer
+ * multi-core = yes
+ * unrolling  = 4 elements of C per iteration (2x2 chunks)
+ * simd       = no
+ * other      = loads/stores explicitly written in asm
+ *              for optimal register utilization
+ */
+#ifdef __XPULPIMG
+void matmul_unrolled_2x2_parallel_i32_xpulpv2(int32_t const *__restrict__ A,
+                                              int32_t const *__restrict__ B,
+                                              int32_t *__restrict__ C,
+                                              uint32_t M, uint32_t N,
+                                              uint32_t P, uint32_t id,
+                                              uint32_t numThreads) {
+  // id % c = column slice, id / c = row pair; row stride needs numThreads >= c
+  uint32_t const c = 8; // How many columns to split the matrix into
+  uint32_t const c_start = (P / c) * (id % c);
+  uint32_t const c_end = (P / c) * ((id % c) + 1);
+  // Byte offsets for the post-increment: one row forward, one word backward
+  uint32_t const A_incr = (N * sizeof(int32_t)) - sizeof(int32_t);
+  uint32_t const B_incr = (P * sizeof(int32_t)) - sizeof(int32_t);
+
+  for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) {
+    for (uint32_t j = c_start; j < c_end; j += 2) {
+      int32_t c00 = 0;
+      int32_t c01 = 0;
+      int32_t c10 = 0;
+      int32_t c11 = 0;
+
+      for (uint32_t k = 0; k < N; k += 2) {
+        const int32_t *idx_a = &A[i * N + k];
+        const int32_t *idx_b = &B[k * P + j];
+        int32_t val_a00, val_a01, val_a10, val_a11, val_b00, val_b01, val_b10,
+            val_b11;
+        __asm__ volatile("p.lw %[a00], 4(%[addr_a]!) \n\t"
+                         "p.lw %[a01], %[a_incr](%[addr_a]!) \n\t"
+                         "p.lw %[a10], 4(%[addr_a]!) \n\t"
+                         "p.lw %[a11], 0(%[addr_a]) \n\t"
+                         "p.lw %[b00], 4(%[addr_b]!) \n\t"
+                         "p.lw %[b01], %[b_incr](%[addr_b]!) \n\t"
+                         "p.lw %[b10], 4(%[addr_b]!) \n\t"
+                         "p.lw %[b11], 0(%[addr_b]) \n\t"
+                         : [ a00 ] "=&r"(val_a00), [ a01 ] "=&r"(val_a01),
+                           [ a10 ] "=&r"(val_a10), [ a11 ] "=&r"(val_a11),
+                           [ b00 ] "=&r"(val_b00), [ b01 ] "=&r"(val_b01),
+                           [ b10 ] "=&r"(val_b10), [ b11 ] "=&r"(val_b11),
+                           [ addr_a ] "+&r"(idx_a), [ addr_b ] "+&r"(idx_b)
+                         : [ a_incr ] "r"(A_incr), [ b_incr ] "r"(B_incr)
+                         : "memory");
+        /* The asm code above implements the following commented C code */
+        // int32_t val_a00 = A[(i + 0) * N + k + 0];
+        // int32_t val_a01 = A[(i + 0) * N + k + 1];
+        // int32_t val_a10 = A[(i + 1) * N + k + 0];
+        // int32_t val_a11 = A[(i + 1) * N + k + 1];
+        // int32_t val_b00 = B[(k + 0) * P + j + 0];
+        // int32_t val_b01 = B[(k + 0) * P + j + 1];
+        // int32_t val_b10 = B[(k + 1) * P + j + 0];
+        // int32_t val_b11 = B[(k + 1) * P + j + 1];
+        c00 += val_a00 * val_b00;
+        c00 += val_a01 * val_b10;
+        c01 += val_a00 * val_b01;
+        c01 += val_a01 * val_b11;
+        c10 += val_a10 * val_b00;
+        c10 += val_a11 * val_b10;
+        c11 += val_a10 * val_b01;
+        c11 += val_a11 * val_b11;
+      }
+      int32_t *idx_c = &C[i * P + j];
+      __asm__ volatile("p.sw %[s00], 4(%[addr_c]!) \n\t"
+                       "p.sw %[s01], %[c_incr](%[addr_c]!) \n\t"
+                       "p.sw %[s10], 4(%[addr_c]!) \n\t"
+                       "p.sw %[s11], 0(%[addr_c]) \n\t"
+                       : [ addr_c ] "+&r"(idx_c)
+                       : [ s00 ] "r"(c00), [ s01 ] "r"(c01), [ s10 ] "r"(c10),
+                         [ s11 ] "r"(c11), [ c_incr ] "r"(B_incr)
+                       : "memory");
+      /* The asm code above implements the following commented C code */
+      // C[(i + 0) * P + j + 0] = c00;
+      // C[(i + 0) * P + j + 1] = c01;
+      // C[(i + 1) * P + j + 0] = c10;
+      // C[(i + 1) * P + j + 1] = c11;
+    }
+  }
+}
+#endif
diff --git a/apps/conv2d_i8/main.c b/apps/conv2d_i8/main.c
new file mode 100644
index 000000000..81ebee8ec
--- /dev/null
+++ b/apps/conv2d_i8/main.c
@@ -0,0 +1,111 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/conv_2d.h"
+
+#define M 32
+#define N 32
+#define KERNEL_N 3
+//#define VERBOSE_IN
+//#define VERBOSE_OUT
+
+volatile int8_t in[M * N] __attribute__((section(".l1_prio")));
+volatile int32_t out[M * N] __attribute__((section(".l1_prio")));
+volatile uint8_t kernel[KERNEL_N * KERNEL_N] __attribute__((section(".l1")));
+volatile int error __attribute__((section(".l1")));
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id, num_cores);
+
+  mempool_barrier(num_cores, num_cores / 2);
+  // Only core 0 runs the test; the others skip to the final barrier
+  if (core_id == 0) {
+    // Initialize error
+    error = 0;
+    // Initialize kernel
+    kernel[0] = 1;
+    kernel[1] = 2;
+    kernel[2] = 1;
+
+    kernel[3] = 2;
+    kernel[4] = 4;
+    kernel[5] = 2;
+
+    kernel[6] = 1;
+    kernel[7] = 2;
+    kernel[8] = 1;
+
+    // Initialize img
+    init_conv2d_image_i8(in, N, M);
+
+#ifdef VERBOSE_IN
+    printf("A:\n");
+    for (int i = 0; i < M; i++) {
+      for (int j = 0; j < N; j++) {
+        printf("%4d ", in[i * N + j]); // int8_t is signed: print with %d
+      }
+      printf("\n");
+    }
+    printf("kernel:\n");
+    for (int i = 0; i < KERNEL_N; i++) {
+      for (int j = 0; j < KERNEL_N; j++) {
+        printf("%4u ", kernel[i * KERNEL_N + j]);
+      }
+      printf("\n");
+    }
+#endif
+
+    mempool_start_benchmark();
+#ifdef __XPULPIMG
+    conv2d_3x3_unrolled2_i8_xpulpv2(in, out, M, N, kernel);
+#else
+    conv2d_3x3_unrolled2_i8_rv32im(in, N, M, kernel, out);
+#endif
+    mempool_stop_benchmark();
+
+#ifdef VERBOSE_OUT
+    printf("out:\n");
+    for (int i = 1; i < M - 1; i++) {
+      for (int j = 1; j < N - 1; j++) {
+        printf("%4d ", (int)out[i * N + j]); // int32_t result is signed
+      }
+      printf("\n");
+    }
+#endif
+
+    // verify_conv2d_image_i8_verbose(out, N, M);
+    // Check result
+    if (verify_conv2d_image_i8(out, N, M)) {
+      error = 1;
+    }
+  }
+
+  // wait until all cores have finished
+  mempool_barrier(num_cores, 4 * num_cores);
+
+  return error;
+}
diff --git a/apps/convolution/main.c b/apps/convolution/main.c
index 990b23c34..df47be73d 100644
--- a/apps/convolution/main.c
+++ b/apps/convolution/main.c
@@ -125,7 +125,7 @@ int main() {
     mempool_barrier(num_cores, num_cores * 4);
     // Check result
     if (verify_conv2d_image(out, N, M, core_id, num_cores)) {
-      __atomic_fetch_or(&error, i, __ATOMIC_SEQ_CST);
+      amo_or(&error, (unsigned)i);
     }
   }
 
diff --git a/apps/matmul_i16/main.c b/apps/matmul_i16/main.c
new file mode 100644
index 000000000..e25b12a3e
--- /dev/null
+++ b/apps/matmul_i16/main.c
@@ -0,0 +1,151 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/mat_mul.h"
+
+// Define Matrix dimensions:
+// C = AB with A=[MxN], B=[NxP], C=[MxP]
+#define matrix_M 64
+#define matrix_N 64
+#define matrix_P 64
+
+int16_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int16_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+
+int volatile error __attribute__((section(".l1")));
+// Fill matrix[i][j] = a * i + b * j + c, with the work split across cores
+void init_matrix(int16_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int16_t a, int16_t b, int16_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const split = 8; // How many rows/columns to split the matrix into
+  if (num_columns > num_rows) {
+    // Cores stride over columns; core_id % split picks a chunk of rows
+    uint32_t const c_start = (num_rows / split) * (core_id % split);
+    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
+    for (uint32_t j = (core_id / split); j < num_columns;
+         j += (num_cores / split)) {
+      for (uint32_t i = c_start; i < c_end; ++i) {
+        matrix[i * num_columns + j] = a * (int16_t)i + b * (int16_t)j + c;
+      }
+    }
+  } else {
+    // Cores stride over rows; core_id % split picks a chunk of columns
+    uint32_t const c_start = (num_columns / split) * (core_id % split);
+    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
+    for (uint32_t i = (core_id / split); i < num_rows;
+         i += (num_cores / split)) {
+      for (uint32_t j = c_start; j < c_end; ++j) {
+        matrix[i * num_columns + j] = a * (int16_t)i + b * (int16_t)j + c;
+      }
+    }
+  }
+}
+
+// Verify C in parallel: 0 if OK, else index of first mismatch (-1 for [0][0])
+int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                  uint32_t inner_dim, int16_t aa, int16_t ab, int16_t ac,
+                  int16_t ba, int16_t bb, int16_t bc, uint32_t core_id,
+                  uint32_t num_cores) {
+  // Convert to signed
+  int32_t n = (int32_t)inner_dim;
+  // Parallelize over rows
+  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      int32_t ii = (int32_t)i;
+      int32_t jj = (int32_t)j;
+      int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj +
+                     (int32_t)ac * bc) *
+                    n;
+      int32_t qua =
+          (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) *
+           (n * (n - 1))) /
+          2;
+      int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
+      int32_t golden = lin + qua + cub;
+      if (matrix[i * num_columns + j] != golden) {
+        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
+      }
+      matrix[i * num_columns + j] = 0; // clear the verified entry
+    }
+  }
+  return 0;
+}
+
+int test_matrix_multiplication(int16_t *__restrict__ A, int16_t *__restrict__ B,
+                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
+                               uint32_t P, uint32_t core_id,
+                               uint32_t num_cores) {
+  int16_t const A_a = 1;
+  int16_t const A_b = 1;
+  int16_t const A_c = -40;
+  int16_t const B_a = 0;
+  int16_t const B_b = 1;
+  int16_t const B_c = 19;
+
+  // Initialize A and B as matrix[i][j] = a * i + b * j + c (see init_matrix)
+  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
+  // Wait at barrier until everyone is ready
+  mempool_barrier(num_cores, num_cores / 2);
+  // Execute function to test.
+  mempool_start_benchmark();
+
+#ifdef __XPULPIMG
+  matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2(A, B, C, M, N, P, core_id,
+                                                     num_cores);
+#else
+  matmul_unrolled_2x2_parallel_i16_rv32im(A, B, C, M, N, P, core_id, num_cores);
+#endif
+
+  mempool_stop_benchmark();
+  // Wait at barrier before checking
+  mempool_barrier(num_cores, num_cores * 4);
+  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
+                    num_cores)) {
+    error = 1;
+    return -1;
+  }
+  return 0;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id, num_cores);
+  // Only core 0 clears the global error flag
+  if (core_id == 0) {
+    error = 0;
+  }
+
+  // Test the Matrix multiplication
+  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
+                             matrix_P, core_id, num_cores);
+  // Wait until all cores have finished before returning the error flag
+  mempool_barrier(num_cores, num_cores * 4);
+
+  return error;
+}
diff --git a/apps/matmul_i32/main.c b/apps/matmul_i32/main.c
new file mode 100644
index 000000000..6fb3d422e
--- /dev/null
+++ b/apps/matmul_i32/main.c
@@ -0,0 +1,149 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/mat_mul.h"
+
+// Define Matrix dimensions:
+// C = AB with A=[MxN], B=[NxP], C=[MxP]
+#define matrix_M 64
+#define matrix_N 32
+#define matrix_P 64
+
+int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+
+int volatile error __attribute__((section(".l1")));
+// Fill matrix[i][j] = a * i + b * j + c, with the work split across cores
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const split = 8; // How many rows/columns to split the matrix into
+  if (num_columns > num_rows) {
+    // Cores stride over columns; core_id % split picks a chunk of rows
+    uint32_t const c_start = (num_rows / split) * (core_id % split);
+    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
+    for (uint32_t j = (core_id / split); j < num_columns;
+         j += (num_cores / split)) {
+      for (uint32_t i = c_start; i < c_end; ++i) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  } else {
+    // Cores stride over rows; core_id % split picks a chunk of columns
+    uint32_t const c_start = (num_columns / split) * (core_id % split);
+    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
+    for (uint32_t i = (core_id / split); i < num_rows;
+         i += (num_cores / split)) {
+      for (uint32_t j = c_start; j < c_end; ++j) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  }
+}
+
+// Verify C in parallel: 0 if OK, else index of first mismatch (-1 for [0][0])
+int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                  uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac,
+                  int32_t ba, int32_t bb, int32_t bc, uint32_t core_id,
+                  uint32_t num_cores) {
+  // Convert to signed
+  int32_t n = (int32_t)inner_dim;
+  // Parallelize over rows
+  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      int32_t ii = (int32_t)i;
+      int32_t jj = (int32_t)j;
+      int32_t lin =
+          (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n;
+      int32_t qua =
+          ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) /
+          2;
+      int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
+      int32_t golden = lin + qua + cub;
+      if (matrix[i * num_columns + j] != golden) {
+        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
+      }
+      matrix[i * num_columns + j] = 0; // clear the verified entry
+    }
+  }
+  return 0;
+}
+
+int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B,
+                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
+                               uint32_t P, uint32_t core_id,
+                               uint32_t num_cores) {
+  int32_t const A_a = 1;
+  int32_t const A_b = 1;
+  int32_t const A_c = -32;
+  int32_t const B_a = 2;
+  int32_t const B_b = 1;
+  int32_t const B_c = 16;
+
+  // Initialize A and B as matrix[i][j] = a * i + b * j + c (see init_matrix)
+  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
+  // Wait at barrier until everyone is ready
+  mempool_barrier(num_cores, num_cores / 2);
+  // Execute function to test.
+  mempool_start_benchmark();
+
+#ifdef __XPULPIMG
+  matmul_unrolled_2x2_parallel_i32_xpulpv2(A, B, C, M, N, P, core_id,
+                                           num_cores);
+#else
+  matmul_unrolled_2x2_parallel_i32_rv32im(A, B, C, M, N, P, core_id, num_cores);
+#endif
+
+  mempool_stop_benchmark();
+  // Wait at barrier before checking
+  mempool_barrier(num_cores, num_cores * 4);
+  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
+                    num_cores)) {
+    error = 1;
+    return -1;
+  }
+  return 0;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id, num_cores);
+  // Only core 0 clears the global error flag
+  if (core_id == 0) {
+    error = 0;
+  }
+
+  // Test the Matrix multiplication
+  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
+                             matrix_P, core_id, num_cores);
+  // Wait until all cores have finished before returning the error flag
+  mempool_barrier(num_cores, num_cores * 4);
+
+  return error;
+}
diff --git a/apps/matmul_i8/main.c b/apps/matmul_i8/main.c
new file mode 100644
index 000000000..6182458a8
--- /dev/null
+++ b/apps/matmul_i8/main.c
@@ -0,0 +1,153 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Samuel Riedel, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/mat_mul.h"
+
+// Define Matrix dimensions:
+// C = AB with A=[MxN], B=[NxP], C=[MxP]
+#define matrix_M 64
+#define matrix_N 64
+#define matrix_P 64
+
+int8_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int8_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+
+int volatile error __attribute__((section(".l1")));
+// Fill matrix[i][j] = a * i + b * j + c, with the work split across cores
+void init_matrix(int8_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int8_t a, int8_t b, int8_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const split = 8; // How many rows/columns to split the matrix into
+  if (num_columns > num_rows) {
+    // Cores stride over columns; core_id % split picks a chunk of rows
+    uint32_t const c_start = (num_rows / split) * (core_id % split);
+    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
+    for (uint32_t j = (core_id / split); j < num_columns;
+         j += (num_cores / split)) {
+      for (uint32_t i = c_start; i < c_end; ++i) {
+        matrix[i * num_columns + j] = a * (int8_t)i + b * (int8_t)j + c;
+      }
+    }
+  } else {
+    // Cores stride over rows; core_id % split picks a chunk of columns
+    uint32_t const c_start = (num_columns / split) * (core_id % split);
+    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
+    for (uint32_t i = (core_id / split); i < num_rows;
+         i += (num_cores / split)) {
+      for (uint32_t j = c_start; j < c_end; ++j) {
+        matrix[i * num_columns + j] = a * (int8_t)i + b * (int8_t)j + c;
+      }
+    }
+  }
+}
+
+// Verify C in parallel: 0 if OK, else index of first mismatch (-1 for [0][0])
+int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                  uint32_t inner_dim, int8_t aa, int8_t ab, int8_t ac,
+                  int8_t ba, int8_t bb, int8_t bc, uint32_t core_id,
+                  uint32_t num_cores) {
+  // Convert to signed
+  int32_t n = (int32_t)inner_dim;
+  // Parallelize over rows
+  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      int32_t ii = (int32_t)i;
+      int32_t jj = (int32_t)j;
+      int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj +
+                     (int32_t)ac * bc) *
+                    n;
+      int32_t qua =
+          (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) *
+           (n * (n - 1))) /
+          2;
+      int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
+      int32_t golden = lin + qua + cub;
+      if (matrix[i * num_columns + j] != golden) {
+        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
+      }
+      matrix[i * num_columns + j] = 0; // clear the verified entry
+    }
+  }
+  return 0;
+}
+
+int test_matrix_multiplication(int8_t *__restrict__ A, int8_t *__restrict__ B,
+                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
+                               uint32_t P, uint32_t core_id,
+                               uint32_t num_cores) {
+  int8_t const A_a = 1;
+  int8_t const A_b = 1;
+  int8_t const A_c = -40;
+  int8_t const B_a = 0;
+  int8_t const B_b = 1;
+  int8_t const B_c = 19;
+
+  // Initialize A and B as matrix[i][j] = a * i + b * j + c (see init_matrix)
+  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
+  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
+  // Wait at barrier until everyone is ready
+  mempool_barrier(num_cores, num_cores / 2);
+  // Execute function to test.
+  mempool_start_benchmark();
+
+#ifdef __XPULPIMG
+  matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
+                                                    num_cores);
+  // matmul_unrolled_2x4_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
+  // num_cores);
+#else
+  matmul_unrolled_2x2_parallel_i8_rv32im(A, B, C, M, N, P, core_id, num_cores);
+#endif
+
+  mempool_stop_benchmark();
+  // Wait at barrier before checking
+  mempool_barrier(num_cores, num_cores * 4);
+  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
+                    num_cores)) {
+    error = 1;
+    return -1;
+  }
+  return 0;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id, num_cores);
+  // Only core 0 clears the global error flag
+  if (core_id == 0) {
+    error = 0;
+  }
+
+  // Test the Matrix multiplication
+  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
+                             matrix_P, core_id, num_cores);
+  // Wait until all cores have finished before returning the error flag
+  mempool_barrier(num_cores, num_cores * 4);
+
+  return error;
+}
diff --git a/apps/riscv-tests/isa/Makefile b/apps/riscv-tests/isa/Makefile
index c7f9b5607..542e05167 100644
--- a/apps/riscv-tests/isa/Makefile
+++ b/apps/riscv-tests/isa/Makefile
@@ -52,9 +52,11 @@ vpath %.S $(src_dir)
 	$(RISCV_OBJDUMP) $< > $@
 
 %.out: %
+	PATH="$(MEMPOOL_DIR)/install/riscv-isa-sim/bin:$$PATH"; \
 	$(RISCV_SIM) --isa=rv64gc $< 2> $@
 
 %.out32: %
+	PATH="$(MEMPOOL_DIR)/install/riscv-isa-sim/bin:$$PATH"; \
 	$(RISCV_SIM) --isa=rv32gc $< 2> $@
 
 define compile_template
diff --git a/apps/riscv-tests/isa/macros/scalar/test_macros.h b/apps/riscv-tests/isa/macros/scalar/test_macros.h
index 0eacde614..10b31a5e5 100644
--- a/apps/riscv-tests/isa/macros/scalar/test_macros.h
+++ b/apps/riscv-tests/isa/macros/scalar/test_macros.h
@@ -3,6 +3,7 @@
 #ifndef __TEST_MACROS_SCALAR_H
 #define __TEST_MACROS_SCALAR_H
 
+// clang-format off
 
 #-----------------------------------------------------------------------
 # Helper macros
@@ -141,6 +142,109 @@ test_ ## testnum: \
       inst x0, x1, ZEXT_UIMM5(imm); \
     )
 
+#-----------------------------------------------------------------------
+# Tests for Xpulpimg instructions with 6-bit unsigned immediate operand
+#-----------------------------------------------------------------------
+
+#define ZEXT_UIMM6(x) ((x) & 0x3F)
+
+#define TEST_UIMM6_OP( testnum, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x1, MASK_XLEN(val1); \
+      inst x14, x1, ZEXT_UIMM6(imm); \
+    )
+
+#define TEST_UIMM6_SRC1_EQ_DEST( testnum, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x1, result, \
+      li  x1, MASK_XLEN(val1); \
+      inst x1, x1, ZEXT_UIMM6(imm); \
+    )
+
+#define TEST_UIMM6_DEST_BYPASS( testnum, nop_cycles, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x6, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      inst x14, x1, ZEXT_UIMM6(imm); \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      addi  x6, x14, 0; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_UIMM6_SRC1_BYPASS( testnum, nop_cycles, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      inst x14, x1, ZEXT_UIMM6(imm); \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_UIMM6_ZEROSRC1( testnum, inst, result, imm ) \
+    TEST_CASE( testnum, x1, result, \
+      inst x1, x0, ZEXT_UIMM6(imm); \
+    )
+
+#define TEST_UIMM6_ZERODEST( testnum, inst, val1, imm ) \
+    TEST_CASE( testnum, x0, 0, \
+      li  x1, MASK_XLEN(val1); \
+      inst x0, x1, ZEXT_UIMM6(imm); \
+    )
+
+#-----------------------------------------------------------------------
+# Tests for Xpulpimg instructions with 6-bit signed immediate operand
+#-----------------------------------------------------------------------
+#define SEXT_IMM6(x) ((x) | (-(((x) >> 5) & 1) << 5))
+
+#define TEST_SIMM6_OP( testnum, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x1, MASK_XLEN(val1); \
+      inst x14, x1, SEXT_IMM6(imm); \
+    )
+
+#define TEST_SIMM6_SRC1_EQ_DEST( testnum, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x1, result, \
+      li  x1, MASK_XLEN(val1); \
+      inst x1, x1, SEXT_IMM6(imm); \
+    )
+
+#define TEST_SIMM6_DEST_BYPASS( testnum, nop_cycles, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x6, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      inst x14, x1, SEXT_IMM6(imm); \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      addi  x6, x14, 0; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_SIMM6_SRC1_BYPASS( testnum, nop_cycles, inst, result, val1, imm ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      inst x14, x1, SEXT_IMM6(imm); \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_SIMM6_ZEROSRC1( testnum, inst, result, imm ) \
+    TEST_CASE( testnum, x1, result, \
+      inst x1, x0, SEXT_IMM6(imm); \
+    )
+
+#define TEST_SIMM6_ZERODEST( testnum, inst, val1, imm ) \
+    TEST_CASE( testnum, x0, 0, \
+      li  x1, MASK_XLEN(val1); \
+      inst x0, x1, SEXT_IMM6(imm); \
+    )
+
 #-----------------------------------------------------------------------
 # Tests for an instruction with register operands
 #-----------------------------------------------------------------------
@@ -264,7 +368,164 @@ test_ ## testnum: \
     )
 
 #-----------------------------------------------------------------------
-# Test memory instructions
+# Tests for instructions with 3 register operands
+#-----------------------------------------------------------------------
+
+#define TEST_RRR_OP( testnum, inst, result, val1, val2, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x1, MASK_XLEN(val1); \
+      li  x2, MASK_XLEN(val2); \
+      li x14, MASK_XLEN(val3); \
+      inst x14, x1, x2; \
+    )
+
+#define TEST_RRR_SRC1_EQ_DEST( testnum, inst, result, val1, val2 ) \
+    TEST_CASE( testnum, x1, result, \
+      li  x1, MASK_XLEN(val1); \
+      li  x2, MASK_XLEN(val2); \
+      inst x1, x1, x2; \
+    )
+
+#define TEST_RRR_SRC2_EQ_DEST( testnum, inst, result, val1, val2 ) \
+    TEST_CASE( testnum, x2, result, \
+      li  x1, MASK_XLEN(val1); \
+      li  x2, MASK_XLEN(val2); \
+      inst x2, x1, x2; \
+    )
+
+#define TEST_RRR_SRC12_EQ_DEST( testnum, inst, result, val1 ) \
+    TEST_CASE( testnum, x1, result, \
+      li  x1, MASK_XLEN(val1); \
+      inst x1, x1, x1; \
+    )
+
+#define TEST_RRR_DEST_BYPASS( testnum, nop_cycles, inst, result, val1, val2, val3 ) \
+    TEST_CASE( testnum, x6, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      li  x2, MASK_XLEN(val2); \
+      li x14, MASK_XLEN(val3); \
+      inst x14, x1, x2; \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      addi  x6, x14, 0; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_RRR_SRC12_BYPASS( testnum, src1_nops, src2_nops, inst, result, val1, val2, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x4, 0; \
+1:    li x14, MASK_XLEN(val3); \
+      li  x1, MASK_XLEN(val1); \
+      TEST_INSERT_NOPS_ ## src1_nops \
+      li  x2, MASK_XLEN(val2); \
+      TEST_INSERT_NOPS_ ## src2_nops \
+      inst x14, x1, x2; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_RRR_SRC21_BYPASS( testnum, src1_nops, src2_nops, inst, result, val1, val2, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x4, 0; \
+1:    li x14, MASK_XLEN(val3); \
+      li  x2, MASK_XLEN(val2); \
+      TEST_INSERT_NOPS_ ## src1_nops \
+      li  x1, MASK_XLEN(val1); \
+      TEST_INSERT_NOPS_ ## src2_nops \
+      inst x14, x1, x2; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+# rD (x14) is also read as a third source; to limit the number of tests, its bypass is only exercised on its own
+#define TEST_RRR_SRC3_BYPASS( testnum, nop_cycles, inst, result, val1, val2, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x4, 0; \
+1:    li  x1, MASK_XLEN(val1); \
+      li  x2, MASK_XLEN(val2); \
+      li x14, MASK_XLEN(val3); \
+      TEST_INSERT_NOPS_ ## nop_cycles \
+      inst x14, x1, x2; \
+      addi  x4, x4, 1; \
+      li  x5, 2; \
+      bne x4, x5, 1b \
+    )
+
+#define TEST_RRR_ZEROSRC1( testnum, inst, result, val1, val2 ) \
+    TEST_CASE( testnum, x2, result, \
+      li x1, MASK_XLEN(val1); \
+      li x2, MASK_XLEN(val2); \
+      inst x2, x0, x1; \
+    )
+
+#define TEST_RRR_ZEROSRC2( testnum, inst, result, val1, val2 ) \
+    TEST_CASE( testnum, x2, result, \
+      li x1, MASK_XLEN(val1); \
+      li x2, MASK_XLEN(val2); \
+      inst x2, x1, x0; \
+    )
+
+#define TEST_RRR_ZEROSRC3( testnum, inst, result, val1, val2 ) \
+    TEST_CASE( testnum, x14, result, \
+      li x1, MASK_XLEN(val1); \
+      li x2, MASK_XLEN(val2); \
+      li x14, 0; \
+      inst x14, x1, x2; \
+    )
+
+#define TEST_RRR_ZEROSRC12( testnum, inst, result, val1 ) \
+    TEST_CASE( testnum, x1, result, \
+      li x1, MASK_XLEN(val1); \
+      inst x1, x0, x0; \
+    )
+
+#define TEST_RRR_ZEROSRC123( testnum, inst, result ) \
+    TEST_CASE( testnum, x1, result, \
+      li x1, 0; \
+      inst x1, x0, x0; \
+    )
+
+#define TEST_RRR_ZERODEST( testnum, inst, val1, val2 ) \
+    TEST_CASE( testnum, x0, 0, \
+      li x1, MASK_XLEN(val1); \
+      li x2, MASK_XLEN(val2); \
+      inst x0, x1, x2; \
+    )
+
+#-----------------------------------------------------------------------
+# Tests for Xpulpimg instructions with 2 register operands (rd and rs1)
+# and a 6-bit unsigned immediate input
+#-----------------------------------------------------------------------
+
+#define TEST_RR_UIMM6_OP( testnum, inst, result, val1, imm, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x1, MASK_XLEN(val1); \
+      li x14, MASK_XLEN(val3); \
+      inst x14, x1, ZEXT_UIMM6(imm); \
+    )
+
+# TODO(smazzola): finish writing the macros to cover all test types
+
+#-----------------------------------------------------------------------
+# Tests for Xpulpimg instructions with 2 register operands (rd and rs1)
+# and a 6-bit signed immediate input
+#-----------------------------------------------------------------------
+
+#define TEST_RR_SIMM6_OP( testnum, inst, result, val1, imm, val3 ) \
+    TEST_CASE( testnum, x14, result, \
+      li  x1, MASK_XLEN(val1); \
+      li x14, MASK_XLEN(val3); \
+      inst x14, x1, SEXT_IMM6(imm); \
+    )
+
+# TODO(smazzola): finish writing the macros to cover all test types
+
+#-----------------------------------------------------------------------
+# Test memory instructions (immediate offset)
 #-----------------------------------------------------------------------
 
 #define TEST_LD_OP( testnum, inst, result, offset, base ) \
@@ -340,6 +601,356 @@ test_ ## testnum: \
     li  x5, 2; \
     bne x4, x5, 1b \
 
+#-----------------------------------------------------------------------
+# Test post-increment memory instructions (immediate offset)
+#-----------------------------------------------------------------------
+
+#define TEST_LD_POST_OP( testnum, inst, load_result, offset, base ) \
+    TEST_CASE( testnum, x14, load_result, \
+      la  x1, base; \
+      addi x15, x1, offset; \
+      inst x14, offset(x1!); \
+    ) \
+    bne x1, x15, fail;
+
+#define TEST_ST_POST_OP( testnum, store_inst, load_inst, store_result, offset, base ) \
+    TEST_CASE( testnum, x14, store_result, \
+      la  x1, base; \
+      la x15, base; \
+      li  x2, store_result; \
+      store_inst x2, offset(x1!); \
+      load_inst x14, 0(x15); \
+    ) \
+    addi x15, x15, offset; \
+    bne x1, x15, fail;
+
+#define TEST_LD_POST_DEST1_BYPASS( testnum, nop_cycles, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    inst x14, offset(x1!); \
+    TEST_INSERT_NOPS_ ## nop_cycles \
+    addi  x6, x14, 0; \
+    li  x7, load_result; \
+    bne x6, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b; \
+
+#define TEST_LD_POST_DEST2_BYPASS( testnum, nop_cycles, inst, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    addi x7, x1, offset; \
+    inst x14, offset(x1!); \
+    TEST_INSERT_NOPS_ ## nop_cycles \
+    addi  x6, x1, 0; \
+    bne x6, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b; \
+
+#define TEST_LD_POST_SRC1_BYPASS( testnum, nop_cycles, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    TEST_INSERT_NOPS_ ## nop_cycles \
+    inst x14, offset(x1!); \
+    li  x7, load_result; \
+    bne x14, x7, fail; \
+    la  x15, base; \
+    addi x15, x15, offset; \
+    bne x1, x15, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_ST_POST_SRC12_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    la  x2, base; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, offset(x2!); \
+    la x15, base; \
+    load_inst x14, 0(x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi x15, x15, offset; \
+    bne x2, x15, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_ST_POST_SRC21_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x2, base; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, offset(x2!); \
+    la x15, base; \
+    load_inst x14, 0(x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi x15, x15, offset; \
+    bne x2, x15, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+
+# Make sure Mem[base] != value; otherwise the test may pass even if the load result overwrites the later li
+#define TEST_LD_POST_DEST1_WAW( testnum, inst, value, base ) \
+    TEST_CASE( testnum, x14, value, \
+      la  x1, base; \
+      inst x14, 0(x1!); \
+      li x14, value; \
+    )
+
+#-----------------------------------------------------------------------
+# Test memory instructions (register offset)
+#-----------------------------------------------------------------------
+
+#define TEST_LD_RR_OP( testnum, inst, load_result, offset, base ) \
+    TEST_CASE( testnum, x14, load_result, \
+      la  x1, base; \
+      li x16, offset; \
+      inst x14, x16(x1); \
+    ) \
+    la x15, base; \
+    bne x1, x15, fail;
+
+#define TEST_ST_RR_OP( testnum, store_inst, load_inst, store_result, offset, base ) \
+    TEST_CASE( testnum, x14, store_result, \
+      la  x1, base; \
+      la x15, base; \
+      li x16, offset; \
+      li  x2, store_result; \
+      store_inst x2, x16(x1); \
+      load_inst x14, offset(x15); \
+    ) \
+    bne x1, x15, fail;
+
+#define TEST_LD_RR_DEST_BYPASS( testnum, nop_cycles, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    li x16, offset; \
+    inst x14, x16(x1); \
+    TEST_INSERT_NOPS_ ## nop_cycles \
+    addi  x6, x14, 0; \
+    li  x7, load_result; \
+    bne x6, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b; \
+
+#define TEST_LD_RR_SRC12_BYPASS( testnum, src1_nops, src2_nops, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    inst x14, x16(x1); \
+    li  x7, load_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_LD_RR_SRC21_BYPASS( testnum, src1_nops, src2_nops, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  li x16, offset; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    la  x1, base; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    inst x14, x16(x1); \
+    li  x7, load_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+# Actually reg-reg stores have 3 sources; to avoid too many tests we
+# only test rs1 and rs3 bypass (rs2 bypass already tested by others)
+#define TEST_ST_RR_SRC12_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x2, base; \
+    li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, x16(x2); \
+    la x15, base; \
+    load_inst x14, offset(x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_ST_RR_SRC21_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x2, base; \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, x16(x2); \
+    la x15, base; \
+    load_inst x14, offset(x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+# Make sure Mem[base] != value; otherwise the test may pass even if the load result overwrites the later li
+#define TEST_LD_RR_DEST1_WAW( testnum, inst, value, base ) \
+    TEST_CASE( testnum, x14, value, \
+      la  x1, base; \
+      inst x14, x0(x1); \
+      li x14, value; \
+    ) \
+
+#-----------------------------------------------------------------------
+# Test post-increment memory instructions (register offset)
+#-----------------------------------------------------------------------
+
+#define TEST_LD_RR_POST_OP( testnum, inst, load_result, offset, base ) \
+    TEST_CASE( testnum, x14, load_result, \
+      la  x1, base; \
+      li x16, offset; \
+      inst x14, x16(x1!); \
+    ) \
+    la x15, base; \
+    addi x15, x15, offset; \
+    bne x1, x15, fail;
+
+#define TEST_ST_RR_POST_OP( testnum, store_inst, load_inst, store_result, offset, base ) \
+    TEST_CASE( testnum, x14, store_result, \
+      la  x1, base; \
+      la x15, base; \
+      li x16, offset; \
+      li  x2, store_result; \
+      store_inst x2, x16(x1!); \
+      load_inst x14, 0(x15); \
+    ) \
+    addi x15, x15, offset; \
+    bne x1, x15, fail;
+
+#define TEST_LD_RR_POST_DEST_BYPASS( testnum, nop_cycles, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    li x16, offset; \
+    inst x14, x16(x1!); \
+    TEST_INSERT_NOPS_ ## nop_cycles \
+    addi  x6, x14, 0; \
+    li  x7, load_result; \
+    bne x6, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b; \
+
+#define TEST_LD_RR_POST_SRC12_BYPASS( testnum, src1_nops, src2_nops, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x1, base; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    inst x14, x16(x1!); \
+    li  x7, load_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_LD_RR_POST_SRC21_BYPASS( testnum, src1_nops, src2_nops, inst, load_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  li x16, offset; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    la  x1, base; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    inst x14, x16(x1!); \
+    li  x7, load_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+# Actually reg-reg stores have 3 sources; to avoid too many tests we
+# only test rs1 and rs3 bypass (rs2 bypass already tested by others)
+#define TEST_ST_RR_POST_SRC12_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x2, base; \
+    li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, x16(x2!); \
+    la x15, base; \
+    load_inst x14, 0 (x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+#define TEST_ST_RR_POST_SRC21_BYPASS( testnum, src1_nops, src2_nops, store_inst, load_inst, store_result, offset, base ) \
+test_ ## testnum: \
+    li  TESTNUM, testnum; \
+    li  x4, 0; \
+1:  la  x2, base; \
+    li x16, offset; \
+    TEST_INSERT_NOPS_ ## src1_nops \
+    li  x1, store_result; \
+    TEST_INSERT_NOPS_ ## src2_nops \
+    store_inst x1, x16(x2!); \
+    la x15, base; \
+    load_inst x14, 0(x15); \
+    li  x7, store_result; \
+    bne x14, x7, fail; \
+    addi  x4, x4, 1; \
+    li  x5, 2; \
+    bne x4, x5, 1b \
+
+# Make sure Mem[base] != value; otherwise the test may pass even if the load result overwrites the later li
+#define TEST_LD_RR_POST_DEST1_WAW( testnum, inst, value, base ) \
+    TEST_CASE( testnum, x14, value, \
+      la  x1, base; \
+      inst x14, x0(x1!); \
+      li x14, value; \
+    ) \
+
+#-----------------------------------------------------------------------
+# Test branch instructions
+#-----------------------------------------------------------------------
+
 #define TEST_BR2_OP_TAKEN( testnum, inst, val1, val2 ) \
 test_ ## testnum: \
     li  TESTNUM, testnum; \
@@ -736,4 +1347,6 @@ pass: \
 
 #define TEST_DATA
 
+// clang-format on
+
 #endif
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/Makefrag b/apps/riscv-tests/isa/rv32uxpulpimg/Makefrag
index f34ab11dd..8bf2c6741 100644
--- a/apps/riscv-tests/isa/rv32uxpulpimg/Makefrag
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/Makefrag
@@ -3,6 +3,12 @@
 #-----------------------------------------------------------------------
 
 rv32uxpulpimg_sc_tests = \
+  p_lb_irpost p_lbu_irpost p_lh_irpost p_lhu_irpost p_lw_irpost \
+  p_lb_rrpost p_lbu_rrpost p_lh_rrpost p_lhu_rrpost p_lw_rrpost \
+  p_lb_rr p_lbu_rr p_lh_rr p_lhu_rr p_lw_rr \
+  p_sb_irpost p_sh_irpost p_sw_irpost \
+  p_sb_rrpost p_sh_rrpost p_sw_rrpost \
+  p_sb_rr p_sh_rr p_sw_rr \
 	p_abs \
   p_slet p_sletu \
   p_min p_minu \
@@ -12,6 +18,28 @@ rv32uxpulpimg_sc_tests = \
   p_clip p_clipu \
   p_clipr p_clipur \
   p_beqimm p_bneimm \
+  p_mac p_msu \
+  pv_add \
+  pv_sub \
+  pv_avg pv_avgu \
+  pv_min pv_minu \
+  pv_max pv_maxu \
+  pv_srl \
+  pv_sra \
+  pv_sll \
+  pv_or \
+  pv_xor \
+  pv_and \
+  pv_abs \
+  pv_extract pv_extractu \
+  pv_insert \
+  pv_dotup \
+  pv_dotusp \
+  pv_dotsp \
+  pv_sdotup \
+  pv_sdotusp \
+  pv_sdotsp \
+  pv_shuffle2 \
 
 rv32uxpulpimg_p_tests = $(addprefix rv32uxpulpimg-p-, $(rv32uxpulpimg_sc_tests))
 rv32uxpulpimg_v_tests = $(addprefix rv32uxpulpimg-v-, $(rv32uxpulpimg_sc_tests))
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_irpost.S
new file mode 100644
index 000000000..2322ca2d6
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_irpost.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lb_irpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.lb (immediate-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Non-negative offset (the byte is loaded from the un-incremented base)
+  TEST_LD_POST_OP( 2, p.lb, 0xffffffff, 0,  tdat )
+  TEST_LD_POST_OP( 3, p.lb, 0xffffffff, 1,  tdat )
+  TEST_LD_POST_OP( 4, p.lb, 0x0000000f, 2, tdat3 )
+  TEST_LD_POST_OP( 5, p.lb, 0x0000000f, 3, tdat3 )
+
+  # Non-positive offset
+  TEST_LD_POST_OP( 6, p.lb, 0x00000000, -3, tdat1 )
+  TEST_LD_POST_OP( 7, p.lb, 0x00000000, -2, tdat1 )
+  TEST_LD_POST_OP( 8, p.lb, 0xfffffff0, -1, tdat2 )
+  TEST_LD_POST_OP( 9, p.lb, 0xfffffff0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_BYPASS( 10, 0, p.lb, 0xffffffff, 1, tdat0 )
+  TEST_LD_POST_DEST1_BYPASS( 11, 1, p.lb, 0x00000000, 1, tdat1 )
+  TEST_LD_POST_DEST1_BYPASS( 12, 2, p.lb, 0xfffffff0, 1, tdat2 )
+
+  TEST_LD_POST_DEST2_BYPASS( 13, 0, p.lb,  2, tdat0 )
+  TEST_LD_POST_DEST2_BYPASS( 14, 1, p.lb,  1, tdat1 )
+  TEST_LD_POST_DEST2_BYPASS( 15, 2, p.lb, -3, tdat3 )
+
+  TEST_LD_POST_SRC1_BYPASS( 16, 0, p.lb, 0xffffffff,  1, tdat0 )
+  TEST_LD_POST_SRC1_BYPASS( 17, 1, p.lb, 0x0000000f, -1, tdat3 )
+  TEST_LD_POST_SRC1_BYPASS( 18, 2, p.lb, 0xfffffff0,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (values differ from the byte at tdat)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_WAW( 19, p.lb,  25, tdat )
+  TEST_LD_POST_DEST1_WAW( 20, p.lb, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rr.S
new file mode 100644
index 000000000..6938e133d
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rr.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lb_rr.S
+#-----------------------------------------------------------------------------
+#
+# Test p.lb (register-register) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Non-negative register offset (the byte is loaded from base + offset)
+  TEST_LD_RR_OP( 2, p.lb, 0xffffffff, 0, tdat )
+  TEST_LD_RR_OP( 3, p.lb, 0x00000000, 1, tdat )
+  TEST_LD_RR_OP( 4, p.lb, 0xfffffff0, 2, tdat )
+  TEST_LD_RR_OP( 5, p.lb, 0x0000000f, 3, tdat )
+
+  # Non-positive register offset
+  TEST_LD_RR_OP( 6, p.lb, 0xffffffff, -3, tdat3 )
+  TEST_LD_RR_OP( 7, p.lb, 0x00000000, -2, tdat3 )
+  TEST_LD_RR_OP( 8, p.lb, 0xfffffff0, -1, tdat3 )
+  TEST_LD_RR_OP( 9, p.lb, 0x0000000f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST_BYPASS( 10, 0, p.lb, 0x00000000, 1, tdat0 )
+  TEST_LD_RR_DEST_BYPASS( 11, 1, p.lb, 0xfffffff0, 1, tdat1 )
+  TEST_LD_RR_DEST_BYPASS( 12, 2, p.lb, 0x0000000f, 1, tdat2 )
+
+  TEST_LD_RR_SRC12_BYPASS( 13, 0, 0, p.lb, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 14, 0, 1, p.lb, 0xfffffff0, -1, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 15, 0, 2, p.lb, 0x0000000f,  1, tdat2 )
+  TEST_LD_RR_SRC12_BYPASS( 16, 1, 0, p.lb, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 17, 1, 1, p.lb, 0xfffffff0, -1, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 18, 2, 0, p.lb, 0x0000000f,  1, tdat2 )
+
+  TEST_LD_RR_SRC21_BYPASS( 19, 0, 0, p.lb, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 20, 0, 1, p.lb, 0xfffffff0, -1, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 21, 0, 2, p.lb, 0x0000000f,  1, tdat2 )
+  TEST_LD_RR_SRC21_BYPASS( 22, 1, 0, p.lb, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 23, 1, 1, p.lb, 0xfffffff0, -1, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 24, 2, 0, p.lb, 0x0000000f,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (values differ from the byte at tdat)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST1_WAW( 25, p.lb,  25, tdat )
+  TEST_LD_RR_DEST1_WAW( 26, p.lb, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rrpost.S
new file mode 100644
index 000000000..afa33f659
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lb_rrpost.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lb_rrpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.lb (register-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Non-negative register offset (the byte is loaded from the un-incremented base)
+  TEST_LD_RR_POST_OP( 2, p.lb, 0xffffffff, 0,  tdat )
+  TEST_LD_RR_POST_OP( 3, p.lb, 0xffffffff, 1,  tdat )
+  TEST_LD_RR_POST_OP( 4, p.lb, 0x0000000f, 2, tdat3 )
+  TEST_LD_RR_POST_OP( 5, p.lb, 0x0000000f, 3, tdat3 )
+
+  # Non-positive register offset
+  TEST_LD_RR_POST_OP( 6, p.lb, 0x00000000, -3, tdat1 )
+  TEST_LD_RR_POST_OP( 7, p.lb, 0x00000000, -2, tdat1 )
+  TEST_LD_RR_POST_OP( 8, p.lb, 0xfffffff0, -1, tdat2 )
+  TEST_LD_RR_POST_OP( 9, p.lb, 0xfffffff0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST_BYPASS( 10, 0, p.lb, 0xffffffff, 1, tdat0 )
+  TEST_LD_RR_POST_DEST_BYPASS( 11, 1, p.lb, 0x00000000, 1, tdat1 )
+  TEST_LD_RR_POST_DEST_BYPASS( 12, 2, p.lb, 0xfffffff0, 1, tdat2 )
+
+  TEST_LD_RR_POST_SRC12_BYPASS( 13, 0, 0, p.lb, 0xffffffff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 14, 0, 1, p.lb, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 15, 0, 2, p.lb, 0xfffffff0,  1, tdat2 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 16, 1, 0, p.lb, 0xffffffff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 17, 1, 1, p.lb, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 18, 2, 0, p.lb, 0xfffffff0,  1, tdat2 )
+
+  TEST_LD_RR_POST_SRC21_BYPASS( 19, 0, 0, p.lb, 0xffffffff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 20, 0, 1, p.lb, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 21, 0, 2, p.lb, 0xfffffff0,  1, tdat2 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 22, 1, 0, p.lb, 0xffffffff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 23, 1, 1, p.lb, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 24, 2, 0, p.lb, 0xfffffff0,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (values differ from the byte at tdat)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST1_WAW( 25, p.lb,  25, tdat )
+  TEST_LD_RR_POST_DEST1_WAW( 26, p.lb, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_irpost.S
new file mode 100644
index 000000000..a8c72a965
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_irpost.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lbu_irpost.S
+#-----------------------------------------------------------------------------
+# Test p.lbu (immediate-register post-increment) instruction.
+# Every listed expected value is the zero-extended byte at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive bytes.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_POST_OP( 2, p.lbu, 0x000000ff, 0,  tdat )
+  TEST_LD_POST_OP( 3, p.lbu, 0x000000ff, 1,  tdat )
+  TEST_LD_POST_OP( 4, p.lbu, 0x0000000f, 2, tdat3 )
+  TEST_LD_POST_OP( 5, p.lbu, 0x0000000f, 3, tdat3 )
+
+  # Negative offsets, then zero
+  TEST_LD_POST_OP( 6, p.lbu, 0x00000000, -3, tdat1 )
+  TEST_LD_POST_OP( 7, p.lbu, 0x00000000, -2, tdat1 )
+  TEST_LD_POST_OP( 8, p.lbu, 0x000000f0, -1, tdat2 )
+  TEST_LD_POST_OP( 9, p.lbu, 0x000000f0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_BYPASS( 10, 0, p.lbu, 0x000000ff, 1, tdat0 )
+  TEST_LD_POST_DEST1_BYPASS( 11, 1, p.lbu, 0x00000000, 1, tdat1 )
+  TEST_LD_POST_DEST1_BYPASS( 12, 2, p.lbu, 0x000000f0, 1, tdat2 )
+
+  TEST_LD_POST_DEST2_BYPASS( 13, 0, p.lbu,  2, tdat0 )
+  TEST_LD_POST_DEST2_BYPASS( 14, 1, p.lbu,  1, tdat1 )
+  TEST_LD_POST_DEST2_BYPASS( 15, 2, p.lbu, -3, tdat3 )
+
+  TEST_LD_POST_SRC1_BYPASS( 16, 0, p.lbu, 0x000000ff,  1, tdat0 )
+  TEST_LD_POST_SRC1_BYPASS( 17, 1, p.lbu, 0x0000000f, -1, tdat3 )
+  TEST_LD_POST_SRC1_BYPASS( 18, 2, p.lbu, 0x000000f0,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_WAW( 19, p.lbu,  25, tdat )
+  TEST_LD_POST_DEST1_WAW( 20, p.lbu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rr.S
new file mode 100644
index 000000000..ba0b33235
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rr.S
@@ -0,0 +1,77 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lbu_rr.S
+#-----------------------------------------------------------------------------
+# Test p.lbu (register-register) instruction.
+# Every listed expected value is the zero-extended byte at base label plus
+# offset; tdat aliases tdat0, and tdat0 to tdat3 are consecutive bytes.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets from the first element
+  TEST_LD_RR_OP( 2, p.lbu, 0x000000ff, 0, tdat )
+  TEST_LD_RR_OP( 3, p.lbu, 0x00000000, 1, tdat )
+  TEST_LD_RR_OP( 4, p.lbu, 0x000000f0, 2, tdat )
+  TEST_LD_RR_OP( 5, p.lbu, 0x0000000f, 3, tdat )
+
+  # Negative offsets, then zero, from the last element
+  TEST_LD_RR_OP( 6, p.lbu, 0x000000ff, -3, tdat3 )
+  TEST_LD_RR_OP( 7, p.lbu, 0x00000000, -2, tdat3 )
+  TEST_LD_RR_OP( 8, p.lbu, 0x000000f0, -1, tdat3 )
+  TEST_LD_RR_OP( 9, p.lbu, 0x0000000f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST_BYPASS( 10, 0, p.lbu, 0x00000000, 1, tdat0 )
+  TEST_LD_RR_DEST_BYPASS( 11, 1, p.lbu, 0x000000f0, 1, tdat1 )
+  TEST_LD_RR_DEST_BYPASS( 12, 2, p.lbu, 0x0000000f, 1, tdat2 )
+
+  TEST_LD_RR_SRC12_BYPASS( 13, 0, 0, p.lbu, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 14, 0, 1, p.lbu, 0x000000f0, -1, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 15, 0, 2, p.lbu, 0x0000000f,  1, tdat2 )
+  TEST_LD_RR_SRC12_BYPASS( 16, 1, 0, p.lbu, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 17, 1, 1, p.lbu, 0x000000f0, -1, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 18, 2, 0, p.lbu, 0x0000000f,  1, tdat2 )
+
+  TEST_LD_RR_SRC21_BYPASS( 19, 0, 0, p.lbu, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 20, 0, 1, p.lbu, 0x000000f0, -1, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 21, 0, 2, p.lbu, 0x0000000f,  1, tdat2 )
+  TEST_LD_RR_SRC21_BYPASS( 22, 1, 0, p.lbu, 0x00000000,  1, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 23, 1, 1, p.lbu, 0x000000f0, -1, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 24, 2, 0, p.lbu, 0x0000000f,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST1_WAW( 25, p.lbu,  25, tdat )
+  TEST_LD_RR_DEST1_WAW( 26, p.lbu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
+
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rrpost.S
new file mode 100644
index 000000000..9582ca6f7
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lbu_rrpost.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lbu_rrpost.S
+#-----------------------------------------------------------------------------
+# Test p.lbu (register-register post-increment) instruction.
+# Every listed expected value is the zero-extended byte at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive bytes.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_RR_POST_OP( 2, p.lbu, 0x000000ff, 0,  tdat )
+  TEST_LD_RR_POST_OP( 3, p.lbu, 0x000000ff, 1,  tdat )
+  TEST_LD_RR_POST_OP( 4, p.lbu, 0x0000000f, 2, tdat3 )
+  TEST_LD_RR_POST_OP( 5, p.lbu, 0x0000000f, 3, tdat3 )
+
+  # Negative offsets, then zero
+  TEST_LD_RR_POST_OP( 6, p.lbu, 0x00000000, -3, tdat1 )
+  TEST_LD_RR_POST_OP( 7, p.lbu, 0x00000000, -2, tdat1 )
+  TEST_LD_RR_POST_OP( 8, p.lbu, 0x000000f0, -1, tdat2 )
+  TEST_LD_RR_POST_OP( 9, p.lbu, 0x000000f0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST_BYPASS( 10, 0, p.lbu, 0x000000ff, 1, tdat0 )
+  TEST_LD_RR_POST_DEST_BYPASS( 11, 1, p.lbu, 0x00000000, 1, tdat1 )
+  TEST_LD_RR_POST_DEST_BYPASS( 12, 2, p.lbu, 0x000000f0, 1, tdat2 )
+
+  TEST_LD_RR_POST_SRC12_BYPASS( 13, 0, 0, p.lbu, 0x000000ff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 14, 0, 1, p.lbu, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 15, 0, 2, p.lbu, 0x000000f0,  1, tdat2 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 16, 1, 0, p.lbu, 0x000000ff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 17, 1, 1, p.lbu, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 18, 2, 0, p.lbu, 0x000000f0,  1, tdat2 )
+
+  TEST_LD_RR_POST_SRC21_BYPASS( 19, 0, 0, p.lbu, 0x000000ff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 20, 0, 1, p.lbu, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 21, 0, 2, p.lbu, 0x000000f0,  1, tdat2 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 22, 1, 0, p.lbu, 0x000000ff,  1, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 23, 1, 1, p.lbu, 0x0000000f, -1, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 24, 2, 0, p.lbu, 0x000000f0,  1, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST1_WAW( 25, p.lbu,  25, tdat )
+  TEST_LD_RR_POST_DEST1_WAW( 26, p.lbu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xff
+tdat1:  .byte 0x00
+tdat2:  .byte 0xf0
+tdat3:  .byte 0x0f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_irpost.S
new file mode 100644
index 000000000..ca376a2a6
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_irpost.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lh_irpost.S
+#-----------------------------------------------------------------------------
+# Test p.lh (immediate-register post-increment) instruction.
+# Every listed expected value is the sign-extended halfword at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_POST_OP( 2, p.lh, 0x000000ff, 0,  tdat )
+  TEST_LD_POST_OP( 3, p.lh, 0x000000ff, 2,  tdat )
+  TEST_LD_POST_OP( 4, p.lh, 0xfffff00f, 4, tdat3 )
+  TEST_LD_POST_OP( 5, p.lh, 0xfffff00f, 6, tdat3 )
+
+  # Negative offsets, then zero
+  TEST_LD_POST_OP( 6, p.lh, 0xffffff00, -6, tdat1 )
+  TEST_LD_POST_OP( 7, p.lh, 0xffffff00, -4, tdat1 )
+  TEST_LD_POST_OP( 8, p.lh, 0x00000ff0, -2, tdat2 )
+  TEST_LD_POST_OP( 9, p.lh, 0x00000ff0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_BYPASS( 10, 0, p.lh, 0x000000ff, 2, tdat0 )
+  TEST_LD_POST_DEST1_BYPASS( 11, 1, p.lh, 0xffffff00, 2, tdat1 )
+  TEST_LD_POST_DEST1_BYPASS( 12, 2, p.lh, 0x00000ff0, 2, tdat2 )
+
+  TEST_LD_POST_DEST2_BYPASS( 13, 0, p.lh,  4, tdat0 )
+  TEST_LD_POST_DEST2_BYPASS( 14, 1, p.lh,  2, tdat1 )
+  TEST_LD_POST_DEST2_BYPASS( 15, 2, p.lh, -6, tdat3 )
+
+  TEST_LD_POST_SRC1_BYPASS( 16, 0, p.lh, 0x000000ff,  2, tdat0 )
+  TEST_LD_POST_SRC1_BYPASS( 17, 1, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_POST_SRC1_BYPASS( 18, 2, p.lh, 0x00000ff0,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_WAW( 19, p.lh,  25, tdat )
+  TEST_LD_POST_DEST1_WAW( 20, p.lh, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rr.S
new file mode 100644
index 000000000..ebc5aabd9
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rr.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lh_rr.S
+#-----------------------------------------------------------------------------
+# Test p.lh (register-register) instruction.
+# Every listed expected value is the sign-extended halfword at base label plus
+# byte offset; tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets from the first element
+  TEST_LD_RR_OP( 2, p.lh, 0x000000ff, 0, tdat )
+  TEST_LD_RR_OP( 3, p.lh, 0xffffff00, 2, tdat )
+  TEST_LD_RR_OP( 4, p.lh, 0x00000ff0, 4, tdat )
+  TEST_LD_RR_OP( 5, p.lh, 0xfffff00f, 6, tdat )
+
+  # Negative offsets, then zero, from the last element
+  TEST_LD_RR_OP( 6, p.lh, 0x000000ff, -6, tdat3 )
+  TEST_LD_RR_OP( 7, p.lh, 0xffffff00, -4, tdat3 )
+  TEST_LD_RR_OP( 8, p.lh, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_OP( 9, p.lh, 0xfffff00f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST_BYPASS( 10, 0, p.lh, 0xffffff00, 2, tdat0 )
+  TEST_LD_RR_DEST_BYPASS( 11, 1, p.lh, 0x00000ff0, 2, tdat1 )
+  TEST_LD_RR_DEST_BYPASS( 12, 2, p.lh, 0xfffff00f, 2, tdat2 )
+
+  TEST_LD_RR_SRC12_BYPASS( 13, 0, 0, p.lh, 0xffffff00,  2, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 14, 0, 1, p.lh, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 15, 0, 2, p.lh, 0xfffff00f,  2, tdat2 )
+  TEST_LD_RR_SRC12_BYPASS( 16, 1, 0, p.lh, 0xffffff00,  2, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 17, 1, 1, p.lh, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 18, 2, 0, p.lh, 0xfffff00f,  2, tdat2 )
+
+  TEST_LD_RR_SRC21_BYPASS( 19, 0, 0, p.lh, 0xffffff00,  2, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 20, 0, 1, p.lh, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 21, 0, 2, p.lh, 0xfffff00f,  2, tdat2 )
+  TEST_LD_RR_SRC21_BYPASS( 22, 1, 0, p.lh, 0xffffff00,  2, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 23, 1, 1, p.lh, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 24, 2, 0, p.lh, 0xfffff00f,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST1_WAW( 25, p.lh,  25, tdat )
+  TEST_LD_RR_DEST1_WAW( 26, p.lh, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rrpost.S
new file mode 100644
index 000000000..64a6281aa
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lh_rrpost.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lh_rrpost.S
+#-----------------------------------------------------------------------------
+# Test p.lh (register-register post-increment) instruction.
+# Every listed expected value is the sign-extended halfword at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_RR_POST_OP( 2, p.lh, 0x000000ff, 0,  tdat )
+  TEST_LD_RR_POST_OP( 3, p.lh, 0x000000ff, 2,  tdat )
+  TEST_LD_RR_POST_OP( 4, p.lh, 0xffffff00, 4, tdat1 )
+  TEST_LD_RR_POST_OP( 5, p.lh, 0xffffff00, 6, tdat1 )
+
+  # Negative offsets, then zero
+  TEST_LD_RR_POST_OP( 6, p.lh, 0x00000ff0, -6, tdat2 )
+  TEST_LD_RR_POST_OP( 7, p.lh, 0x00000ff0, -4, tdat2 )
+  TEST_LD_RR_POST_OP( 8, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_RR_POST_OP( 9, p.lh, 0xfffff00f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST_BYPASS( 10, 0, p.lh, 0x000000ff, 2, tdat0 )
+  TEST_LD_RR_POST_DEST_BYPASS( 11, 1, p.lh, 0xffffff00, 2, tdat1 )
+  TEST_LD_RR_POST_DEST_BYPASS( 12, 2, p.lh, 0x00000ff0, 2, tdat2 )
+
+  TEST_LD_RR_POST_SRC12_BYPASS( 13, 0, 0, p.lh, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 14, 0, 1, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 15, 0, 2, p.lh, 0x00000ff0,  2, tdat2 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 16, 1, 0, p.lh, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 17, 1, 1, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 18, 2, 0, p.lh, 0x00000ff0,  2, tdat2 )
+
+  TEST_LD_RR_POST_SRC21_BYPASS( 19, 0, 0, p.lh, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 20, 0, 1, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 21, 0, 2, p.lh, 0x00000ff0,  2, tdat2 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 22, 1, 0, p.lh, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 23, 1, 1, p.lh, 0xfffff00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 24, 2, 0, p.lh, 0x00000ff0,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST1_WAW( 25, p.lh,  25, tdat )
+  TEST_LD_RR_POST_DEST1_WAW( 26, p.lh, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_irpost.S
new file mode 100644
index 000000000..72a3d86b8
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_irpost.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lhu_irpost.S
+#-----------------------------------------------------------------------------
+# Test p.lhu (immediate-register post-increment) instruction.
+# Every listed expected value is the zero-extended halfword at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_POST_OP( 2, p.lhu, 0x000000ff, 0,  tdat )
+  TEST_LD_POST_OP( 3, p.lhu, 0x000000ff, 2,  tdat )
+  TEST_LD_POST_OP( 4, p.lhu, 0x0000f00f, 4, tdat3 )
+  TEST_LD_POST_OP( 5, p.lhu, 0x0000f00f, 6, tdat3 )
+
+  # Negative offsets, then zero
+  TEST_LD_POST_OP( 6, p.lhu, 0x0000ff00, -6, tdat1 )
+  TEST_LD_POST_OP( 7, p.lhu, 0x0000ff00, -4, tdat1 )
+  TEST_LD_POST_OP( 8, p.lhu, 0x00000ff0, -2, tdat2 )
+  TEST_LD_POST_OP( 9, p.lhu, 0x00000ff0,  0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_BYPASS( 10, 0, p.lhu, 0x000000ff, 2, tdat0 )
+  TEST_LD_POST_DEST1_BYPASS( 11, 1, p.lhu, 0x0000ff00, 2, tdat1 )
+  TEST_LD_POST_DEST1_BYPASS( 12, 2, p.lhu, 0x00000ff0, 2, tdat2 )
+
+  TEST_LD_POST_DEST2_BYPASS( 13, 0, p.lhu,  4, tdat0 )
+  TEST_LD_POST_DEST2_BYPASS( 14, 1, p.lhu,  2, tdat1 )
+  TEST_LD_POST_DEST2_BYPASS( 15, 2, p.lhu, -6, tdat3 )
+
+  TEST_LD_POST_SRC1_BYPASS( 16, 0, p.lhu, 0x000000ff,  2, tdat0 )
+  TEST_LD_POST_SRC1_BYPASS( 17, 1, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_POST_SRC1_BYPASS( 18, 2, p.lhu, 0x00000ff0,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_WAW( 19, p.lhu,  25, tdat )
+  TEST_LD_POST_DEST1_WAW( 20, p.lhu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rr.S
new file mode 100644
index 000000000..a8c54fff9
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rr.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lhu_rr.S
+#-----------------------------------------------------------------------------
+# Test p.lhu (register-register) instruction.
+# Every listed expected value is the zero-extended halfword at base label plus
+# byte offset; tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets from the first element
+  TEST_LD_RR_OP( 2, p.lhu, 0x000000ff, 0, tdat )
+  TEST_LD_RR_OP( 3, p.lhu, 0x0000ff00, 2, tdat )
+  TEST_LD_RR_OP( 4, p.lhu, 0x00000ff0, 4, tdat )
+  TEST_LD_RR_OP( 5, p.lhu, 0x0000f00f, 6, tdat )
+
+  # Negative offsets, then zero, from the last element
+  TEST_LD_RR_OP( 6, p.lhu, 0x000000ff, -6, tdat3 )
+  TEST_LD_RR_OP( 7, p.lhu, 0x0000ff00, -4, tdat3 )
+  TEST_LD_RR_OP( 8, p.lhu, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_OP( 9, p.lhu, 0x0000f00f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST_BYPASS( 10, 0, p.lhu, 0x0000ff00, 2, tdat0 )
+  TEST_LD_RR_DEST_BYPASS( 11, 1, p.lhu, 0x00000ff0, 2, tdat1 )
+  TEST_LD_RR_DEST_BYPASS( 12, 2, p.lhu, 0x0000f00f, 2, tdat2 )
+
+  TEST_LD_RR_SRC12_BYPASS( 13, 0, 0, p.lhu, 0x0000ff00,  2, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 14, 0, 1, p.lhu, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 15, 0, 2, p.lhu, 0x0000f00f,  2, tdat2 )
+  TEST_LD_RR_SRC12_BYPASS( 16, 1, 0, p.lhu, 0x0000ff00,  2, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 17, 1, 1, p.lhu, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 18, 2, 0, p.lhu, 0x0000f00f,  2, tdat2 )
+
+  TEST_LD_RR_SRC21_BYPASS( 19, 0, 0, p.lhu, 0x0000ff00,  2, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 20, 0, 1, p.lhu, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 21, 0, 2, p.lhu, 0x0000f00f,  2, tdat2 )
+  TEST_LD_RR_SRC21_BYPASS( 22, 1, 0, p.lhu, 0x0000ff00,  2, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 23, 1, 1, p.lhu, 0x00000ff0, -2, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 24, 2, 0, p.lhu, 0x0000f00f,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST1_WAW( 25, p.lhu,  25, tdat )
+  TEST_LD_RR_DEST1_WAW( 26, p.lhu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rrpost.S
new file mode 100644
index 000000000..aee16032b
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lhu_rrpost.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lhu_rrpost.S
+#-----------------------------------------------------------------------------
+# Test p.lhu (register-register post-increment) instruction.
+# Every listed expected value is the zero-extended halfword at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive halfwords.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_RR_POST_OP( 2, p.lhu, 0x000000ff, 0,  tdat )
+  TEST_LD_RR_POST_OP( 3, p.lhu, 0x000000ff, 2,  tdat )
+  TEST_LD_RR_POST_OP( 4, p.lhu, 0x0000ff00, 4, tdat1 )
+  TEST_LD_RR_POST_OP( 5, p.lhu, 0x0000ff00, 6, tdat1 )
+
+  # Negative offsets, then zero
+  TEST_LD_RR_POST_OP( 6, p.lhu, 0x00000ff0, -6, tdat2 )
+  TEST_LD_RR_POST_OP( 7, p.lhu, 0x00000ff0, -4, tdat2 )
+  TEST_LD_RR_POST_OP( 8, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_RR_POST_OP( 9, p.lhu, 0x0000f00f,  0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST_BYPASS( 10, 0, p.lhu, 0x000000ff, 2, tdat0 )
+  TEST_LD_RR_POST_DEST_BYPASS( 11, 1, p.lhu, 0x0000ff00, 2, tdat1 )
+  TEST_LD_RR_POST_DEST_BYPASS( 12, 2, p.lhu, 0x00000ff0, 2, tdat2 )
+
+  TEST_LD_RR_POST_SRC12_BYPASS( 13, 0, 0, p.lhu, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 14, 0, 1, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 15, 0, 2, p.lhu, 0x00000ff0,  2, tdat2 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 16, 1, 0, p.lhu, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 17, 1, 1, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 18, 2, 0, p.lhu, 0x00000ff0,  2, tdat2 )
+
+  TEST_LD_RR_POST_SRC21_BYPASS( 19, 0, 0, p.lhu, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 20, 0, 1, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 21, 0, 2, p.lhu, 0x00000ff0,  2, tdat2 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 22, 1, 0, p.lhu, 0x000000ff,  2, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 23, 1, 1, p.lhu, 0x0000f00f, -2, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 24, 2, 0, p.lhu, 0x00000ff0,  2, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST1_WAW( 25, p.lhu,  25, tdat )
+  TEST_LD_RR_POST_DEST1_WAW( 26, p.lhu, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0x00ff
+tdat1:  .half 0xff00
+tdat2:  .half 0x0ff0
+tdat3:  .half 0xf00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_irpost.S
new file mode 100644
index 000000000..ec62744a2
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_irpost.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lw_irpost.S
+#-----------------------------------------------------------------------------
+# Test p.lw (immediate-register post-increment) instruction.
+# Every listed expected value is the word stored at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive words.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_POST_OP( 2, p.lw, 0x00ff00ff,  0,  tdat )
+  TEST_LD_POST_OP( 3, p.lw, 0x00ff00ff,  4,  tdat )
+  TEST_LD_POST_OP( 4, p.lw, 0xf00ff00f,  8, tdat3 )
+  TEST_LD_POST_OP( 5, p.lw, 0xf00ff00f, 12, tdat3 )
+
+  # Negative offsets, then zero
+  TEST_LD_POST_OP( 6, p.lw, 0xff00ff00, -12, tdat1 )
+  TEST_LD_POST_OP( 7, p.lw, 0xff00ff00,  -8, tdat1 )
+  TEST_LD_POST_OP( 8, p.lw, 0x0ff00ff0,  -4, tdat2 )
+  TEST_LD_POST_OP( 9, p.lw, 0x0ff00ff0,   0, tdat2 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_BYPASS( 10, 0, p.lw, 0x00ff00ff, 4, tdat0 )
+  TEST_LD_POST_DEST1_BYPASS( 11, 1, p.lw, 0xff00ff00, 4, tdat1 )
+  TEST_LD_POST_DEST1_BYPASS( 12, 2, p.lw, 0x0ff00ff0, 4, tdat2 )
+
+  TEST_LD_POST_DEST2_BYPASS( 13, 0, p.lw,  8, tdat0 )
+  TEST_LD_POST_DEST2_BYPASS( 14, 1, p.lw,  4, tdat1 )
+  TEST_LD_POST_DEST2_BYPASS( 15, 2, p.lw, -12, tdat3 )
+
+  TEST_LD_POST_SRC1_BYPASS( 16, 0, p.lw, 0x00ff00ff,  4, tdat0 )
+  TEST_LD_POST_SRC1_BYPASS( 17, 1, p.lw, 0xf00ff00f, -4, tdat3 )
+  TEST_LD_POST_SRC1_BYPASS( 18, 2, p.lw, 0x0ff00ff0,  4, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_POST_DEST1_WAW( 19, p.lw,  25, tdat )
+  TEST_LD_POST_DEST1_WAW( 20, p.lw, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0x00ff00ff
+tdat1:  .word 0xff00ff00
+tdat2:  .word 0x0ff00ff0
+tdat3:  .word 0xf00ff00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rr.S
new file mode 100644
index 000000000..f6000fd78
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rr.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lw_rr.S
+#-----------------------------------------------------------------------------
+# Test p.lw (register-register) instruction.
+# Every listed expected value is the word stored at base label plus byte
+# offset; tdat aliases tdat0, and tdat0 to tdat3 are consecutive words.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets from the first element
+  TEST_LD_RR_OP( 2, p.lw, 0x00ff00ff,  0, tdat )
+  TEST_LD_RR_OP( 3, p.lw, 0xff00ff00,  4, tdat )
+  TEST_LD_RR_OP( 4, p.lw, 0x0ff00ff0,  8, tdat )
+  TEST_LD_RR_OP( 5, p.lw, 0xf00ff00f, 12, tdat )
+
+  # Negative offsets, then zero, from the last element
+  TEST_LD_RR_OP( 6, p.lw, 0x00ff00ff, -12, tdat3 )
+  TEST_LD_RR_OP( 7, p.lw, 0xff00ff00,  -8, tdat3 )
+  TEST_LD_RR_OP( 8, p.lw, 0x0ff00ff0,  -4, tdat3 )
+  TEST_LD_RR_OP( 9, p.lw, 0xf00ff00f,   0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST_BYPASS( 10, 0, p.lw, 0xff00ff00, 4, tdat0 )
+  TEST_LD_RR_DEST_BYPASS( 11, 1, p.lw, 0x0ff00ff0, 4, tdat1 )
+  TEST_LD_RR_DEST_BYPASS( 12, 2, p.lw, 0xf00ff00f, 4, tdat2 )
+
+  TEST_LD_RR_SRC12_BYPASS( 13, 0, 0, p.lw, 0xff00ff00,  4, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 14, 0, 1, p.lw, 0x0ff00ff0, -4, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 15, 0, 2, p.lw, 0xf00ff00f,  4, tdat2 )
+  TEST_LD_RR_SRC12_BYPASS( 16, 1, 0, p.lw, 0xff00ff00,  4, tdat0 )
+  TEST_LD_RR_SRC12_BYPASS( 17, 1, 1, p.lw, 0x0ff00ff0, -4, tdat3 )
+  TEST_LD_RR_SRC12_BYPASS( 18, 2, 0, p.lw, 0xf00ff00f,  4, tdat2 )
+
+  TEST_LD_RR_SRC21_BYPASS( 19, 0, 0, p.lw, 0xff00ff00,  4, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 20, 0, 1, p.lw, 0x0ff00ff0, -4, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 21, 0, 2, p.lw, 0xf00ff00f,  4, tdat2 )
+  TEST_LD_RR_SRC21_BYPASS( 22, 1, 0, p.lw, 0xff00ff00,  4, tdat0 )
+  TEST_LD_RR_SRC21_BYPASS( 23, 1, 1, p.lw, 0x0ff00ff0, -4, tdat3 )
+  TEST_LD_RR_SRC21_BYPASS( 24, 2, 0, p.lw, 0xf00ff00f,  4, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_DEST1_WAW( 25, p.lw,  25, tdat )
+  TEST_LD_RR_DEST1_WAW( 26, p.lw, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0x00ff00ff
+tdat1:  .word 0xff00ff00
+tdat2:  .word 0x0ff00ff0
+tdat3:  .word 0xf00ff00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rrpost.S
new file mode 100644
index 000000000..bdec214d1
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_lw_rrpost.S
@@ -0,0 +1,76 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_lw_rrpost.S
+#-----------------------------------------------------------------------------
+# Test p.lw (register-register post-increment) instruction.
+# Every listed expected value is the word stored at the base label;
+# tdat aliases tdat0, and tdat0 to tdat3 are consecutive words.
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_LD_RR_POST_OP( 2, p.lw, 0x00ff00ff,  0,  tdat )
+  TEST_LD_RR_POST_OP( 3, p.lw, 0x00ff00ff,  4,  tdat )
+  TEST_LD_RR_POST_OP( 4, p.lw, 0xff00ff00,  8, tdat1 )
+  TEST_LD_RR_POST_OP( 5, p.lw, 0xff00ff00, 12, tdat1 )
+
+  # Negative offsets, then zero
+  TEST_LD_RR_POST_OP( 6, p.lw, 0x0ff00ff0, -12, tdat2 )
+  TEST_LD_RR_POST_OP( 7, p.lw, 0x0ff00ff0,  -8, tdat2 )
+  TEST_LD_RR_POST_OP( 8, p.lw, 0xf00ff00f,  -4, tdat3 )
+  TEST_LD_RR_POST_OP( 9, p.lw, 0xf00ff00f,   0, tdat3 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests (0/1/2 arguments presumably nop counts - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST_BYPASS( 10, 0, p.lw, 0x00ff00ff, 4, tdat0 )
+  TEST_LD_RR_POST_DEST_BYPASS( 11, 1, p.lw, 0xff00ff00, 4, tdat1 )
+  TEST_LD_RR_POST_DEST_BYPASS( 12, 2, p.lw, 0x0ff00ff0, 4, tdat2 )
+
+  TEST_LD_RR_POST_SRC12_BYPASS( 13, 0, 0, p.lw, 0x00ff00ff,  4, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 14, 0, 1, p.lw, 0xf00ff00f, -4, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 15, 0, 2, p.lw, 0x0ff00ff0,  4, tdat2 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 16, 1, 0, p.lw, 0x00ff00ff,  4, tdat0 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 17, 1, 1, p.lw, 0xf00ff00f, -4, tdat3 )
+  TEST_LD_RR_POST_SRC12_BYPASS( 18, 2, 0, p.lw, 0x0ff00ff0,  4, tdat2 )
+
+  TEST_LD_RR_POST_SRC21_BYPASS( 19, 0, 0, p.lw, 0x00ff00ff,  4, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 20, 0, 1, p.lw, 0xf00ff00f, -4, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 21, 0, 2, p.lw, 0x0ff00ff0,  4, tdat2 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 22, 1, 0, p.lw, 0x00ff00ff,  4, tdat0 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 23, 1, 1, p.lw, 0xf00ff00f, -4, tdat3 )
+  TEST_LD_RR_POST_SRC21_BYPASS( 24, 2, 0, p.lw, 0x0ff00ff0,  4, tdat2 )
+
+  #-------------------------------------------------------------
+  # Test write-after-write hazard (25 and -76 presumably offsets - TODO confirm)
+  #-------------------------------------------------------------
+
+  TEST_LD_RR_POST_DEST1_WAW( 25, p.lw,  25, tdat )
+  TEST_LD_RR_POST_DEST1_WAW( 26, p.lw, -76, tdat )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0x00ff00ff
+tdat1:  .word 0xff00ff00
+tdat2:  .word 0x0ff00ff0
+tdat3:  .word 0xf00ff00f
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_mac.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_mac.S
new file mode 100644
index 000000000..3ccd7f41b
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_mac.S
@@ -0,0 +1,88 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_mac.S
+#-----------------------------------------------------------------------------
+#
+# Test p.mac instruction: expected result = src1 * src2 + src3 (low 32 bits).
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_OP( 2, p.mac, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )
+  TEST_RRR_OP( 3, p.mac, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF )
+  TEST_RRR_OP( 4, p.mac, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 )
+  TEST_RRR_OP( 5, p.mac, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF )
+  TEST_RRR_OP( 6, p.mac, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 )
+  TEST_RRR_OP( 7, p.mac, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF )
+  TEST_RRR_OP( 8, p.mac, 0x00000001, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 )
+  TEST_RRR_OP( 9, p.mac, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF )
+
+  TEST_RRR_OP( 10, p.mac, 0x4CF50B3F, 0xB463DADE, 0x1C2D45F3, 0x67287485 )
+  TEST_RRR_OP( 11, p.mac, 0x01C8425D, 0x5DE547E9, 0xBE923643, 0x20B94A62 )
+  TEST_RRR_OP( 12, p.mac, 0x125000F5, 0xD5042C35, 0x113E2192, 0xD265F5BB )
+  TEST_RRR_OP( 13, p.mac, 0x117DE9BB, 0x0762A9A5, 0xAB420127, 0x9B426C98 )
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_SRC1_EQ_DEST( 14, p.mac, 0x0CEE5928, 0x113E2192, 0x1C2D45F3 )
+  TEST_RRR_SRC2_EQ_DEST( 15, p.mac, 0x1F280E82, 0x7E139C55, 0xBE923643 )
+
+  TEST_RRR_SRC12_EQ_DEST( 16, p.mac, 0x22EE857E, 0x84BB8025 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_DEST_BYPASS( 17, 0, p.mac,  5, 1, 2, 3 )
+  TEST_RRR_DEST_BYPASS( 18, 1, p.mac, 26, 4, 5, 6 )
+  TEST_RRR_DEST_BYPASS( 19, 2, p.mac, 65, 7, 8, 9 )
+
+  TEST_RRR_SRC12_BYPASS( 20, 0, 0, p.mac,   5,  1,  2,  3 )
+  TEST_RRR_SRC12_BYPASS( 21, 0, 1, p.mac,  26,  4,  5,  6 )
+  TEST_RRR_SRC12_BYPASS( 22, 0, 2, p.mac,  65,  7,  8,  9 )
+  TEST_RRR_SRC12_BYPASS( 23, 1, 0, p.mac,   5, -1, -2,  3 )
+  TEST_RRR_SRC12_BYPASS( 24, 1, 1, p.mac,  14,  4,  5, -6 )
+  TEST_RRR_SRC12_BYPASS( 25, 2, 0, p.mac, -47, -7,  8,  9 )
+
+  TEST_RRR_SRC21_BYPASS( 26, 0, 0, p.mac,   5,  1,  2,  3 )
+  TEST_RRR_SRC21_BYPASS( 27, 0, 1, p.mac,  26,  4,  5,  6 )
+  TEST_RRR_SRC21_BYPASS( 28, 0, 2, p.mac,  65,  7,  8,  9 )
+  TEST_RRR_SRC21_BYPASS( 29, 1, 0, p.mac,   5, -1, -2,  3 )
+  TEST_RRR_SRC21_BYPASS( 30, 1, 1, p.mac,  14,  4,  5, -6 )
+  TEST_RRR_SRC21_BYPASS( 31, 2, 0, p.mac, -47, -7,  8,  9 )
+
+  TEST_RRR_SRC3_BYPASS( 32, 0, p.mac, 26,  4,  5, 6 )
+  TEST_RRR_SRC3_BYPASS( 33, 1, p.mac, 65,  7,  8, 9 )
+  TEST_RRR_SRC3_BYPASS( 34, 2, p.mac,  5, -1, -2, 3 )
+
+  TEST_RRR_ZEROSRC1( 35, p.mac, 10, -5, 10 )
+  TEST_RRR_ZEROSRC2( 36, p.mac,  7, 32,  7 )
+  TEST_RRR_ZEROSRC3( 37, p.mac, -8, -1,  8 )
+
+  TEST_RRR_ZEROSRC12( 38, p.mac, -3, -3 )
+
+  TEST_RRR_ZEROSRC123( 39, p.mac, 0 )
+
+  TEST_RRR_ZERODEST( 40, p.mac, 34, -10 )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_msu.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_msu.S
new file mode 100644
index 000000000..46f7b5866
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_msu.S
@@ -0,0 +1,88 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_msu.S
+#-----------------------------------------------------------------------------
+#
+# Test p.msu instruction: expected result = src3 - src1 * src2 (low 32 bits).
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_OP( 2, p.msu, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )
+  TEST_RRR_OP( 3, p.msu, 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF )
+  TEST_RRR_OP( 4, p.msu, 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 )
+  TEST_RRR_OP( 5, p.msu, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF )
+  TEST_RRR_OP( 6, p.msu, 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 )
+  TEST_RRR_OP( 7, p.msu, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF )
+  TEST_RRR_OP( 8, p.msu, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 )
+  TEST_RRR_OP( 9, p.msu, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF )
+
+  TEST_RRR_OP( 10, p.msu, 0x815BDDCB, 0xB463DADE, 0x1C2D45F3, 0x67287485 )
+  TEST_RRR_OP( 11, p.msu, 0x3FAA5267, 0x5DE547E9, 0xBE923643, 0x20B94A62 )
+  TEST_RRR_OP( 12, p.msu, 0x927BEA81, 0xD5042C35, 0x113E2192, 0xD265F5BB )
+  TEST_RRR_OP( 13, p.msu, 0x2506EF75, 0x0762A9A5, 0xAB420127, 0x9B426C98 )
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_SRC1_EQ_DEST( 14, p.msu, 0x158DE9FC, 0x113E2192, 0x1C2D45F3 )
+  TEST_RRR_SRC2_EQ_DEST( 15, p.msu, 0x5DFC5E04, 0x7E139C55, 0xBE923643 )
+
+  TEST_RRR_SRC12_EQ_DEST( 16, p.msu, 0xE6887ACC, 0x84BB8025 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_RRR_DEST_BYPASS( 17, 0, p.msu,   1, 1, 2, 3 )
+  TEST_RRR_DEST_BYPASS( 18, 1, p.msu, -14, 4, 5, 6 )
+  TEST_RRR_DEST_BYPASS( 19, 2, p.msu, -47, 7, 8, 9 )
+
+  TEST_RRR_SRC12_BYPASS( 20, 0, 0, p.msu,   1,  1,  2,  3 )
+  TEST_RRR_SRC12_BYPASS( 21, 0, 1, p.msu, -14,  4,  5,  6 )
+  TEST_RRR_SRC12_BYPASS( 22, 0, 2, p.msu, -47,  7,  8,  9 )
+  TEST_RRR_SRC12_BYPASS( 23, 1, 0, p.msu,   1, -1, -2,  3 )
+  TEST_RRR_SRC12_BYPASS( 24, 1, 1, p.msu, -26,  4,  5, -6 )
+  TEST_RRR_SRC12_BYPASS( 25, 2, 0, p.msu,  65, -7,  8,  9 )
+
+  TEST_RRR_SRC21_BYPASS( 26, 0, 0, p.msu,   1,  1,  2,  3 )
+  TEST_RRR_SRC21_BYPASS( 27, 0, 1, p.msu, -14,  4,  5,  6 )
+  TEST_RRR_SRC21_BYPASS( 28, 0, 2, p.msu, -47,  7,  8,  9 )
+  TEST_RRR_SRC21_BYPASS( 29, 1, 0, p.msu,   1, -1, -2,  3 )
+  TEST_RRR_SRC21_BYPASS( 30, 1, 1, p.msu, -26,  4,  5, -6 )
+  TEST_RRR_SRC21_BYPASS( 31, 2, 0, p.msu,  65, -7,  8,  9 )
+
+  TEST_RRR_SRC3_BYPASS( 32, 0, p.msu, -14,  4,  5,  6 )
+  TEST_RRR_SRC3_BYPASS( 33, 1, p.msu, -47,  7,  8,  9 )
+  TEST_RRR_SRC3_BYPASS( 34, 2, p.msu,   1, -1, -2,  3 )
+
+  TEST_RRR_ZEROSRC1( 35, p.msu, 10, -5, 10 )
+  TEST_RRR_ZEROSRC2( 36, p.msu,  7, 32,  7 )
+  TEST_RRR_ZEROSRC3( 37, p.msu,  8, -1,  8 )
+
+  TEST_RRR_ZEROSRC12( 38, p.msu, -3, -3 )
+
+  TEST_RRR_ZEROSRC123( 39, p.msu, 0 )
+
+  TEST_RRR_ZERODEST( 40, p.msu, 34, -10 )
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_irpost.S
new file mode 100644
index 000000000..d5ddf3a03
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_irpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sb_irpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sb (immediate-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_POST_OP( 2, p.sb, lb, 0xffffffaa, 0, tdat )
+  TEST_ST_POST_OP( 3, p.sb, lb, 0x00000000, 1, tdat )
+  TEST_ST_POST_OP( 4, p.sb, lh, 0xffffefa0, 2, tdat )
+  TEST_ST_POST_OP( 5, p.sb, lb, 0x0000000a, 3, tdat )
+
+  # Negative offset
+  TEST_ST_POST_OP( 6, p.sb, lb, 0xffffffaa, -6, tdat8 )
+  TEST_ST_POST_OP( 7, p.sb, lb, 0x00000000, -5, tdat8 )
+  TEST_ST_POST_OP( 8, p.sb, lb, 0xffffffa0, -3, tdat8 )
+  TEST_ST_POST_OP( 9, p.sb, lb, 0x0000000a, -1, tdat8 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_POST_SRC12_BYPASS( 10, 0, 0, p.sb, lb, 0xffffffdd, 0, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 11, 0, 1, p.sb, lb, 0xffffffcd, 1, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 12, 0, 2, p.sb, lb, 0xffffffcc, 2, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 13, 1, 0, p.sb, lb, 0xffffffbc, 3, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 14, 1, 1, p.sb, lb, 0xffffffbb, 4, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 15, 2, 0, p.sb, lb, 0xffffffab, 5, tdat );
+
+  TEST_ST_POST_SRC21_BYPASS( 16, 0, 0, p.sb, lb, 0x00000033, 0, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 17, 0, 1, p.sb, lb, 0x00000023, 1, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 18, 0, 2, p.sb, lb, 0x00000022, 2, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 19, 1, 0, p.sb, lb, 0x00000012, 3, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 20, 1, 1, p.sb, lb, 0x00000011, 4, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 21, 2, 0, p.sb, lb, 0x00000001, 5, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xef
+tdat1:  .byte 0xef
+tdat2:  .byte 0xef
+tdat3:  .byte 0xef
+tdat4:  .byte 0xef
+tdat5:  .byte 0xef
+tdat6:  .byte 0xef
+tdat7:  .byte 0xef
+tdat8:  .byte 0xef
+tdat9:  .byte 0xef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rr.S
new file mode 100644
index 000000000..6b501b487
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rr.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sb_rr.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sb (register-register) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_OP( 2, p.sb, lb, 0xffffffaa, 0, tdat )
+  TEST_ST_RR_OP( 3, p.sb, lb, 0x00000000, 1, tdat )
+  TEST_ST_RR_OP( 4, p.sb, lh, 0xffffefa0, 2, tdat )
+  TEST_ST_RR_OP( 5, p.sb, lb, 0x0000000a, 3, tdat )
+
+  # Negative offset
+  TEST_ST_RR_OP( 6, p.sb, lb, 0xffffffaa, -6, tdat8 )
+  TEST_ST_RR_OP( 7, p.sb, lb, 0x00000000, -5, tdat8 )
+  TEST_ST_RR_OP( 8, p.sb, lb, 0xffffffa0, -3, tdat8 )
+  TEST_ST_RR_OP( 9, p.sb, lb, 0x0000000a, -1, tdat8 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_SRC12_BYPASS( 10, 0, 0, p.sb, lb, 0xffffffdd, 0, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 11, 0, 1, p.sb, lb, 0xffffffcd, 1, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 12, 0, 2, p.sb, lb, 0xffffffcc, 2, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 13, 1, 0, p.sb, lb, 0xffffffbc, 3, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 14, 1, 1, p.sb, lb, 0xffffffbb, 4, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 15, 2, 0, p.sb, lb, 0xffffffab, 5, tdat );
+
+  TEST_ST_RR_SRC21_BYPASS( 16, 0, 0, p.sb, lb, 0x00000033, 0, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 17, 0, 1, p.sb, lb, 0x00000023, 1, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 18, 0, 2, p.sb, lb, 0x00000022, 2, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 19, 1, 0, p.sb, lb, 0x00000012, 3, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 20, 1, 1, p.sb, lb, 0x00000011, 4, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 21, 2, 0, p.sb, lb, 0x00000001, 5, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xef
+tdat1:  .byte 0xef
+tdat2:  .byte 0xef
+tdat3:  .byte 0xef
+tdat4:  .byte 0xef
+tdat5:  .byte 0xef
+tdat6:  .byte 0xef
+tdat7:  .byte 0xef
+tdat8:  .byte 0xef
+tdat9:  .byte 0xef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rrpost.S
new file mode 100644
index 000000000..3ed706fde
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sb_rrpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sb_rrpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sb (register-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_POST_OP( 2, p.sb, lb, 0xffffffaa, 0, tdat1 )
+  TEST_ST_RR_POST_OP( 3, p.sb, lb, 0x00000000, 1, tdat1 )
+  TEST_ST_RR_POST_OP( 4, p.sb, lh, 0xffffefa0, 2, tdat2 )
+  TEST_ST_RR_POST_OP( 5, p.sb, lb, 0x0000000a, 3, tdat2 )
+
+  # Negative offset
+  TEST_ST_RR_POST_OP( 6, p.sb, lb, 0xffffffaa, -6, tdat8 )
+  TEST_ST_RR_POST_OP( 7, p.sb, lb, 0x00000000, -5, tdat8 )
+  TEST_ST_RR_POST_OP( 8, p.sb, lb, 0xffffffa0, -3, tdat8 )
+  TEST_ST_RR_POST_OP( 9, p.sb, lb, 0x0000000a, -1, tdat8 )
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_POST_SRC12_BYPASS( 10, 0, 0, p.sb, lb, 0xffffffdd, 0, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 11, 0, 1, p.sb, lb, 0xffffffcd, 1, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 12, 0, 2, p.sb, lb, 0xffffffcc, 2, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 13, 1, 0, p.sb, lb, 0xffffffbc, 3, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 14, 1, 1, p.sb, lb, 0xffffffbb, 4, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 15, 2, 0, p.sb, lb, 0xffffffab, 5, tdat );
+
+  TEST_ST_RR_POST_SRC21_BYPASS( 16, 0, 0, p.sb, lb, 0x00000033, 0, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 17, 0, 1, p.sb, lb, 0x00000023, 1, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 18, 0, 2, p.sb, lb, 0x00000022, 2, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 19, 1, 0, p.sb, lb, 0x00000012, 3, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 20, 1, 1, p.sb, lb, 0x00000011, 4, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 21, 2, 0, p.sb, lb, 0x00000001, 5, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .byte 0xef
+tdat1:  .byte 0xef
+tdat2:  .byte 0xef
+tdat3:  .byte 0xef
+tdat4:  .byte 0xef
+tdat5:  .byte 0xef
+tdat6:  .byte 0xef
+tdat7:  .byte 0xef
+tdat8:  .byte 0xef
+tdat9:  .byte 0xef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_irpost.S
new file mode 100644
index 000000000..32c0b376b
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_irpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sh_irpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sh (immediate-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_POST_OP( 2, p.sh, lh, 0x000000aa, 0, tdat );
+  TEST_ST_POST_OP( 3, p.sh, lh, 0xffffaa00, 2, tdat );
+  TEST_ST_POST_OP( 4, p.sh, lw, 0xbeef0aa0, 4, tdat );
+  TEST_ST_POST_OP( 5, p.sh, lh, 0xffffa00a, 6, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_POST_OP( 6, p.sh, lh, 0x000000aa, -6, tdat8 );
+  TEST_ST_POST_OP( 7, p.sh, lh, 0xffffaa00, -4, tdat8 );
+  TEST_ST_POST_OP( 8, p.sh, lh, 0x00000aa0, -2, tdat8 );
+  TEST_ST_POST_OP( 9, p.sh, lh, 0xffffa00a,  0,  tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_POST_SRC12_BYPASS( 10, 0, 0, p.sh, lh, 0xffffccdd,  0, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 11, 0, 1, p.sh, lh, 0xffffbccd,  2, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 12, 0, 2, p.sh, lh, 0xffffbbcc,  4, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 13, 1, 0, p.sh, lh, 0xffffabbc,  6, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 14, 1, 1, p.sh, lh, 0xffffaabb,  8, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 15, 2, 0, p.sh, lh, 0xffffdaab, 10, tdat );
+
+  TEST_ST_POST_SRC21_BYPASS( 16, 0, 0, p.sh, lh, 0x00002233,  0, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 17, 0, 1, p.sh, lh, 0x00001223,  2, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 18, 0, 2, p.sh, lh, 0x00001122,  4, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 19, 1, 0, p.sh, lh, 0x00000112,  6, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 20, 1, 1, p.sh, lh, 0x00000011,  8, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 21, 2, 0, p.sh, lh, 0x00003001, 10, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0xbeef
+tdat1:  .half 0xbeef
+tdat2:  .half 0xbeef
+tdat3:  .half 0xbeef
+tdat4:  .half 0xbeef
+tdat5:  .half 0xbeef
+tdat6:  .half 0xbeef
+tdat7:  .half 0xbeef
+tdat8:  .half 0xbeef
+tdat9:  .half 0xbeef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rr.S
new file mode 100644
index 000000000..0c5f4cbcb
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rr.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sh_rr.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sh (register-register) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_OP( 2, p.sh, lh, 0x000000aa, 0, tdat );
+  TEST_ST_RR_OP( 3, p.sh, lh, 0xffffaa00, 2, tdat );
+  TEST_ST_RR_OP( 4, p.sh, lw, 0xbeef0aa0, 4, tdat );
+  TEST_ST_RR_OP( 5, p.sh, lh, 0xffffa00a, 6, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_RR_OP( 6, p.sh, lh, 0x000000aa, -6, tdat8 );
+  TEST_ST_RR_OP( 7, p.sh, lh, 0xffffaa00, -4, tdat8 );
+  TEST_ST_RR_OP( 8, p.sh, lh, 0x00000aa0, -2, tdat8 );
+  TEST_ST_RR_OP( 9, p.sh, lh, 0xffffa00a,  0,  tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_SRC12_BYPASS( 10, 0, 0, p.sh, lh, 0xffffccdd,  0, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 11, 0, 1, p.sh, lh, 0xffffbccd,  2, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 12, 0, 2, p.sh, lh, 0xffffbbcc,  4, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 13, 1, 0, p.sh, lh, 0xffffabbc,  6, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 14, 1, 1, p.sh, lh, 0xffffaabb,  8, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 15, 2, 0, p.sh, lh, 0xffffdaab, 10, tdat );
+
+  TEST_ST_RR_SRC21_BYPASS( 16, 0, 0, p.sh, lh, 0x00002233,  0, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 17, 0, 1, p.sh, lh, 0x00001223,  2, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 18, 0, 2, p.sh, lh, 0x00001122,  4, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 19, 1, 0, p.sh, lh, 0x00000112,  6, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 20, 1, 1, p.sh, lh, 0x00000011,  8, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 21, 2, 0, p.sh, lh, 0x00003001, 10, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0xbeef
+tdat1:  .half 0xbeef
+tdat2:  .half 0xbeef
+tdat3:  .half 0xbeef
+tdat4:  .half 0xbeef
+tdat5:  .half 0xbeef
+tdat6:  .half 0xbeef
+tdat7:  .half 0xbeef
+tdat8:  .half 0xbeef
+tdat9:  .half 0xbeef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rrpost.S
new file mode 100644
index 000000000..5dafda6d9
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sh_rrpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sh_rrpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sh (register-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_POST_OP( 2, p.sh, lh, 0x000000aa, 0, tdat );
+  TEST_ST_RR_POST_OP( 3, p.sh, lh, 0xffffaa00, 2, tdat );
+  TEST_ST_RR_POST_OP( 4, p.sh, lw, 0xbeef0aa0, 4, tdat );
+  TEST_ST_RR_POST_OP( 5, p.sh, lh, 0xffffa00a, 6, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_RR_POST_OP( 6, p.sh, lh, 0x000000aa, -6, tdat8 );
+  TEST_ST_RR_POST_OP( 7, p.sh, lh, 0xffffaa00, -4, tdat8 );
+  TEST_ST_RR_POST_OP( 8, p.sh, lh, 0x00000aa0, -2, tdat8 );
+  TEST_ST_RR_POST_OP( 9, p.sh, lh, 0xffffa00a,  0,  tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_POST_SRC12_BYPASS( 10, 0, 0, p.sh, lh, 0xffffccdd,  0, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 11, 0, 1, p.sh, lh, 0xffffbccd,  2, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 12, 0, 2, p.sh, lh, 0xffffbbcc,  4, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 13, 1, 0, p.sh, lh, 0xffffabbc,  6, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 14, 1, 1, p.sh, lh, 0xffffaabb,  8, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 15, 2, 0, p.sh, lh, 0xffffdaab, 10, tdat );
+
+  TEST_ST_RR_POST_SRC21_BYPASS( 16, 0, 0, p.sh, lh, 0x00002233,  0, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 17, 0, 1, p.sh, lh, 0x00001223,  2, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 18, 0, 2, p.sh, lh, 0x00001122,  4, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 19, 1, 0, p.sh, lh, 0x00000112,  6, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 20, 1, 1, p.sh, lh, 0x00000011,  8, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 21, 2, 0, p.sh, lh, 0x00003001, 10, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .half 0xbeef
+tdat1:  .half 0xbeef
+tdat2:  .half 0xbeef
+tdat3:  .half 0xbeef
+tdat4:  .half 0xbeef
+tdat5:  .half 0xbeef
+tdat6:  .half 0xbeef
+tdat7:  .half 0xbeef
+tdat8:  .half 0xbeef
+tdat9:  .half 0xbeef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_irpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_irpost.S
new file mode 100644
index 000000000..bd8c174d4
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_irpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sw_irpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sw (immediate-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_POST_OP( 2, p.sw, lw, 0x00aa00aa,  0, tdat );
+  TEST_ST_POST_OP( 3, p.sw, lw, 0xaa00aa00,  4, tdat );
+  TEST_ST_POST_OP( 4, p.sw, lw, 0x0aa00aa0,  8, tdat );
+  TEST_ST_POST_OP( 5, p.sw, lw, 0xa00aa00a, 12, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_POST_OP( 6, p.sw, lw, 0x00aa00aa, -12, tdat8 );
+  TEST_ST_POST_OP( 7, p.sw, lw, 0xaa00aa00,  -8, tdat8 );
+  TEST_ST_POST_OP( 8, p.sw, lw, 0x0aa00aa0,  -4, tdat8 );
+  TEST_ST_POST_OP( 9, p.sw, lw, 0xa00aa00a,   0, tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_POST_SRC12_BYPASS( 10, 0, 0, p.sw, lw, 0xaabbccdd,  0, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 11, 0, 1, p.sw, lw, 0xdaabbccd,  4, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 12, 0, 2, p.sw, lw, 0xddaabbcc,  8, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 13, 1, 0, p.sw, lw, 0xcddaabbc, 12, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 14, 1, 1, p.sw, lw, 0xccddaabb, 16, tdat );
+  TEST_ST_POST_SRC12_BYPASS( 15, 2, 0, p.sw, lw, 0xbccddaab, 20, tdat );
+
+  TEST_ST_POST_SRC21_BYPASS( 16, 0, 0, p.sw, lw, 0x00112233,  0, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 17, 0, 1, p.sw, lw, 0x30011223,  4, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 18, 0, 2, p.sw, lw, 0x33001122,  8, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 19, 1, 0, p.sw, lw, 0x23300112, 12, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 20, 1, 1, p.sw, lw, 0x22330011, 16, tdat );
+  TEST_ST_POST_SRC21_BYPASS( 21, 2, 0, p.sw, lw, 0x12233001, 20, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0xdeadbeef
+tdat1:  .word 0xdeadbeef
+tdat2:  .word 0xdeadbeef
+tdat3:  .word 0xdeadbeef
+tdat4:  .word 0xdeadbeef
+tdat5:  .word 0xdeadbeef
+tdat6:  .word 0xdeadbeef
+tdat7:  .word 0xdeadbeef
+tdat8:  .word 0xdeadbeef
+tdat9:  .word 0xdeadbeef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rr.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rr.S
new file mode 100644
index 000000000..6a6a53e38
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rr.S
@@ -0,0 +1,72 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sw_rr.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sw (register-register) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_OP( 2, p.sw, lw, 0x00aa00aa,  0, tdat );
+  TEST_ST_RR_OP( 3, p.sw, lw, 0xaa00aa00,  4, tdat );
+  TEST_ST_RR_OP( 4, p.sw, lw, 0x0aa00aa0,  8, tdat );
+  TEST_ST_RR_OP( 5, p.sw, lw, 0xa00aa00a, 12, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_RR_OP( 6, p.sw, lw, 0x00aa00aa, -12, tdat8 );
+  TEST_ST_RR_OP( 7, p.sw, lw, 0xaa00aa00,  -8, tdat8 );
+  TEST_ST_RR_OP( 8, p.sw, lw, 0x0aa00aa0,  -4, tdat8 );
+  TEST_ST_RR_OP( 9, p.sw, lw, 0xa00aa00a,   0, tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_SRC12_BYPASS( 10, 0, 0, p.sw, lw, 0xaabbccdd,  0, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 11, 0, 1, p.sw, lw, 0xdaabbccd,  4, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 12, 0, 2, p.sw, lw, 0xddaabbcc,  8, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 13, 1, 0, p.sw, lw, 0xcddaabbc, 12, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 14, 1, 1, p.sw, lw, 0xccddaabb, 16, tdat );
+  TEST_ST_RR_SRC12_BYPASS( 15, 2, 0, p.sw, lw, 0xbccddaab, 20, tdat );
+
+  TEST_ST_RR_SRC21_BYPASS( 16, 0, 0, p.sw, lw, 0x00112233,  0, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 17, 0, 1, p.sw, lw, 0x30011223,  4, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 18, 0, 2, p.sw, lw, 0x33001122,  8, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 19, 1, 0, p.sw, lw, 0x23300112, 12, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 20, 1, 1, p.sw, lw, 0x22330011, 16, tdat );
+  TEST_ST_RR_SRC21_BYPASS( 21, 2, 0, p.sw, lw, 0x12233001, 20, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0xdeadbeef
+tdat1:  .word 0xdeadbeef
+tdat2:  .word 0xdeadbeef
+tdat3:  .word 0xdeadbeef
+tdat4:  .word 0xdeadbeef
+tdat5:  .word 0xdeadbeef
+tdat6:  .word 0xdeadbeef
+tdat7:  .word 0xdeadbeef
+tdat8:  .word 0xdeadbeef
+tdat9:  .word 0xdeadbeef
+
+RVTEST_DATA_END
+
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rrpost.S b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rrpost.S
new file mode 100644
index 000000000..ce9c58d1a
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/p_sw_rrpost.S
@@ -0,0 +1,71 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# p_sw_rrpost.S
+#-----------------------------------------------------------------------------
+#
+# Test p.sw (register-register post-increment) instruction.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Basic tests
+  #-------------------------------------------------------------
+
+  # Zero and positive offsets
+  TEST_ST_RR_POST_OP( 2, p.sw, lw, 0x00aa00aa,  0, tdat );
+  TEST_ST_RR_POST_OP( 3, p.sw, lw, 0xaa00aa00,  4, tdat );
+  TEST_ST_RR_POST_OP( 4, p.sw, lw, 0x0aa00aa0,  8, tdat );
+  TEST_ST_RR_POST_OP( 5, p.sw, lw, 0xa00aa00a, 12, tdat );
+
+  # Negative offsets, plus a final zero offset
+  TEST_ST_RR_POST_OP( 6, p.sw, lw, 0x00aa00aa, -12, tdat8 );
+  TEST_ST_RR_POST_OP( 7, p.sw, lw, 0xaa00aa00,  -8, tdat8 );
+  TEST_ST_RR_POST_OP( 8, p.sw, lw, 0x0aa00aa0,  -4, tdat8 );
+  TEST_ST_RR_POST_OP( 9, p.sw, lw, 0xa00aa00a,   0, tdat8 );
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  TEST_ST_RR_POST_SRC12_BYPASS( 10, 0, 0, p.sw, lw, 0xaabbccdd,  0, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 11, 0, 1, p.sw, lw, 0xdaabbccd,  4, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 12, 0, 2, p.sw, lw, 0xddaabbcc,  8, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 13, 1, 0, p.sw, lw, 0xcddaabbc, 12, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 14, 1, 1, p.sw, lw, 0xccddaabb, 16, tdat );
+  TEST_ST_RR_POST_SRC12_BYPASS( 15, 2, 0, p.sw, lw, 0xbccddaab, 20, tdat );
+
+  TEST_ST_RR_POST_SRC21_BYPASS( 16, 0, 0, p.sw, lw, 0x00112233,  0, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 17, 0, 1, p.sw, lw, 0x30011223,  4, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 18, 0, 2, p.sw, lw, 0x33001122,  8, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 19, 1, 0, p.sw, lw, 0x23300112, 12, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 20, 1, 1, p.sw, lw, 0x22330011, 16, tdat );
+  TEST_ST_RR_POST_SRC21_BYPASS( 21, 2, 0, p.sw, lw, 0x12233001, 20, tdat );
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+tdat:
+tdat0:  .word 0xdeadbeef
+tdat1:  .word 0xdeadbeef
+tdat2:  .word 0xdeadbeef
+tdat3:  .word 0xdeadbeef
+tdat4:  .word 0xdeadbeef
+tdat5:  .word 0xdeadbeef
+tdat6:  .word 0xdeadbeef
+tdat7:  .word 0xdeadbeef
+tdat8:  .word 0xdeadbeef
+tdat9:  .word 0xdeadbeef
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_abs.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_abs.S
new file mode 100644
index 000000000..79b4eeecd
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_abs.S
@@ -0,0 +1,52 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_abs.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.abs instructions (pv.abs.h and pv.abs.b).
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.abs.h (absolute value of each 16-bit halfword)
+  TEST_R_OP( 2, pv.abs.h, 0x10081554, 0xEFF8EAAC );
+  TEST_R_OP( 3, pv.abs.h, 0x369800DA, 0x3698FF26 );
+  TEST_R_OP( 4, pv.abs.h, 0x7C127B74, 0x7C12848C );
+  # pv.abs.b (absolute value of each byte)
+  TEST_R_OP( 5, pv.abs.b, 0x3A444335, 0x3ABC4335 );
+  TEST_R_OP( 6, pv.abs.b, 0x2B743B7C, 0x2B8C3B7C );
+  TEST_R_OP( 7, pv.abs.b, 0x70362066, 0x70362066 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add source/destination tests using
+  #   TEST_R_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add bypassing tests using
+  #   TEST_R_DEST_BYPASS
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_add.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_add.S
new file mode 100644
index 000000000..0287cc57c
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_add.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_add.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.add instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.add.h: lane-wise add of 16-bit halfwords, no carry between lanes
+  TEST_RR_OP( 2, pv.add.h, 0xC1ACF68C, 0xF014169D, 0xD198DFEF );
+  TEST_RR_OP( 3, pv.add.h, 0x795F026B, 0x7ABB8DD7, 0xFEA47494 );
+  TEST_RR_OP( 4, pv.add.h, 0x8ABE2A6C, 0xAA4F3E71, 0xE06FEBFB );
+  # pv.add.sc.h: low halfword of the second operand added to each halfword
+  TEST_RR_OP( 5, pv.add.sc.h, 0x603D0BE0, 0xFC7EA821, 0x94BF63BF );
+  TEST_RR_OP( 6, pv.add.sc.h, 0x4A3F89DF, 0x8E28CDC8, 0x3230BC17 );
+  TEST_RR_OP( 7, pv.add.sc.h, 0x2034B556, 0x506CE58E, 0x4436CFC8 );
+  # pv.add.sci.h: immediate added to each halfword
+  TEST_SIMM6_OP(  8, pv.add.sci.h, 0x77371C0E, 0x772C1C03, 11 );
+  TEST_SIMM6_OP(  9, pv.add.sci.h, 0xD1BA3380, 0xD1AF3375, 11 );
+  TEST_SIMM6_OP( 10, pv.add.sci.h, 0x6E73CC2D, 0x6E68CC22, 11 );
+  # pv.add.b: lane-wise add of 8-bit bytes, no carry between lanes
+  TEST_RR_OP( 11, pv.add.b, 0x8A1518C0, 0x3E50B3BE, 0x4CC56502 );
+  TEST_RR_OP( 12, pv.add.b, 0xE8E21596, 0x7ECB21CB, 0x6A17F4CB );
+  TEST_RR_OP( 13, pv.add.b, 0xD48653D2, 0x2E741840, 0xA6123B92 );
+  # pv.add.sc.b: low byte of the second operand added to each byte
+  TEST_RR_OP( 14, pv.add.sc.b, 0xC96CF4FF, 0x52F57D88, 0x86A5D077 );
+  TEST_RR_OP( 15, pv.add.sc.b, 0x877D91A1, 0x2F253949, 0x694FD558 );
+  TEST_RR_OP( 16, pv.add.sc.b, 0xC6646B7D, 0x28C6CDDF, 0x1E09659E );
+  # pv.add.sci.b: immediate added to each byte
+  TEST_SIMM6_OP( 17, pv.add.sci.b, 0x3820508C, 0x2D154581, 11 );
+  TEST_SIMM6_OP( 18, pv.add.sci.b, 0xBF98380C, 0xB48D2D01, 11 );
+  TEST_SIMM6_OP( 19, pv.add.sci.b, 0x90AAEB98, 0x859FE08D, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_and.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_and.S
new file mode 100644
index 000000000..328104676
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_and.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_and.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.and instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.and.h: bitwise AND of the two operands, halfword lanes
+  TEST_RR_OP( 2, pv.and.h, 0xE1C028D0, 0xE1D16DD8, 0xE7E4A8F0 );
+  TEST_RR_OP( 3, pv.and.h, 0x30111070, 0xB0111070, 0x3715D975 );
+  TEST_RR_OP( 4, pv.and.h, 0x04000084, 0x04040AA4, 0xBD7314C7 );
+  # pv.and.sc.h: each halfword ANDed with the low halfword of the second operand
+  TEST_RR_OP( 5, pv.and.sc.h, 0x18008480, 0x5818A5AB, 0x7C269E80 );
+  TEST_RR_OP( 6, pv.and.sc.h, 0x0E0B0683, 0xCF2B6697, 0x4E211ECB );
+  TEST_RR_OP( 7, pv.and.sc.h, 0x08070806, 0x28376966, 0xD7848E0F );
+  # pv.and.sci.h: each halfword ANDed with the unsigned immediate
+  TEST_UIMM6_OP(  8, pv.and.sci.h, 0x00010003, 0xBFE568E7, 11 );
+  TEST_UIMM6_OP(  9, pv.and.sci.h, 0x000A0001, 0xC08A6275, 11 );
+  TEST_UIMM6_OP( 10, pv.and.sci.h, 0x000A0008, 0xDFEE3E6C, 11 );
+  # pv.and.b: bitwise AND of the two operands, byte lanes
+  TEST_RR_OP( 11, pv.and.b, 0xA106671C, 0xE317675C, 0xADC6E7BF );
+  TEST_RR_OP( 12, pv.and.b, 0x036080A8, 0xB360A0A8, 0x077A84AC );
+  TEST_RR_OP( 13, pv.and.b, 0x0B430011, 0x1FDB225B, 0xCB431CB5 );
+  # pv.and.sc.b: each byte ANDed with the low byte of the second operand
+  TEST_RR_OP( 14, pv.and.sc.b, 0xE4892568, 0xE48B3778, 0xCCC46AED );
+  TEST_RR_OP( 15, pv.and.sc.b, 0x00000000, 0x0D96B284, 0x8B596F00 );
+  TEST_RR_OP( 16, pv.and.sc.b, 0x49672C2F, 0xC9672CBF, 0x6AC7706F );
+  # pv.and.sci.b: each byte ANDed with the unsigned immediate
+  TEST_UIMM6_OP( 17, pv.and.sci.b, 0x0B090808, 0x8F29C848, 11 );
+  TEST_UIMM6_OP( 18, pv.and.sci.b, 0x000A0908, 0x30EA9D78, 11 );
+  TEST_UIMM6_OP( 19, pv.and.sci.b, 0x03000801, 0x83743C41, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_avg.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_avg.S
new file mode 100644
index 000000000..0a7e1ede3
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_avg.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_avg.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.avg instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.avg.h: halfword lane sums, arithmetically shifted right by one
+  TEST_RR_OP( 2, pv.avg.h, 0xDFA53D57, 0x2C5F4D25, 0x92EC2D89 );
+  TEST_RR_OP( 3, pv.avg.h, 0x18A2C49C, 0xD09FBFB6, 0x60A5C983 );
+  TEST_RR_OP( 4, pv.avg.h, 0xD2902560, 0xE37F8F8F, 0xC1A2BB32 );
+  # pv.avg.sc.h: same, using the low halfword of the second operand for each lane
+  TEST_RR_OP( 5, pv.avg.sc.h, 0xF8B0DF51, 0x6A263768, 0xD18D873A );
+  TEST_RR_OP( 6, pv.avg.sc.h, 0x29B50628, 0xDA3A9320, 0xDB667930 );
+  TEST_RR_OP( 7, pv.avg.sc.h, 0x1CBDF112, 0x4376EC20, 0x13B2F605 );
+  # pv.avg.sci.h: same, using the immediate for each lane
+  TEST_SIMM6_OP(  8, pv.avg.sci.h, 0x2F8BD535, 0x5F0CAA60, 11 );
+  TEST_SIMM6_OP(  9, pv.avg.sci.h, 0x1F1B0A1B, 0x3E2B142C, 11 );
+  TEST_SIMM6_OP( 10, pv.avg.sci.h, 0x1E533C46, 0x3C9C7881, 11 );
+  # pv.avg.b: byte lane sums, arithmetically shifted right by one
+  TEST_RR_OP( 11, pv.avg.b, 0xEF09DD01, 0x242B76A4, 0xBBE7445F );
+  TEST_RR_OP( 12, pv.avg.b, 0x2C31DBEE, 0x7B0B5CD3, 0xDE575B0A );
+  TEST_RR_OP( 13, pv.avg.b, 0xF11E19E0, 0x0278F0DE, 0xE0C543E3 );
+  # pv.avg.sc.b: same, using the low byte of the second operand for each lane
+  TEST_RR_OP( 14, pv.avg.sc.b, 0x12E71EFC, 0x40E95813, 0xDE5394E5 );
+  TEST_RR_OP( 15, pv.avg.sc.b, 0x102204DA, 0xE005C975, 0xE6677040 );
+  TEST_RR_OP( 16, pv.avg.sc.b, 0x1E2ADA29, 0x2840A03D, 0xCF897515 );
+  # pv.avg.sci.b: same, using the immediate for each lane
+  TEST_SIMM6_OP( 17, pv.avg.sci.b, 0x04D5DEFE, 0xFDA0B1F1, 11 );
+  TEST_SIMM6_OP( 18, pv.avg.sci.b, 0xD4192A1E, 0x9E274932, 11 );
+  TEST_SIMM6_OP( 19, pv.avg.sci.b, 0x11E3CFE6, 0x17BC93C1, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_avgu.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_avgu.S
new file mode 100644
index 000000000..1d4c7de8d
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_avgu.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_avgu.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.avgu instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.avgu.h: halfword lane sums, logically shifted right by one
+  TEST_RR_OP( 2, pv.avgu.h, 0x627F5574, 0xA12DA561, 0x23D10588 );
+  TEST_RR_OP( 3, pv.avgu.h, 0x5F5E7CE3, 0x979062E4, 0x272C96E3 );
+  TEST_RR_OP( 4, pv.avgu.h, 0x6D64331C, 0xF472E6FA, 0xE6567F3F );
+  # pv.avgu.sc.h: same, using the low halfword of the second operand for each lane
+  TEST_RR_OP( 5, pv.avgu.sc.h, 0x0CED14D1, 0xD924E8ED, 0xFFB240B6 );
+  TEST_RR_OP( 6, pv.avgu.sc.h, 0x127F3F7B, 0x7447CE40, 0x64E4B0B7 );
+  TEST_RR_OP( 7, pv.avgu.sc.h, 0x737C50C4, 0x7D7C380C, 0xB749697C );
+  # pv.avgu.sci.h: same, using the unsigned immediate for each lane
+  TEST_UIMM6_OP(  8, pv.avgu.sci.h, 0x76BB744A, 0xED6BE88A, 11 );
+  TEST_UIMM6_OP(  9, pv.avgu.sci.h, 0x3BD96A9F, 0x77A8D534, 11 );
+  TEST_UIMM6_OP( 10, pv.avgu.sci.h, 0x551A6EC8, 0xAA29DD86, 11 );
+  # pv.avgu.b: byte lane sums, logically shifted right by one
+  TEST_RR_OP( 11, pv.avgu.b, 0x366D332C, 0x8F75F8E9, 0xDD666F70 );
+  TEST_RR_OP( 12, pv.avgu.b, 0x166D3707, 0x5F0C48DF, 0xCECE2730 );
+  TEST_RR_OP( 13, pv.avgu.b, 0x13390E74, 0x2D0C048B, 0xFA67185E );
+  # pv.avgu.sc.b: same, using the low byte of the second operand for each lane
+  TEST_RR_OP( 14, pv.avgu.sc.b, 0x20102F22, 0xFDDD1B00, 0x65EACB44 );
+  TEST_RR_OP( 15, pv.avgu.sc.b, 0x79130A10, 0x2156444F, 0xAF0796D1 );
+  TEST_RR_OP( 16, pv.avgu.sc.b, 0x44260042, 0x591DD256, 0xFBAE832F );
+  # pv.avgu.sci.b: same, using the unsigned immediate for each lane
+  TEST_UIMM6_OP( 17, pv.avgu.sci.b, 0x016B6549, 0xF7CBBF88, 11 );
+  TEST_UIMM6_OP( 18, pv.avgu.sci.b, 0x742F1E50, 0xDE543195, 11 );
+  TEST_UIMM6_OP( 19, pv.avgu.sci.b, 0x34686166, 0x5EC5B7C1, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotsp.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotsp.S
new file mode 100644
index 000000000..9242c8bb9
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotsp.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_dotsp.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.dotsp instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.dotsp.h: signed dot product of the halfword lanes, 32-bit result
+  TEST_RR_OP( 2, pv.dotsp.h, 0x0023A592, 0xFE67FB1A, 0x50E4DE57 );
+  TEST_RR_OP( 3, pv.dotsp.h, 0x1EC63DAA, 0xADBC1E09, 0xA2C806FA );
+  TEST_RR_OP( 4, pv.dotsp.h, 0x222B210B, 0x3FDAEFE7, 0x7BA5CB0F );
+  # pv.dotsp.sc.h: same, using the low halfword of the second operand for each lane
+  TEST_RR_OP( 5, pv.dotsp.sc.h, 0xDE3EBCF3, 0x5E5C31BF, 0xCB24C409 );
+  TEST_RR_OP( 6, pv.dotsp.sc.h, 0x03F34EE4, 0xEC042250, 0x230A4695 );
+  TEST_RR_OP( 7, pv.dotsp.sc.h, 0x047909E6, 0x6BF5D085, 0x9AB012EF );
+  # pv.dotsp.sci.h: same, using the signed immediate for each lane
+  TEST_SIMM6_OP(  8, pv.dotsp.sci.h, 0xFFFD1338, 0x36D2FEAA, -14 );
+  TEST_SIMM6_OP(  9, pv.dotsp.sci.h, 0xFFFC68FB, 0x6752FECB,  -9 );
+  TEST_SIMM6_OP( 10, pv.dotsp.sci.h, 0x000098C4, 0x9747CFF5,  -1 );
+  # pv.dotsp.b: signed dot product of the byte lanes, 32-bit result
+  TEST_RR_OP( 11, pv.dotsp.b, 0x000003DA, 0xEB8A58F5, 0xCAECEE54 );
+  TEST_RR_OP( 12, pv.dotsp.b, 0xFFFFAD05, 0x47665939, 0x9E989665 );
+  TEST_RR_OP( 13, pv.dotsp.b, 0x00005335, 0x79D072B4, 0x5B8B4327 );
+  # pv.dotsp.sc.b: same, using the low byte of the second operand for each lane
+  TEST_RR_OP( 14, pv.dotsp.sc.b, 0x000059EF, 0x6F622436, 0x1E1E694D );
+  TEST_RR_OP( 15, pv.dotsp.sc.b, 0x00001BDA, 0x77B8759A, 0xC1056E73 );
+  TEST_RR_OP( 16, pv.dotsp.sc.b, 0x00002238, 0x74740933, 0xF898DF1E );
+  # pv.dotsp.sci.b: same, using the signed immediate for each lane
+  TEST_SIMM6_OP( 17, pv.dotsp.sci.b, 0x0000006E, 0x4CD92920,  1 );
+  TEST_SIMM6_OP( 18, pv.dotsp.sci.b, 0xFFFFFE20, 0xAFCE7172, -5 );
+  TEST_SIMM6_OP( 19, pv.dotsp.sci.b, 0xFFFFF9FD, 0xDB25ABAA,  9 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotup.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotup.S
new file mode 100644
index 000000000..71d3e470d
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotup.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_dotup.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.dotup instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.dotup.h: unsigned dot product of the halfword lanes, 32-bit result
+  TEST_RR_OP( 2, pv.dotup.h, 0x2A78A592, 0xFE67FB1A, 0x50E4DE57 );
+  TEST_RR_OP( 3, pv.dotup.h, 0x6F4A3DAA, 0xADBC1E09, 0xA2C806FA );
+  TEST_RR_OP( 4, pv.dotup.h, 0xDD21210B, 0x3FDAEFE7, 0x7BA5CB0F );
+  # pv.dotup.sc.h: same, using the low halfword of the second operand for each lane
+  TEST_RR_OP( 5, pv.dotup.sc.h, 0x6E59BCF3, 0x5E5C31BF, 0xCB24C409 );
+  TEST_RR_OP( 6, pv.dotup.sc.h, 0x4A884EE4, 0xEC042250, 0x230A4695 );
+  TEST_RR_OP( 7, pv.dotup.sc.h, 0x176809E6, 0x6BF5D085, 0x9AB012EF );
+  # pv.dotup.sci.h: same, using the unsigned immediate for each lane
+  TEST_UIMM6_OP(  8, pv.dotup.sci.h, 0x00148D3C, 0x36D2FEAA, 17 );
+  TEST_UIMM6_OP(  9, pv.dotup.sci.h, 0x0002CC3A, 0x6752FECB,  2 );
+  TEST_UIMM6_OP( 10, pv.dotup.sci.h, 0x000F6F94, 0x9747CFF5, 11 );
+  # pv.dotup.b: unsigned dot product of the byte lanes, 32-bit result
+  TEST_RR_OP( 11, pv.dotup.b, 0x0001DADA, 0xEB8A58F5, 0xCAECEE54 );
+  TEST_RR_OP( 12, pv.dotup.b, 0x0000B305, 0x47665939, 0x9E989665 );
+  TEST_RR_OP( 13, pv.dotup.b, 0x0000D535, 0x79D072B4, 0x5B8B4327 );
+  # pv.dotup.sc.b: same, using the low byte of the second operand for each lane
+  TEST_RR_OP( 14, pv.dotup.sc.b, 0x000059EF, 0x6F622436, 0x1E1E694D );
+  TEST_RR_OP( 15, pv.dotup.sc.b, 0x000101DA, 0x77B8759A, 0xC1056E73 );
+  TEST_RR_OP( 16, pv.dotup.sc.b, 0x00002238, 0x74740933, 0xF898DF1E );
+  # pv.dotup.sci.b: same, using the unsigned immediate for each lane
+  TEST_UIMM6_OP( 17, pv.dotup.sci.b, 0x00000FBA, 0x4CD92920, 11 );
+  TEST_UIMM6_OP( 18, pv.dotup.sci.b, 0x00002140, 0xAFCE7172, 14 );
+  TEST_UIMM6_OP( 19, pv.dotup.sci.b, 0x00001053, 0xDB25ABAA,  7 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotusp.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotusp.S
new file mode 100644
index 000000000..17a92e165
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_dotusp.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_dotusp.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.dotusp instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.dotusp.h: halfword dot product, first operand unsigned, second signed
+  TEST_RR_OP( 2, pv.dotusp.h, 0x2F5EA592, 0xFE67FB1A, 0x50E4DE57 );
+  TEST_RR_OP( 3, pv.dotusp.h, 0xC18E3DAA, 0xADBC1E09, 0xA2C806FA );
+  TEST_RR_OP( 4, pv.dotusp.h, 0xED3A210B, 0x3FDAEFE7, 0x7BA5CB0F );
+  # pv.dotusp.sc.h: same, using the low halfword of the second operand for each lane
+  TEST_RR_OP( 5, pv.dotusp.sc.h, 0xDE3EBCF3, 0x5E5C31BF, 0xCB24C409 );
+  TEST_RR_OP( 6, pv.dotusp.sc.h, 0x4A884EE4, 0xEC042250, 0x230A4695 );
+  TEST_RR_OP( 7, pv.dotusp.sc.h, 0x176809E6, 0x6BF5D085, 0x9AB012EF );
+  # pv.dotusp.sci.h: same, using the signed immediate for each lane
+  TEST_SIMM6_OP(  8, pv.dotusp.sci.h, 0xFFEF1338, 0x36D2FEAA, -14 );
+  TEST_SIMM6_OP(  9, pv.dotusp.sci.h, 0xFFF368FB, 0x6752FECB,  -9 );
+  TEST_SIMM6_OP( 10, pv.dotusp.sci.h, 0xFFFE98C4, 0x9747CFF5,  -1 );
+  # pv.dotusp.b: byte dot product, first operand unsigned, second signed
+  TEST_RR_OP( 11, pv.dotusp.b, 0x00000DDA, 0xEB8A58F5, 0xCAECEE54 );
+  TEST_RR_OP( 12, pv.dotusp.b, 0xFFFFAD05, 0x47665939, 0x9E989665 );
+  TEST_RR_OP( 13, pv.dotusp.b, 0x00000535, 0x79D072B4, 0x5B8B4327 );
+  # pv.dotusp.sc.b: same, using the low byte of the second operand for each lane
+  TEST_RR_OP( 14, pv.dotusp.sc.b, 0x000059EF, 0x6F622436, 0x1E1E694D );
+  TEST_RR_OP( 15, pv.dotusp.sc.b, 0x000101DA, 0x77B8759A, 0xC1056E73 );
+  TEST_RR_OP( 16, pv.dotusp.sc.b, 0x00002238, 0x74740933, 0xF898DF1E );
+  # pv.dotusp.sci.b: same, using the signed immediate for each lane
+  TEST_SIMM6_OP( 17, pv.dotusp.sci.b, 0x0000016E, 0x4CD92920,  1 );
+  TEST_SIMM6_OP( 18, pv.dotusp.sci.b, 0xFFFFF420, 0xAFCE7172, -5 );
+  TEST_SIMM6_OP( 19, pv.dotusp.sci.b, 0x000014FD, 0xDB25ABAA,  9 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_extract.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_extract.S
new file mode 100644
index 000000000..5d0a0b70a
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_extract.S
@@ -0,0 +1,65 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_extract.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.extract instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Functional tests (macro args: test number, instruction, expected, operand, index)
+  #-------------------------------------------------------------
+
+  # pv.extract.h: halfword selected by the index, sign-extended to 32 bits
+  TEST_SIMM6_OP( 2, pv.extract.h, 0x00000DEA, 0x53F90DEA, 0 );
+  TEST_SIMM6_OP( 3, pv.extract.h, 0x00000315, 0xC6990315, 0 );
+  TEST_SIMM6_OP( 4, pv.extract.h, 0x00005B08, 0xE1415B08, 0 );
+  TEST_SIMM6_OP( 5, pv.extract.h, 0x00003654, 0x3654249D, 1 );
+  TEST_SIMM6_OP( 6, pv.extract.h, 0x00002EE3, 0x2EE3D9FE, 1 );
+  TEST_SIMM6_OP( 7, pv.extract.h, 0xFFFF93B1, 0x93B1AA99, 1 );
+  # pv.extract.b: byte selected by the index, sign-extended to 32 bits
+  TEST_SIMM6_OP( 8, pv.extract.b, 0xFFFFFFD9, 0x53C073D9, 0 );
+  TEST_SIMM6_OP( 9, pv.extract.b, 0x0000001F, 0x269EFC1F, 0 );
+  TEST_SIMM6_OP( 10, pv.extract.b, 0xFFFFFFAB, 0x0E8CD3AB, 0 );
+  TEST_SIMM6_OP( 11, pv.extract.b, 0x0000004A, 0xF7964A55, 1 );
+  TEST_SIMM6_OP( 12, pv.extract.b, 0x0000006C, 0x1F366C84, 1 );
+  TEST_SIMM6_OP( 13, pv.extract.b, 0x0000005B, 0x11205B09, 1 );
+  TEST_SIMM6_OP( 14, pv.extract.b, 0x00000036, 0x2C36C818, 2 );
+  TEST_SIMM6_OP( 15, pv.extract.b, 0x00000003, 0x4C039923, 2 );
+  TEST_SIMM6_OP( 16, pv.extract.b, 0x0000007E, 0x057ED2EE, 2 );
+  TEST_SIMM6_OP( 17, pv.extract.b, 0x00000056, 0x56B005BB, 3 );
+  TEST_SIMM6_OP( 18, pv.extract.b, 0xFFFFFFE7, 0xE7798BAA, 3 );
+  TEST_SIMM6_OP( 19, pv.extract.b, 0xFFFFFFF3, 0xF3F956A2, 3 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_SIMM6_DEST_BYPASS, TEST_SIMM6_SRC1_BYPASS,
+  # TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_extractu.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_extractu.S
new file mode 100644
index 000000000..ccd6e37c0
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_extractu.S
@@ -0,0 +1,65 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_extractu.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.extractu instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Functional tests (macro args: test number, instruction, expected, operand, index)
+  #-------------------------------------------------------------
+
+  # pv.extractu.h: halfword selected by the index, zero-extended to 32 bits
+  TEST_UIMM6_OP( 2, pv.extractu.h, 0x0000A18D, 0xED2CA18D, 0 );
+  TEST_UIMM6_OP( 3, pv.extractu.h, 0x00006A18, 0x3C576A18, 0 );
+  TEST_UIMM6_OP( 4, pv.extractu.h, 0x000040A2, 0x1DAB40A2, 0 );
+  TEST_UIMM6_OP( 5, pv.extractu.h, 0x0000BC96, 0xBC969BEC, 1 );
+  TEST_UIMM6_OP( 6, pv.extractu.h, 0x0000DF7E, 0xDF7E4D2B, 1 );
+  TEST_UIMM6_OP( 7, pv.extractu.h, 0x000099AE, 0x99AEE13C, 1 );
+  # pv.extractu.b: byte selected by the index, zero-extended to 32 bits
+  TEST_UIMM6_OP( 8, pv.extractu.b, 0x00000046, 0x8FA19B46, 0 );
+  TEST_UIMM6_OP( 9, pv.extractu.b, 0x0000009A, 0xE19C009A, 0 );
+  TEST_UIMM6_OP( 10, pv.extractu.b, 0x0000002A, 0x408D722A, 0 );
+  TEST_UIMM6_OP( 11, pv.extractu.b, 0x0000006C, 0xA2AF6C67, 1 );
+  TEST_UIMM6_OP( 12, pv.extractu.b, 0x0000001F, 0xDE671F25, 1 );
+  TEST_UIMM6_OP( 13, pv.extractu.b, 0x00000046, 0x325D46CE, 1 );
+  TEST_UIMM6_OP( 14, pv.extractu.b, 0x00000003, 0x4603F967, 2 );
+  TEST_UIMM6_OP( 15, pv.extractu.b, 0x000000C1, 0xDBC1292F, 2 );
+  TEST_UIMM6_OP( 16, pv.extractu.b, 0x000000D6, 0xE7D631CF, 2 );
+  TEST_UIMM6_OP( 17, pv.extractu.b, 0x00000020, 0x20B64275, 3 );
+  TEST_UIMM6_OP( 18, pv.extractu.b, 0x000000D6, 0xD64B2CC0, 3 );
+  TEST_UIMM6_OP( 19, pv.extractu.b, 0x00000084, 0x845485BD, 3 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_UIMM6_DEST_BYPASS, TEST_UIMM6_SRC1_BYPASS,
+  # TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_insert.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_insert.S
new file mode 100644
index 000000000..aa427d6ac
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_insert.S
@@ -0,0 +1,87 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_insert.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.insert instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Functional tests (macro args: test number, instruction, expected, operand, index)
+  #-------------------------------------------------------------
+
+  # x14 (presumably the rD used by the test macro) is loaded before
+  # each test case, because pv.insert overwrites only one element and
+  # the expected result depends on the previous state of rD
+
+  # pv.insert.h: low halfword of the operand replaces the indexed halfword of rD
+  li x14, 0x6ACB7454;
+  TEST_UIMM6_OP( 2, pv.insert.h, 0x6ACBF419, 0x3A12F419, 0 );
+  li x14, 0x2BCBE5BA;
+  TEST_UIMM6_OP( 3, pv.insert.h, 0x2BCB3FEE, 0x86013FEE, 0 );
+  li x14, 0x8E18DBE7;
+  TEST_UIMM6_OP( 4, pv.insert.h, 0x8E18C59F, 0x7153C59F, 0 );
+  li x14, 0x57DF0195;
+  TEST_UIMM6_OP( 5, pv.insert.h, 0x00F60195, 0x267700F6, 1 );
+  li x14, 0x7825C668;
+  TEST_UIMM6_OP( 6, pv.insert.h, 0x17F7C668, 0x04A017F7, 1 );
+  li x14, 0xDBC05DC7;
+  TEST_UIMM6_OP( 7, pv.insert.h, 0xF7455DC7, 0x3569F745, 1 );
+  # pv.insert.b: low byte of the operand replaces the indexed byte of rD
+  li x14, 0x5C93979B;
+  TEST_UIMM6_OP( 8, pv.insert.b, 0x5C93979C, 0x955C289C, 0 );
+  li x14, 0x4696DE77;
+  TEST_UIMM6_OP( 9, pv.insert.b, 0x4696DEB0, 0x00E6ADB0, 0 );
+  li x14, 0x48024613;
+  TEST_UIMM6_OP( 10, pv.insert.b, 0x4802465E, 0xE580375E, 0 );
+  li x14, 0x55963E26;
+  TEST_UIMM6_OP( 11, pv.insert.b, 0x5596FC26, 0x215C2AFC, 1 );
+  li x14, 0xAA2930B8;
+  TEST_UIMM6_OP( 12, pv.insert.b, 0xAA29DCB8, 0xE0318DDC, 1 );
+  li x14, 0x844521DE;
+  TEST_UIMM6_OP( 13, pv.insert.b, 0x84459DDE, 0x12ED4F9D, 1 );
+  li x14, 0xFE27DE9A;
+  TEST_UIMM6_OP( 14, pv.insert.b, 0xFED1DE9A, 0xC72B60D1, 2 );
+  li x14, 0x41075730;
+  TEST_UIMM6_OP( 15, pv.insert.b, 0x41065730, 0x63ED6A06, 2 );
+  li x14, 0xFD9C6336;
+  TEST_UIMM6_OP( 16, pv.insert.b, 0xFD426336, 0xA924A142, 2 );
+  li x14, 0x2A3A8341;
+  TEST_UIMM6_OP( 17, pv.insert.b, 0x513A8341, 0x6B50F251, 3 );
+  li x14, 0x59FBF2A7;
+  TEST_UIMM6_OP( 18, pv.insert.b, 0x31FBF2A7, 0x41767331, 3 );
+  li x14, 0xE056E2B2;
+  TEST_UIMM6_OP( 19, pv.insert.b, 0x8C56E2B2, 0x2B08038C, 3 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on
+  # TEST_UIMM6_DEST_BYPASS, TEST_UIMM6_SRC1_BYPASS,
+  # TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_max.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_max.S
new file mode 100644
index 000000000..20f4c69f8
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_max.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_max.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.max instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests (macro args: test number, instruction, expected, operands)
+  #-------------------------------------------------------------
+
+  # pv.max.h: signed maximum of each pair of halfword lanes
+  TEST_RR_OP( 2, pv.max.h, 0x731E1846, 0xF4D3B4D4, 0x731E1846 );
+  TEST_RR_OP( 3, pv.max.h, 0x0E5963C7, 0x0E5963C7, 0xC078A04B );
+  TEST_RR_OP( 4, pv.max.h, 0x10AF37F3, 0x10AF37F3, 0xA4DA964F );
+  # pv.max.sc.h: each halfword compared with the low halfword of the second operand
+  TEST_RR_OP( 5, pv.max.sc.h, 0xFDCCFDCC, 0xC86DA7A4, 0x5AC6FDCC );
+  TEST_RR_OP( 6, pv.max.sc.h, 0x6F096F09, 0x1EBE021F, 0xAEAF6F09 );
+  TEST_RR_OP( 7, pv.max.sc.h, 0x72AF72AF, 0xCD6ACE5B, 0xA0D172AF );
+  # pv.max.sci.h: each halfword compared with the signed immediate
+  TEST_SIMM6_OP(  8, pv.max.sci.h, 0x000B1FDD, 0xAD6D1FDD, 11 );
+  TEST_SIMM6_OP(  9, pv.max.sci.h, 0x000B000B, 0xAAF6EBB0, 11 );
+  TEST_SIMM6_OP( 10, pv.max.sci.h, 0x252B000B, 0x252BC255, 11 );
+  # pv.max.b: signed maximum of each pair of byte lanes
+  TEST_RR_OP( 11, pv.max.b, 0xF2402D09, 0xB040FD9D, 0xF2842D09 );
+  TEST_RR_OP( 12, pv.max.b, 0x78652008, 0xD749FDBE, 0x78652008 );
+  TEST_RR_OP( 13, pv.max.b, 0xC5755F6A, 0xC5755F6A, 0xAD1CD088 );
+  # pv.max.sc.b: each byte compared with the low byte of the second operand
+  TEST_RR_OP( 14, pv.max.sc.b, 0x75757575, 0x01B6C06B, 0xC1698275 );
+  TEST_RR_OP( 15, pv.max.sc.b, 0x7B7B7B7B, 0x4A547B78, 0xCD4D377B );
+  TEST_RR_OP( 16, pv.max.sc.b, 0x5D7B5D5F, 0x027B0E5F, 0x595E995D );
+  # pv.max.sci.b: each byte compared with the signed immediate
+  TEST_SIMM6_OP( 17, pv.max.sci.b, 0x0B0B0B0B, 0xEB06FBAB, 11 );
+  TEST_SIMM6_OP( 18, pv.max.sci.b, 0x56240B26, 0x56249726, 11 );
+  TEST_SIMM6_OP( 19, pv.max.sci.b, 0x5F32211E, 0x5F32211E, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola): add tests based on the following macros:
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_maxu.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_maxu.S
new file mode 100644
index 000000000..e5451559f
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_maxu.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_maxu.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.maxu instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.maxu.h
+  TEST_RR_OP( 2, pv.maxu.h, 0xBA529136, 0x3C369136, 0xBA524CAE );
+  TEST_RR_OP( 3, pv.maxu.h, 0xC9E65AD2, 0xC9E60677, 0x00145AD2 );
+  TEST_RR_OP( 4, pv.maxu.h, 0x42D67990, 0x42D67990, 0x244A0E31 );
+  # pv.maxu.sc.h
+  TEST_RR_OP( 5, pv.maxu.sc.h, 0x36D0CB1F, 0x36D0CB1F, 0x426D0434 );
+  TEST_RR_OP( 6, pv.maxu.sc.h, 0xAE6DE3C7, 0xAE6DE3C7, 0x6ACB58AD );
+  TEST_RR_OP( 7, pv.maxu.sc.h, 0xB6CDD3B0, 0xB6CDD3B0, 0x1CF29759 );
+  # pv.maxu.sci.h
+  TEST_UIMM6_OP(  8, pv.maxu.sci.h, 0xF503CA6A, 0xF503CA6A, 11 );
+  TEST_UIMM6_OP(  9, pv.maxu.sci.h, 0x6781179C, 0x6781179C, 11 );
+  TEST_UIMM6_OP( 10, pv.maxu.sci.h, 0xB778D8A3, 0xB778D8A3, 11 );
+  # pv.maxu.b
+  TEST_RR_OP( 11, pv.maxu.b, 0x17F9C1D2, 0x0DF91FD2, 0x1703C18D );
+  TEST_RR_OP( 12, pv.maxu.b, 0xD04FEFA4, 0x434FEFA4, 0xD032B42E );
+  TEST_RR_OP( 13, pv.maxu.b, 0x8A95BFF2, 0x56955708, 0x8A08BFF2 );
+  # pv.maxu.sc.b
+  TEST_RR_OP( 14, pv.maxu.sc.b, 0xE8E8E8E8, 0x318C6A64, 0x82B8BEE8 );
+  TEST_RR_OP( 15, pv.maxu.sc.b, 0xDFE73926, 0xDFE73909, 0xBFC58126 );
+  TEST_RR_OP( 16, pv.maxu.sc.b, 0x9DF09D9D, 0x6FF07641, 0x5689109D );
+  # pv.maxu.sci.b
+  TEST_UIMM6_OP( 17, pv.maxu.sci.b, 0x0B7062D8, 0x0A7062D8, 11 );
+  TEST_UIMM6_OP( 18, pv.maxu.sci.b, 0x0B469D5D, 0x07469D5D, 11 );
+  TEST_UIMM6_OP( 19, pv.maxu.sci.b, 0x4E875E27, 0x4E875E27, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_min.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_min.S
new file mode 100644
index 000000000..c402ffad2
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_min.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_min.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.min instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.min.h
+  TEST_RR_OP( 2, pv.min.h, 0x82E7A6AB, 0x8A92A6AB, 0x82E77B73 );
+  TEST_RR_OP( 3, pv.min.h, 0xE3770915, 0xE37773E8, 0x44330915 );
+  TEST_RR_OP( 4, pv.min.h, 0x85B0BA95, 0x85B0E12E, 0x6CA1BA95 );
+  # pv.min.sc.h
+  TEST_RR_OP( 5, pv.min.sc.h, 0x33A388EB, 0x33A388EB, 0xA73B6225 );
+  TEST_RR_OP( 6, pv.min.sc.h, 0xB2D1B2D1, 0x6C255634, 0xC06DB2D1 );
+  TEST_RR_OP( 7, pv.min.sc.h, 0xED24CC32, 0xED24CC32, 0x20DD20AE );
+  # pv.min.sci.h
+  TEST_SIMM6_OP(  8, pv.min.sci.h, 0x000BABA8, 0x3116ABA8, 11 );
+  TEST_SIMM6_OP(  9, pv.min.sci.h, 0xF270FB23, 0xF270FB23, 11 );
+  TEST_SIMM6_OP( 10, pv.min.sci.h, 0xF45DE902, 0xF45DE902, 11 );
+  # pv.min.b
+  TEST_RR_OP( 11, pv.min.b, 0x3BD1A58C, 0x3BF5A5CD, 0x59D1618C );
+  TEST_RR_OP( 12, pv.min.b, 0x99C52CBA, 0xF4D42C6F, 0x99C57ABA );
+  TEST_RR_OP( 13, pv.min.b, 0x13CB8AE9, 0x13CB8AE9, 0x47F8D538 );
+  # pv.min.sc.b
+  TEST_RR_OP( 14, pv.min.sc.b, 0xDCC161BB, 0xDCC161BB, 0x41A0EA7B );
+  TEST_RR_OP( 15, pv.min.sc.b, 0x3059A553, 0x3059A553, 0xB80EA978 );
+  TEST_RR_OP( 16, pv.min.sc.b, 0x97ECEFEF, 0x97EC4211, 0x8059FEEF );
+  # pv.min.sci.b
+  TEST_SIMM6_OP( 17, pv.min.sci.b, 0x0B0BB986, 0x732DB986, 11 );
+  TEST_SIMM6_OP( 18, pv.min.sci.b, 0xF40B0BE7, 0xF47567E7, 11 );
+  TEST_SIMM6_OP( 19, pv.min.sci.b, 0x0BF70B0B, 0x7DF77268, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_minu.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_minu.S
new file mode 100644
index 000000000..4c875e427
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_minu.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_minu.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.minu instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.minu.h
+  TEST_RR_OP( 2, pv.minu.h, 0x6A212A68, 0xE2B42A68, 0x6A212B4A );
+  TEST_RR_OP( 3, pv.minu.h, 0x50AA637D, 0xC219637D, 0x50AA84CC );
+  TEST_RR_OP( 4, pv.minu.h, 0x579039EE, 0x92C439EE, 0x5790DCDC );
+  # pv.minu.sc.h
+  TEST_RR_OP( 5, pv.minu.sc.h, 0x7EF4A8D0, 0x7EF4D67D, 0x268CA8D0 );
+  TEST_RR_OP( 6, pv.minu.sc.h, 0x8C9F3E8A, 0x9A9F3E8A, 0x2E6A8C9F );
+  TEST_RR_OP( 7, pv.minu.sc.h, 0x844C6178, 0x844C6178, 0x7A819ECF );
+  # pv.minu.sci.h
+  TEST_UIMM6_OP(  8, pv.minu.sci.h, 0x000B000B, 0x2E8024BF, 11 );
+  TEST_UIMM6_OP(  9, pv.minu.sci.h, 0x000B000B, 0x7070C7D7, 11 );
+  TEST_UIMM6_OP( 10, pv.minu.sci.h, 0x000B000B, 0x6955494F, 11 );
+  # pv.minu.b
+  TEST_RR_OP( 11, pv.minu.b, 0x46A4170C, 0x46F51795, 0xAAA4C60C );
+  TEST_RR_OP( 12, pv.minu.b, 0xA33FAB2C, 0xA33FAB2C, 0xE140C044 );
+  TEST_RR_OP( 13, pv.minu.b, 0x43015111, 0xF49B5111, 0x43018736 );
+  # pv.minu.sc.b
+  TEST_RR_OP( 14, pv.minu.sc.b, 0x3EB0291F, 0x3EBF291F, 0x909B9AB0 );
+  TEST_RR_OP( 15, pv.minu.sc.b, 0x000C0C0C, 0x00555837, 0x7F61610C );
+  TEST_RR_OP( 16, pv.minu.sc.b, 0x01010101, 0x47AB06B4, 0x0E0F9001 );
+  # pv.minu.sci.b
+  TEST_UIMM6_OP( 17, pv.minu.sci.b, 0x0B0B0B0B, 0xBEAF5AAB, 11 );
+  TEST_UIMM6_OP( 18, pv.minu.sci.b, 0x0B0B0B0B, 0xDC152410, 11 );
+  TEST_UIMM6_OP( 19, pv.minu.sci.b, 0x0B0B0B0B, 0x1DAD56C8, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_or.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_or.S
new file mode 100644
index 000000000..821377c14
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_or.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_or.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.or instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.or.h
+  TEST_RR_OP( 2, pv.or.h, 0xDFEFB3F3, 0x9E678370, 0x418EB1F3 );
+  TEST_RR_OP( 3, pv.or.h, 0x7EFFDD7D, 0x727D5079, 0x7C968D05 );
+  TEST_RR_OP( 4, pv.or.h, 0x5FAEEFD9, 0x4BAEA991, 0x1604EFD9 );
+  # pv.or.sc.h
+  TEST_RR_OP( 5, pv.or.sc.h, 0x7FB2EFF1, 0x5782C951, 0x886D2FB0 );
+  TEST_RR_OP( 6, pv.or.sc.h, 0x7F9E6FF9, 0x5E0E04F9, 0x248B6F98 );
+  TEST_RR_OP( 7, pv.or.sc.h, 0xBD9BBD4B, 0x85998C42, 0xD1F2B90B );
+  # pv.or.sci.h
+  TEST_UIMM6_OP(  8, pv.or.sci.h, 0x4F6F5CBF, 0x4F645CB5, 11 );
+  TEST_UIMM6_OP(  9, pv.or.sci.h, 0x34DF2B7B, 0x34DD2B73, 11 );
+  TEST_UIMM6_OP( 10, pv.or.sci.h, 0xE73F5DEF, 0xE73D5DE5, 11 );
+  # pv.or.b
+  TEST_RR_OP( 11, pv.or.b, 0xFDFAFC34, 0x25AA9830, 0xD8706434 );
+  TEST_RR_OP( 12, pv.or.b, 0x9C7BF5EF, 0x9C41746A, 0x003BC1ED );
+  TEST_RR_OP( 13, pv.or.b, 0x7BEBAEFF, 0x7B4BA8E2, 0x1BA3263F );
+  # pv.or.sc.b
+  TEST_RR_OP( 14, pv.or.sc.b, 0xFFF7B7B7, 0xFDE72320, 0x26977A97 );
+  TEST_RR_OP( 15, pv.or.sc.b, 0xFFD5F5FD, 0xEA55E02C, 0x0D23AFD5 );
+  TEST_RR_OP( 16, pv.or.sc.b, 0xDBDFFBD3, 0x9B8E6BC3, 0xCF31CDD3 );
+  # pv.or.sci.b
+  TEST_UIMM6_OP( 17, pv.or.sci.b, 0xCBCF2F9B, 0xC2CC2499, 11 );
+  TEST_UIMM6_OP( 18, pv.or.sci.b, 0x3BEF1BEB, 0x33EE13E0, 11 );
+  TEST_UIMM6_OP( 19, pv.or.sci.b, 0xFB5B5BEF, 0xF05252EE, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotsp.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotsp.S
new file mode 100644
index 000000000..c41784263
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotsp.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sdotsp.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sdotsp instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sdotsp.h
+  TEST_RRR_OP( 2, pv.sdotsp.h, 0x8588AF48, 0xFE67FB1A, 0x50E4DE57, 0x856509B6 );
+  TEST_RRR_OP( 3, pv.sdotsp.h, 0xA5102DA6, 0xADBC1E09, 0xA2C806FA, 0x8649EFFC );
+  TEST_RRR_OP( 4, pv.sdotsp.h, 0xB6C05945, 0x3FDAEFE7, 0x7BA5CB0F, 0x9495383A );
+  # pv.sdotsp.sc.h
+  TEST_RRR_OP( 5, pv.sdotsp.sc.h, 0x76464853, 0x5E5C31BF, 0xCB24C409, 0x98078B60 );
+  TEST_RRR_OP( 6, pv.sdotsp.sc.h, 0xBAB1856D, 0xEC042250, 0x230A4695, 0xB6BE3689 );
+  TEST_RRR_OP( 7, pv.sdotsp.sc.h, 0xA318DEC3, 0x6BF5D085, 0x9AB012EF, 0x9E9FD4DD );
+  # pv.sdotsp.sci.h
+  TEST_RR_SIMM6_OP(  8, pv.sdotsp.sci.h, 0x6AA9C4BB, 0x36D2FEAA, -14, 0x6AACB183 );
+  TEST_RR_SIMM6_OP(  9, pv.sdotsp.sci.h, 0xA61C8356, 0x6752FECB,  -9, 0xA6201A5B );
+  TEST_RR_SIMM6_OP( 10, pv.sdotsp.sci.h, 0x968EF09B, 0x9747CFF5,  -1, 0x968E57D7 );
+  # pv.sdotsp.b
+  TEST_RRR_OP( 11, pv.sdotsp.b, 0x6BF81516, 0xEB8A58F5, 0xCAECEE54, 0x6BF8113C );
+  TEST_RRR_OP( 12, pv.sdotsp.b, 0x5D238DA6, 0x47665939, 0x9E989665, 0x5D23E0A1 );
+  TEST_RRR_OP( 13, pv.sdotsp.b, 0xC511714F, 0x79D072B4, 0x5B8B4327, 0xC5111E1A );
+  # pv.sdotsp.sc.b
+  TEST_RRR_OP( 14, pv.sdotsp.sc.b, 0x7C691AEB, 0x6F622436, 0x1E1E694D, 0x7C68C0FC );
+  TEST_RRR_OP( 15, pv.sdotsp.sc.b, 0xAC521CE2, 0x77B8759A, 0xC1056E73, 0xAC520108 );
+  TEST_RRR_OP( 16, pv.sdotsp.sc.b, 0xAEA211C3, 0x74740933, 0xF898DF1E, 0xAEA1EF8B );
+  # pv.sdotsp.sci.b
+  TEST_RR_SIMM6_OP( 17, pv.sdotsp.sci.b, 0x86CD84EE, 0x4CD92920,  1, 0x86CD8480 );
+  TEST_RR_SIMM6_OP( 18, pv.sdotsp.sci.b, 0x82399E03, 0xAFCE7172, -5, 0x82399FE3 );
+  TEST_RR_SIMM6_OP( 19, pv.sdotsp.sci.b, 0x3F752492, 0xDB25ABAA,  9, 0x3F752A95 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_SRC1_EQ_DEST,
+  # TEST_RRR_SRC2_EQ_DEST, TEST_RRR_SRC12_EQ_DEST
+  # for reg-simm6-reg instructions *macros still to be written*
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_DEST_BYPASS,
+  # TEST_RRR_SRC12_BYPASS, TEST_RRR_SRC21_BYPASS, TEST_RRR_SRC3_BYPASS,
+  # TEST_RRR_ZEROSRC1, TEST_RRR_ZEROSRC2, TEST_RRR_ZEROSRC3,
+  # TEST_RRR_ZEROSRC12, TEST_RRR_ZEROSRC123, TEST_RRR_ZERODEST
+  # for reg-simm6-reg instructions *macros still to be written*
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotup.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotup.S
new file mode 100644
index 000000000..7e99c6415
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotup.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sdotup.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sdotup instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sdotup.h
+  TEST_RRR_OP( 2, pv.sdotup.h, 0xAFDDAF48, 0xFE67FB1A, 0x50E4DE57, 0x856509B6 );
+  TEST_RRR_OP( 3, pv.sdotup.h, 0xF5942DA6, 0xADBC1E09, 0xA2C806FA, 0x8649EFFC );
+  TEST_RRR_OP( 4, pv.sdotup.h, 0x71B65945, 0x3FDAEFE7, 0x7BA5CB0F, 0x9495383A );
+  # pv.sdotup.sc.h
+  TEST_RRR_OP( 5, pv.sdotup.sc.h, 0x06614853, 0x5E5C31BF, 0xCB24C409, 0x98078B60 );
+  TEST_RRR_OP( 6, pv.sdotup.sc.h, 0x0146856D, 0xEC042250, 0x230A4695, 0xB6BE3689 );
+  TEST_RRR_OP( 7, pv.sdotup.sc.h, 0xB607DEC3, 0x6BF5D085, 0x9AB012EF, 0x9E9FD4DD );
+  # pv.sdotup.sci.h
+  TEST_RR_UIMM6_OP(  8, pv.sdotup.sci.h, 0x6AC13EBF, 0x36D2FEAA, 17, 0x6AACB183 );
+  TEST_RR_UIMM6_OP(  9, pv.sdotup.sci.h, 0xA622E695, 0x6752FECB,  2, 0xA6201A5B );
+  TEST_RR_UIMM6_OP( 10, pv.sdotup.sci.h, 0x969DC76B, 0x9747CFF5, 11, 0x968E57D7 );
+  # pv.sdotup.b
+  TEST_RRR_OP( 11, pv.sdotup.b, 0x6BF9EC16, 0xEB8A58F5, 0xCAECEE54, 0x6BF8113C );
+  TEST_RRR_OP( 12, pv.sdotup.b, 0x5D2493A6, 0x47665939, 0x9E989665, 0x5D23E0A1 );
+  TEST_RRR_OP( 13, pv.sdotup.b, 0xC511F34F, 0x79D072B4, 0x5B8B4327, 0xC5111E1A );
+  # pv.sdotup.sc.b
+  TEST_RRR_OP( 14, pv.sdotup.sc.b, 0x7C691AEB, 0x6F622436, 0x1E1E694D, 0x7C68C0FC );
+  TEST_RRR_OP( 15, pv.sdotup.sc.b, 0xAC5302E2, 0x77B8759A, 0xC1056E73, 0xAC520108 );
+  TEST_RRR_OP( 16, pv.sdotup.sc.b, 0xAEA211C3, 0x74740933, 0xF898DF1E, 0xAEA1EF8B );
+  # pv.sdotup.sci.b
+  TEST_RR_UIMM6_OP( 17, pv.sdotup.sci.b, 0x86CD943A, 0x4CD92920, 11, 0x86CD8480 );
+  TEST_RR_UIMM6_OP( 18, pv.sdotup.sci.b, 0x8239C123, 0xAFCE7172, 14, 0x82399FE3 );
+  TEST_RR_UIMM6_OP( 19, pv.sdotup.sci.b, 0x3F753AE8, 0xDB25ABAA,  7, 0x3F752A95 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_SRC1_EQ_DEST,
+  # TEST_RRR_SRC2_EQ_DEST, TEST_RRR_SRC12_EQ_DEST
+  # for reg-uimm6-reg instructions *macros still to be written*
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_DEST_BYPASS,
+  # TEST_RRR_SRC12_BYPASS, TEST_RRR_SRC21_BYPASS, TEST_RRR_SRC3_BYPASS,
+  # TEST_RRR_ZEROSRC1, TEST_RRR_ZEROSRC2, TEST_RRR_ZEROSRC3,
+  # TEST_RRR_ZEROSRC12, TEST_RRR_ZEROSRC123, TEST_RRR_ZERODEST
+  # for reg-uimm6-reg instructions *macros still to be written*
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotusp.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotusp.S
new file mode 100644
index 000000000..30f30ba24
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sdotusp.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sdotusp.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sdotusp instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sdotusp.h
+  TEST_RRR_OP( 2, pv.sdotusp.h, 0xB4C3AF48, 0xFE67FB1A, 0x50E4DE57, 0x856509B6 );
+  TEST_RRR_OP( 3, pv.sdotusp.h, 0x47D82DA6, 0xADBC1E09, 0xA2C806FA, 0x8649EFFC );
+  TEST_RRR_OP( 4, pv.sdotusp.h, 0x81CF5945, 0x3FDAEFE7, 0x7BA5CB0F, 0x9495383A );
+  # pv.sdotusp.sc.h
+  TEST_RRR_OP( 5, pv.sdotusp.sc.h, 0x76464853, 0x5E5C31BF, 0xCB24C409, 0x98078B60 );
+  TEST_RRR_OP( 6, pv.sdotusp.sc.h, 0x0146856D, 0xEC042250, 0x230A4695, 0xB6BE3689 );
+  TEST_RRR_OP( 7, pv.sdotusp.sc.h, 0xB607DEC3, 0x6BF5D085, 0x9AB012EF, 0x9E9FD4DD );
+  # pv.sdotusp.sci.h
+  TEST_RR_SIMM6_OP(  8, pv.sdotusp.sci.h, 0x6A9BC4BB, 0x36D2FEAA, -14, 0x6AACB183 );
+  TEST_RR_SIMM6_OP(  9, pv.sdotusp.sci.h, 0xA6138356, 0x6752FECB,  -9, 0xA6201A5B );
+  TEST_RR_SIMM6_OP( 10, pv.sdotusp.sci.h, 0x968CF09B, 0x9747CFF5,  -1, 0x968E57D7 );
+  # pv.sdotusp.b
+  TEST_RRR_OP( 11, pv.sdotusp.b, 0x6BF81F16, 0xEB8A58F5, 0xCAECEE54, 0x6BF8113C );
+  TEST_RRR_OP( 12, pv.sdotusp.b, 0x5D238DA6, 0x47665939, 0x9E989665, 0x5D23E0A1 );
+  TEST_RRR_OP( 13, pv.sdotusp.b, 0xC511234F, 0x79D072B4, 0x5B8B4327, 0xC5111E1A );
+  # pv.sdotusp.sc.b
+  TEST_RRR_OP( 14, pv.sdotusp.sc.b, 0x7C691AEB, 0x6F622436, 0x1E1E694D, 0x7C68C0FC );
+  TEST_RRR_OP( 15, pv.sdotusp.sc.b, 0xAC5302E2, 0x77B8759A, 0xC1056E73, 0xAC520108 );
+  TEST_RRR_OP( 16, pv.sdotusp.sc.b, 0xAEA211C3, 0x74740933, 0xF898DF1E, 0xAEA1EF8B );
+  # pv.sdotusp.sci.b
+  TEST_RR_SIMM6_OP( 17, pv.sdotusp.sci.b, 0x86CD85EE, 0x4CD92920,  1, 0x86CD8480 );
+  TEST_RR_SIMM6_OP( 18, pv.sdotusp.sci.b, 0x82399403, 0xAFCE7172, -5, 0x82399FE3 );
+  TEST_RR_SIMM6_OP( 19, pv.sdotusp.sci.b, 0x3F753F92, 0xDB25ABAA,  9, 0x3F752A95 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_SRC1_EQ_DEST,
+  # TEST_RRR_SRC2_EQ_DEST, TEST_RRR_SRC12_EQ_DEST
+  # for reg-simm6-reg instructions *macros still to be written*
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_DEST_BYPASS,
+  # TEST_RRR_SRC12_BYPASS, TEST_RRR_SRC21_BYPASS, TEST_RRR_SRC3_BYPASS,
+  # TEST_RRR_ZEROSRC1, TEST_RRR_ZEROSRC2, TEST_RRR_ZEROSRC3,
+  # TEST_RRR_ZEROSRC12, TEST_RRR_ZEROSRC123, TEST_RRR_ZERODEST
+  # for reg-simm6-reg instructions *macros still to be written*
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_shuffle2.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_shuffle2.S
new file mode 100644
index 000000000..fd3f2bf09
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_shuffle2.S
@@ -0,0 +1,70 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_shuffle2.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.shuffle2 instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.shuffle2.h
+  TEST_RRR_OP(  2, pv.shuffle2.h, 0xABD0A85B, 0xABD03F6E, 0x1D4B26D0, 0xF0C3A85B );
+  TEST_RRR_OP(  3, pv.shuffle2.h, 0x93A60706, 0x511B0706, 0xEB397322, 0x93A613C6 );
+  TEST_RRR_OP(  4, pv.shuffle2.h, 0x2BCE15F2, 0x9D2D15F2, 0x5C71278E, 0x2BCEDA18 );
+  TEST_RRR_OP(  5, pv.shuffle2.h, 0x2C48AA34, 0x2C48AA34, 0x4887D28E, 0x55247E80 );
+  TEST_RRR_OP(  6, pv.shuffle2.h, 0xE999ADE8, 0xADE8E999, 0xD26AD68F, 0x23A14961 );
+  TEST_RRR_OP(  7, pv.shuffle2.h, 0x0059517C, 0x6BF30059, 0xEFB6AF79, 0x517C1495 );
+  TEST_RRR_OP(  8, pv.shuffle2.h, 0xB7FEA035, 0xB7FED864, 0x5BBB1058, 0x4583A035 );
+  TEST_RRR_OP(  9, pv.shuffle2.h, 0xEA55FDC2, 0xFDC2EA55, 0x7292CF23, 0x4F82A53E );
+  TEST_RRR_OP( 10, pv.shuffle2.h, 0xBE7232CB, 0x32CBBE72, 0x6DB6060F, 0x22C33B63 );
+  TEST_RRR_OP( 11, pv.shuffle2.h, 0x4389A2A3, 0xCB19A2A3, 0x00BCDD22, 0xFB744389 );
+  # pv.shuffle2.b
+  TEST_RRR_OP( 12, pv.shuffle2.b, 0xDAD9ECA3, 0x35A309D9, 0x8AE410B6, 0x22DA0BEC );
+  TEST_RRR_OP( 13, pv.shuffle2.b, 0x0EF485F4, 0xCA850EB8, 0x256B969B, 0xF438D1D7 );
+  TEST_RRR_OP( 14, pv.shuffle2.b, 0x1414E4C0, 0xE433C0A1, 0xC8381F65, 0xAC7DBC14 );
+  TEST_RRR_OP( 15, pv.shuffle2.b, 0x81676762, 0x36DE6217, 0xC98AEA7D, 0x9D6781F4 );
+  TEST_RRR_OP( 16, pv.shuffle2.b, 0xD80DD8B4, 0xD8CE132C, 0x67D8BF89, 0x166FB40D );
+  TEST_RRR_OP( 17, pv.shuffle2.b, 0x7B9E0404, 0x657BF4D6, 0x06DB0232, 0x9E049D7E );
+  TEST_RRR_OP( 18, pv.shuffle2.b, 0xD15526EE, 0x617EEED1, 0xE4D33275, 0x55264DEE );
+  TEST_RRR_OP( 19, pv.shuffle2.b, 0x73AB4CAB, 0x43AB21CB, 0x4B2EC0BE, 0x7306984C );
+  TEST_RRR_OP( 20, pv.shuffle2.b, 0x5235C41D, 0x052B5263, 0x85BB52D0, 0x35C4A31D );
+  TEST_RRR_OP( 21, pv.shuffle2.b, 0xF1E0F194, 0xFFABF194, 0x35CBE594, 0xE0A7A1D1 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_SRC1_EQ_DEST,
+  # TEST_RRR_SRC2_EQ_DEST, TEST_RRR_SRC12_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for reg-reg-reg instructions TEST_RRR_DEST_BYPASS,
+  # TEST_RRR_SRC12_BYPASS, TEST_RRR_SRC21_BYPASS, TEST_RRR_SRC3_BYPASS,
+  # TEST_RRR_ZEROSRC1, TEST_RRR_ZEROSRC2, TEST_RRR_ZEROSRC3,
+  # TEST_RRR_ZEROSRC12, TEST_RRR_ZEROSRC123, TEST_RRR_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sll.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sll.S
new file mode 100644
index 000000000..3e44223bd
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sll.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sll.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sll instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sll.h
+  TEST_RR_OP( 2, pv.sll.h, 0x40000E80, 0xCC85D0E8, 0x000E0004 );
+  TEST_RR_OP( 3, pv.sll.h, 0xF0802C00, 0x83E1502C, 0x00070008 );
+  TEST_RR_OP( 4, pv.sll.h, 0x20005470, 0x8AA1551C, 0x000D0002 );
+  # pv.sll.sc.h
+  TEST_RR_OP( 5, pv.sll.sc.h, 0x81F03608, 0x103E26C1, 0x000A0003 );
+  TEST_RR_OP( 6, pv.sll.sc.h, 0x1B800F00, 0x0437CE1E, 0x00080007 );
+  TEST_RR_OP( 7, pv.sll.sc.h, 0xC7002900, 0xE5C75029, 0x000D0008 );
+  # pv.sll.sci.h
+  TEST_UIMM6_OP(  8, pv.sll.sci.h, 0x46002600, 0x48233B93, 9 );
+  TEST_UIMM6_OP(  9, pv.sll.sci.h, 0x9600AC00, 0x344B9356, 9 );
+  TEST_UIMM6_OP( 10, pv.sll.sci.h, 0x40002E00, 0xB2A0E417, 9 );
+  # pv.sll.b
+  TEST_RR_OP( 11, pv.sll.b, 0x7EE05CA8, 0x3FCE5C2A, 0x01040002 );
+  TEST_RR_OP( 12, pv.sll.b, 0xA45E8034, 0xE95E5934, 0x02000700 );
+  TEST_RR_OP( 13, pv.sll.b, 0xB0780068, 0xFB8FA8B4, 0x04030501 );
+  # pv.sll.sc.b
+  TEST_RR_OP( 14, pv.sll.sc.b, 0xF05000C0, 0x5FF510FC, 0x02020004 );
+  TEST_RR_OP( 15, pv.sll.sc.b, 0xA0C0B0A0, 0x2A6CFB1A, 0x01000604 );
+  TEST_RR_OP( 16, pv.sll.sc.b, 0x047E94F8, 0x823FCAFC, 0x07010601 );
+  # pv.sll.sci.b
+  TEST_UIMM6_OP( 17, pv.sll.sci.b, 0x60002020, 0x8BA0A901, 5 );
+  TEST_UIMM6_OP( 18, pv.sll.sci.b, 0x604020E0, 0x83CAE947, 5 );
+  TEST_UIMM6_OP( 19, pv.sll.sci.b, 0x0020A060, 0x98F185C3, 5 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sra.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sra.S
new file mode 100644
index 000000000..d0bfe0d89
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sra.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sra.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sra instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sra.h
+  TEST_RR_OP( 2, pv.sra.h, 0x2C4BFFF8, 0x5896C0A3, 0x0001000B );
+  TEST_RR_OP( 3, pv.sra.h, 0x005E760F, 0x5E43760F, 0x00080000 );
+  TEST_RR_OP( 4, pv.sra.h, 0x0059F46B, 0x166BA35F, 0x00060003 );
+  # pv.sra.sc.h
+  TEST_RR_OP( 5, pv.sra.sc.h, 0x00000005, 0x080A5F54, 0x000D000C );
+  TEST_RR_OP( 6, pv.sra.sc.h, 0xFF0400FB, 0xE0871F6D, 0x00020005 );
+  TEST_RR_OP( 7, pv.sra.sc.h, 0x00010000, 0x40FF2C98, 0x000B000E );
+  # pv.sra.sci.h
+  TEST_UIMM6_OP(  8, pv.sra.sci.h, 0xFFE70029, 0xCEB053F9, 9 );
+  TEST_UIMM6_OP(  9, pv.sra.sci.h, 0xFFD5FFF8, 0xAA4AF03F, 9 );
+  TEST_UIMM6_OP( 10, pv.sra.sci.h, 0x00340008, 0x68E511A2, 9 );
+  # pv.sra.b
+  TEST_RR_OP( 11, pv.sra.b, 0x08FF00F9, 0x11F61B9D, 0x01050704 );
+  TEST_RR_OP( 12, pv.sra.b, 0xFCFCFFF3, 0x8FE3F89C, 0x05030603 );
+  TEST_RR_OP( 13, pv.sra.b, 0x0096FFA3, 0x0296E1A3, 0x02000600 );
+  # pv.sra.sc.b
+  TEST_RR_OP( 14, pv.sra.sc.b, 0x66F6E2A1, 0x66F6E2A1, 0x01050600 );
+  TEST_RR_OP( 15, pv.sra.sc.b, 0x0DFD1A13, 0x36F56B4D, 0x03040302 );
+  TEST_RR_OP( 16, pv.sra.sc.b, 0x00000201, 0x16135625, 0x01040205 );
+  # pv.sra.sci.b
+  TEST_UIMM6_OP( 17, pv.sra.sci.b, 0xFCFC02FD, 0x848B57AD, 5 );
+  TEST_UIMM6_OP( 18, pv.sra.sci.b, 0x02FE02FC, 0x40CD5290, 5 );
+  TEST_UIMM6_OP( 19, pv.sra.sci.b, 0x02FCFF01, 0x549FFD20, 5 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_srl.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_srl.S
new file mode 100644
index 000000000..47ebe0e49
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_srl.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_srl.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.srl instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.srl.h
+  TEST_RR_OP( 2, pv.srl.h, 0x06030067, 0xC076CE34, 0x00050009 );
+  TEST_RR_OP( 3, pv.srl.h, 0x00A40047, 0xA41723DF, 0x00080007 );
+  TEST_RR_OP( 4, pv.srl.h, 0x00142A49, 0x52ADA926, 0x000A0002 );
+  # pv.srl.sc.h
+  TEST_RR_OP( 5, pv.srl.sc.h, 0x1EE01053, 0xF706829F, 0x00080003 );
+  TEST_RR_OP( 6, pv.srl.sc.h, 0x00000001, 0x3BC79528, 0x000E000F );
+  TEST_RR_OP( 7, pv.srl.sc.h, 0x001001BE, 0x04236F94, 0x000D0006 );
+  # pv.srl.sci.h
+  TEST_UIMM6_OP(  8, pv.srl.sci.h, 0x00450077, 0x8AA9EF3A, 9 );
+  TEST_UIMM6_OP(  9, pv.srl.sci.h, 0x0049006B, 0x93A9D63A, 9 );
+  TEST_UIMM6_OP( 10, pv.srl.sci.h, 0x003F0040, 0x7E0D81AF, 9 );
+  # pv.srl.b
+  TEST_RR_OP( 11, pv.srl.b, 0x030C6A01, 0xFAC8D4F6, 0x06040107 );
+  TEST_RR_OP( 12, pv.srl.b, 0x07000105, 0x3F0B94B5, 0x03050705 );
+  TEST_RR_OP( 13, pv.srl.b, 0x00311065, 0x29C54065, 0x07020200 );
+  # pv.srl.sc.b
+  TEST_RR_OP( 14, pv.srl.sc.b, 0x3A37353E, 0x746E6A7C, 0x02010701 );
+  TEST_RR_OP( 15, pv.srl.sc.b, 0x0A080A0D, 0xAA82A5D6, 0x01030204 );
+  TEST_RR_OP( 16, pv.srl.sc.b, 0x6F5D6D75, 0xDEBBDAEB, 0x03040701 );
+  # pv.srl.sci.b
+  TEST_UIMM6_OP( 17, pv.srl.sci.b, 0x06010704, 0xCD2DE193, 5 );
+  TEST_UIMM6_OP( 18, pv.srl.sci.b, 0x00030507, 0x0B64B9E8, 5 );
+  TEST_UIMM6_OP( 19, pv.srl.sci.b, 0x02070306, 0x50E572CB, 5 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_sub.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sub.S
new file mode 100644
index 000000000..72336e0d9
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_sub.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_sub.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.sub instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.sub.h
+  TEST_RR_OP( 2, pv.sub.h, 0x21549541, 0xC037A04F, 0x9EE30B0E );
+  TEST_RR_OP( 3, pv.sub.h, 0x0A8F87AF, 0xA6011B6C, 0x9B7293BD );
+  TEST_RR_OP( 4, pv.sub.h, 0xE3DFCB44, 0x2BD6C73C, 0x47F7FBF8 );
+  # pv.sub.sc.h
+  TEST_RR_OP( 5, pv.sub.sc.h, 0x005D10C7, 0xA7DFB849, 0xA0DAA782 );
+  TEST_RR_OP( 6, pv.sub.sc.h, 0x915182F6, 0xEB54DCF9, 0xE0AB5A03 );
+  TEST_RR_OP( 7, pv.sub.sc.h, 0x8ADB0963, 0xC72645AE, 0x44033C4B );
+  # pv.sub.sci.h
+  TEST_SIMM6_OP(  8, pv.sub.sci.h, 0x5F6A01D4, 0x5F7501DF, 11 );
+  TEST_SIMM6_OP(  9, pv.sub.sci.h, 0xEDEBEE05, 0xEDF6EE10, 11 );
+  TEST_SIMM6_OP( 10, pv.sub.sci.h, 0x5254F633, 0x525FF63E, 11 );
+  # pv.sub.b
+  TEST_RR_OP( 11, pv.sub.b, 0xCFA312C4, 0xD6B51AA7, 0x071208E3 );
+  TEST_RR_OP( 12, pv.sub.b, 0x399B9FC4, 0x273CF552, 0xEEA1568E );
+  TEST_RR_OP( 13, pv.sub.b, 0x75B1BB23, 0x3E6DD37D, 0xC9BC185A );
+  # pv.sub.sc.b
+  TEST_RR_OP( 14, pv.sub.sc.b, 0x85D1880A, 0xCE1AD153, 0x11D9D249 );
+  TEST_RR_OP( 15, pv.sub.sc.b, 0xF53BE607, 0xB0F6A1C2, 0xA4990EBB );
+  TEST_RR_OP( 16, pv.sub.sc.b, 0xB890FAF0, 0x4A228C82, 0x3DEA1692 );
+  # pv.sub.sci.b
+  TEST_SIMM6_OP( 17, pv.sub.sci.b, 0x5282B987, 0x5D8DC492, 11 );
+  TEST_SIMM6_OP( 18, pv.sub.sci.b, 0x12D59C9F, 0x1DE0A7AA, 11 );
+  TEST_SIMM6_OP( 19, pv.sub.sci.b, 0x6C6D5D05, 0x77786810, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-simm6 instructions TEST_SIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-simm6 instructions TEST_SIMM6_DEST_BYPASS,
+  # TEST_SIMM6_SRC1_BYPASS, TEST_SIMM6_ZEROSRC1, TEST_SIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/rv32uxpulpimg/pv_xor.S b/apps/riscv-tests/isa/rv32uxpulpimg/pv_xor.S
new file mode 100644
index 000000000..75fee565c
--- /dev/null
+++ b/apps/riscv-tests/isa/rv32uxpulpimg/pv_xor.S
@@ -0,0 +1,74 @@
+# See LICENSE for license details.
+
+#*****************************************************************************
+# pv_xor.S
+#-----------------------------------------------------------------------------
+#
+# Test pv.xor instructions.
+#
+
+#include "riscv_test.h"
+#include "test_macros.h"
+
+RVTEST_RV32U
+RVTEST_CODE_BEGIN
+
+  #-------------------------------------------------------------
+  # Arithmetic tests
+  #-------------------------------------------------------------
+
+  # pv.xor.h
+  TEST_RR_OP( 2, pv.xor.h, 0x66F696DC, 0x5FCE4AD5, 0x3938DC09 );
+  TEST_RR_OP( 3, pv.xor.h, 0x58A5BD3D, 0x672A5F61, 0x3F8FE25C );
+  TEST_RR_OP( 4, pv.xor.h, 0x339E302C, 0xE468E8F4, 0xD7F6D8D8 );
+  # pv.xor.sc.h
+  TEST_RR_OP( 5, pv.xor.sc.h, 0x5FB150BC, 0xC4A5CBA8, 0x43CC9B14 );
+  TEST_RR_OP( 6, pv.xor.sc.h, 0x48030479, 0xD7F09B8A, 0xCB019FF3 );
+  TEST_RR_OP( 7, pv.xor.sc.h, 0x0465D51A, 0x40CF91B0, 0x55DB44AA );
+  # pv.xor.sci.h
+  TEST_UIMM6_OP(  8, pv.xor.sci.h, 0x0F43E04C, 0x0F48E047, 11 );
+  TEST_UIMM6_OP(  9, pv.xor.sci.h, 0xEC22101C, 0xEC291017, 11 );
+  TEST_UIMM6_OP( 10, pv.xor.sci.h, 0x137F208C, 0x13742087, 11 );
+  # pv.xor.b
+  TEST_RR_OP( 11, pv.xor.b, 0x6A9EC5B4, 0x13518603, 0x79CF43B7 );
+  TEST_RR_OP( 12, pv.xor.b, 0xEE0CDAEA, 0x59CAB02D, 0xB7C66AC7 );
+  TEST_RR_OP( 13, pv.xor.b, 0x5B6E4CC1, 0x8B61A064, 0xD00FECA5 );
+  # pv.xor.sc.b
+  TEST_RR_OP( 14, pv.xor.sc.b, 0x40CF2054, 0x0F806F1B, 0x7CD0414F );
+  TEST_RR_OP( 15, pv.xor.sc.b, 0x89E5AA00, 0x127E319B, 0xC919409B );
+  TEST_RR_OP( 16, pv.xor.sc.b, 0xFC7E17F9, 0xAE2C45AB, 0xB9254252 );
+  # pv.xor.sci.b
+  TEST_UIMM6_OP( 17, pv.xor.sci.b, 0x2D2D131C, 0x26261817, 11 );
+  TEST_UIMM6_OP( 18, pv.xor.sci.b, 0x23EC42D8, 0x28E749D3, 11 );
+  TEST_UIMM6_OP( 19, pv.xor.sci.b, 0xCAA811C9, 0xC1A31AC2, 11 );
+
+  #-------------------------------------------------------------
+  # Source/Destination tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_SRC1_EQ_DEST,
+  # TEST_RR_SRC2_EQ_DEST, TEST_RR_SRC12_EQ_DEST
+  # for register-uimm6 instructions TEST_UIMM6_SRC1_EQ_DEST
+
+  #-------------------------------------------------------------
+  # Bypassing tests
+  #-------------------------------------------------------------
+
+  # TODO(smazzola):
+  # for register-register instructions TEST_RR_DEST_BYPASS,
+  # TEST_RR_SRC12_BYPASS, TEST_RR_SRC21_BYPASS, TEST_RR_ZEROSRC1,
+  # TEST_RR_ZEROSRC2, TEST_RR_ZEROSRC12, TEST_RR_ZERODEST
+  # for register-uimm6 instructions TEST_UIMM6_DEST_BYPASS,
+  # TEST_UIMM6_SRC1_BYPASS, TEST_UIMM6_ZEROSRC1, TEST_UIMM6_ZERODEST
+
+  TEST_PASSFAIL
+
+RVTEST_CODE_END
+
+  .data
+RVTEST_DATA_BEGIN
+
+  TEST_DATA
+
+RVTEST_DATA_END
diff --git a/apps/riscv-tests/isa/snitch_isa.mk b/apps/riscv-tests/isa/snitch_isa.mk
index 3a571c2e1..c914a0a8b 100644
--- a/apps/riscv-tests/isa/snitch_isa.mk
+++ b/apps/riscv-tests/isa/snitch_isa.mk
@@ -33,6 +33,12 @@ rv32um_snitch_sc_tests = \
 ifeq ($(xpulpimg),1)
 
 	rv32uxpulpimg_snitch_sc_tests = \
+		p_lb_irpost p_lbu_irpost p_lh_irpost p_lhu_irpost p_lw_irpost \
+		p_lb_rrpost p_lbu_rrpost p_lh_rrpost p_lhu_rrpost p_lw_rrpost \
+		p_lb_rr p_lbu_rr p_lh_rr p_lhu_rr p_lw_rr \
+		p_sb_irpost p_sh_irpost p_sw_irpost \
+		p_sb_rrpost p_sh_rrpost p_sw_rrpost \
+		p_sb_rr p_sh_rr p_sw_rr \
 		p_abs \
 		p_slet p_sletu \
 		p_min p_minu \
@@ -42,6 +48,28 @@ ifeq ($(xpulpimg),1)
 		p_clip p_clipu \
 		p_clipr p_clipur \
 		p_beqimm p_bneimm \
+		p_mac p_msu \
+		pv_add \
+		pv_sub \
+		pv_avg pv_avgu \
+		pv_min pv_minu \
+		pv_max pv_maxu \
+		pv_srl \
+		pv_sra \
+		pv_sll \
+		pv_or \
+		pv_xor \
+		pv_and \
+		pv_abs \
+		pv_extract pv_extractu \
+		pv_insert \
+		pv_dotup \
+		pv_dotusp \
+		pv_dotsp \
+		pv_sdotup \
+		pv_sdotusp \
+		pv_sdotsp \
+		pv_shuffle2 \
 
 endif
 
diff --git a/apps/sleep_wakeup/main.c b/apps/sleep_wakeup/main.c
index f4b9bfa00..308a68689 100644
--- a/apps/sleep_wakeup/main.c
+++ b/apps/sleep_wakeup/main.c
@@ -6,7 +6,7 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-volatile uint32_t atomic __attribute__((section(".l2"))) = -1;
+volatile uint32_t atomic __attribute__((section(".l2"))) = (uint32_t)-1;
 
 extern volatile uint32_t tcdm_start_address_reg;
 extern volatile uint32_t tcdm_end_address_reg;
diff --git a/hardware/Makefile b/hardware/Makefile
index 836d7b457..941ae8df8 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -215,7 +215,7 @@ pre_trace:
 
 post_trace:
 	mkdir -p "$(result_dir)"
-	cp $(buildpath)/transcript "$(result_dir)/"
+	cp $(buildpath)/transcript "$(result_dir)/" || true # best-effort copy: a cp failure must not abort post_trace
 	cp $(traceresult) "$(result_dir)"
 	cp $(trace) "$(result_dir)"
 
diff --git a/hardware/deps/snitch/src/mempool_cc.sv b/hardware/deps/snitch/src/mempool_cc.sv
index b6244db67..14f3efab7 100644
--- a/hardware/deps/snitch/src/mempool_cc.sv
+++ b/hardware/deps/snitch/src/mempool_cc.sv
@@ -255,8 +255,8 @@ module mempool_cc #(
     "acc_pid":      i_snitch.acc_pid_i,
     "acc_pdata_32": i_snitch.acc_pdata_i[31:0],
     // FPU offload
-    "fpu_offload":  (i_snitch.acc_qready_i && i_snitch.acc_qvalid_o && !snitch_pkg::shared_offload(i_snitch.acc_qdata_op_o)),
-    "is_seq_insn":  (i_snitch.inst_data_i ==? riscv_instr::FREP)
+    "fpu_offload":  1'b0,
+    "is_seq_insn":  1'b0
   };
 
   task fmt_extras (
@@ -321,8 +321,8 @@ module mempool_cc #(
           extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "acc_pid",     i_snitch.acc_pid_i,);
           extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "acc_pdata_32",i_snitch.acc_pdata_i[31:0],);
           // FPU offload
-          extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "fpu_offload", (i_snitch.acc_qready_i && i_snitch.acc_qvalid_o && !snitch_pkg::shared_offload(i_snitch.acc_qdata_op_o)),);
-          extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "is_seq_insn", (i_snitch.inst_data_i ==? riscv_instr::FREP));
+          extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "fpu_offload", 1'b0);
+          extras_str = $sformatf("%s'%s': 0x%8x, ", extras_str, "is_seq_insn", 1'b0);
           extras_str = $sformatf("%s}", extras_str);
 `else
           fmt_extras(extras_snitch, extras_str);
diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv
index f71786d1f..148549b0a 100644
--- a/hardware/deps/snitch/src/snitch.sv
+++ b/hardware/deps/snitch/src/snitch.sv
@@ -81,7 +81,7 @@ module snitch #(
 );
 
   localparam int RegWidth = RVE ? 4 : 5;
-  localparam int RegNrReadPorts  = 2;
+  localparam int RegNrReadPorts = snitch_pkg::XPULPIMG ? 3 : 2;
 
   logic illegal_inst;
   logic zero_lsb;
@@ -118,7 +118,7 @@ module snitch #(
   logic [2**RegWidth-1:0]                   sb_d, sb_q;
 
   // Load/Store Defines
-  logic is_load, is_store, is_signed;
+  logic is_load, is_store, is_signed, is_postincr;
   logic is_fp_load, is_fp_store;
   logic ls_misaligned;
   logic ld_addr_misaligned;
@@ -150,8 +150,10 @@ module snitch #(
   logic lsu_qready, lsu_qvalid;
   logic lsu_pvalid, lsu_pready;
   logic [RegWidth-1:0] lsu_rd;
+  logic [31:0] lsu_qaddr;
 
   logic retire_load; // retire a load instruction
+  logic retire_p; // retire from post-increment instructions
   logic retire_i; // retire the rest of the base instruction set
   logic retire_acc; // retire an instruction we offloaded
 
@@ -175,11 +177,13 @@ module snitch #(
   } alu_op;
 
   enum logic [3:0] {
-    None, Reg, IImmediate, UImmediate, JImmediate, SImmediate, SFImmediate, PC, CSR, CSRImmediate, PBImmediate
-  } opa_select, opb_select;
+    None, Reg, IImmediate, UImmediate, JImmediate, SImmediate, SFImmediate, PC, CSR, CSRImmediate, PBImmediate, RegRd, RegRs2
+  } opa_select, opb_select, opc_select;
 
-  logic write_rd; // write desitnation this cycle
+  logic write_rd; // write rd destination this cycle
   logic uses_rd;
+  logic write_rs1; // write rs1 destination this cycle
+  logic uses_rs1;
   enum logic [1:0] {Consec, Alu, Exception} next_pc;
 
   enum logic [1:0] {RdAlu, RdConsecPC, RdBypass} rd_select;
@@ -217,7 +221,7 @@ module snitch #(
   assign acc_qdata_op_o = inst_data_i;
   assign acc_qdata_arga_o = {{32{gpr_rdata[0][31]}}, gpr_rdata[0]};
   assign acc_qdata_argb_o = {{32{gpr_rdata[1][31]}}, gpr_rdata[1]};
-  assign acc_qdata_argc_o = {32'b0, alu_result};
+  assign acc_qdata_argc_o = {{32{gpr_rdata[2][31]}}, gpr_rdata[2]};
 
   // instruction fetch interface
   assign inst_addr_o = pc_q;
@@ -229,7 +233,8 @@ module snitch #(
   // Scoreboard: Keep track of rd dependencies (only loads at the moment)
   logic operands_ready;
   logic dst_ready;
-  logic opa_ready, opb_ready;
+  logic opa_ready, opb_ready, opc_ready;
+  logic dstrd_ready, dstrs1_ready;
 
   always_comb begin
     sb_d = sb_q;
@@ -241,11 +246,14 @@ module snitch #(
   end
   // TODO(zarubaf): This can probably be described a bit more efficient
   assign opa_ready = (opa_select != Reg) | ~sb_q[rs1];
-  assign opb_ready = (opb_select != Reg & opb_select != SImmediate) | ~sb_q[rs2];
-  assign operands_ready = opa_ready & opb_ready;
+  assign opb_ready = ((opb_select != Reg & opb_select != SImmediate) | ~sb_q[rs2]) & ((opb_select != RegRd) | ~sb_q[rd]);
+  assign opc_ready = ((opc_select != Reg) | ~sb_q[rd]) & ((opc_select != RegRs2) | ~sb_q[rs2]);
+  assign operands_ready = opa_ready & opb_ready & opc_ready;
   // either we are not using the destination register or we need to make
   // sure that its destination operand is not marked busy in the scoreboard.
-  assign dst_ready = ~uses_rd | (uses_rd & ~sb_q[rd]);
+  assign dstrd_ready = ~uses_rd | (uses_rd & ~sb_q[rd]);
+  assign dstrs1_ready = ~uses_rs1 | (uses_rs1 & ~sb_q[rs1]);
+  assign dst_ready = dstrd_ready & dstrs1_ready;
 
   assign valid_instr = (inst_ready_i & inst_valid_o) & operands_ready & dst_ready;
   // the accelerator interface stalled us
@@ -284,14 +292,18 @@ module snitch #(
     alu_op = Add;
     opa_select = None;
     opb_select = None;
+    opc_select = None;
 
     next_pc = Consec;
 
+    // set up rd destination
     rd_select = RdAlu;
     write_rd = 1'b1;
-    // if we are writing the field this cycle we need
-    // an int destination register
+    // if we are writing the field this cycle we need an int destination register
     uses_rd = write_rd;
+    // set up rs1 destination
+    write_rs1 = 1'b0;
+    uses_rs1 = write_rs1;
 
     rd_bypass = '0;
     zero_lsb = 1'b0;
@@ -299,6 +311,7 @@ module snitch #(
     // LSU interface
     is_load = 1'b0;
     is_store = 1'b0;
+    is_postincr = 1'b0;
     is_fp_load = 1'b0;
     is_fp_store = 1'b0;
     is_signed = 1'b0;
@@ -748,29 +761,324 @@ module snitch #(
       end
 
 /* Xpulpimg extension */
-      // Off-load to IPU coprocessor
-      riscv_instr::P_ABS,          // Xpulpimg: p.abs
-      riscv_instr::P_SLET,         // Xpulpimg: p.slet
-      riscv_instr::P_SLETU,        // Xpulpimg: p.sletu
-      riscv_instr::P_MIN,          // Xpulpimg: p.min
-      riscv_instr::P_MINU,         // Xpulpimg: p.minu
-      riscv_instr::P_MAX,          // Xpulpimg: p.max
-      riscv_instr::P_MAXU,         // Xpulpimg: p.maxu
-      riscv_instr::P_EXTHS,        // Xpulpimg: p.exths
-      riscv_instr::P_EXTHZ,        // Xpulpimg: p.exthz
-      riscv_instr::P_EXTBS,        // Xpulpimg: p.extbs
-      riscv_instr::P_EXTBZ,        // Xpulpimg: p.extbz
-      riscv_instr::P_CLIP,         // Xpulpimg: p.clip
-      riscv_instr::P_CLIPU,        // Xpulpimg: p.clipu
-      riscv_instr::P_CLIPR,        // Xpulpimg: p.clipr
-      riscv_instr::P_CLIPUR: begin // Xpulpimg: p.clipur
+      // Post-increment loads/stores
+      riscv_instr::P_LB_IRPOST: begin // Xpulpimg: p.lb rd,iimm(rs1!)
         if (snitch_pkg::XPULPIMG) begin
           write_rd = 1'b0;
           uses_rd = 1'b1;
-          acc_qvalid_o = valid_instr;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          opa_select = Reg;
+          opb_select = IImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LBU_IRPOST: begin // Xpulpimg: p.lbu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          opa_select = Reg;
+          opb_select = IImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LH_IRPOST: begin  // Xpulpimg: p.lh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = IImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LHU_IRPOST: begin // Xpulpimg: p.lhu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = IImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LW_IRPOST: begin  // Xpulpimg: p.lw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = IImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LB_RRPOST: begin  // Xpulpimg: p.lb rd,rs2(rs1!)
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LBU_RRPOST: begin // Xpulpimg: p.lbu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LH_RRPOST: begin  // Xpulpimg: p.lh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LHU_RRPOST: begin // Xpulpimg: p.lhu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LW_RRPOST: begin  // Xpulpimg: p.lw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          write_rs1 = 1'b1;
+          is_load = 1'b1;
+          is_postincr = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LB_RR: begin      // Xpulpimg: p.lb rd,rs2(rs1)
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
           opa_select = Reg;
           opb_select = Reg;
-          acc_register_rd = 1'b1;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LBU_RR: begin     // Xpulpimg: p.lbu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LH_RR: begin      // Xpulpimg: p.lh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LHU_RR: begin     // Xpulpimg: p.lhu
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_LW_RR: begin      // Xpulpimg: p.lw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SB_IRPOST: begin  // Xpulpimg: p.sb rs2,simm(rs1!)
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          opa_select = Reg;
+          opb_select = SImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SH_IRPOST: begin  // Xpulpimg: p.sh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = SImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SW_IRPOST: begin  // Xpulpimg: p.sw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = SImmediate;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      // opb normally carries the content of rs2. In stores with reg-reg
+      // addressing, however, the offset register (rs3) is encoded in the rd
+      // field, so rd is routed to opb and rs2 (the store data) to opc. Crossing
+      // the operands here avoids having to do it in the ALU with bigger muxes.
+      riscv_instr::P_SB_RRPOST: begin  // Xpulpimg: p.sb rs2,rs3(rs1!)
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          opa_select = Reg; // rs1 base address
+          opb_select = RegRd; // rs3 (i.e. rd) offset
+          opc_select = RegRs2; // rs2 source data
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SH_RRPOST: begin  // Xpulpimg: p.sh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = RegRd;
+          opc_select = RegRs2;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SW_RRPOST: begin  // Xpulpimg: p.sw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          write_rs1 = 1'b1;
+          is_store = 1'b1;
+          is_postincr = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = RegRd;
+          opc_select = RegRs2;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SB_RR: begin      // Xpulpimg: p.sb rs2,rs3(rs1)
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          is_store = 1'b1;
+          opa_select = Reg;
+          opb_select = RegRd;
+          opc_select = RegRs2;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SH_RR: begin      // Xpulpimg: p.sh
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          is_store = 1'b1;
+          ls_size = HalfWord;
+          opa_select = Reg;
+          opb_select = RegRd;
+          opc_select = RegRs2;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      riscv_instr::P_SW_RR: begin      // Xpulpimg: p.sw
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0;
+          is_store = 1'b1;
+          ls_size = Word;
+          opa_select = Reg;
+          opb_select = RegRd;
+          opc_select = RegRs2;
         end else begin
           illegal_inst = 1'b1;
         end
@@ -798,6 +1106,202 @@ module snitch #(
           illegal_inst = 1'b1;
         end
       end
+      // Off-load to IPU coprocessor
+      // 1 source register (rs1)
+      riscv_instr::P_ABS,                // Xpulpimg: p.abs
+      riscv_instr::P_EXTHS,              // Xpulpimg: p.exths
+      riscv_instr::P_EXTHZ,              // Xpulpimg: p.exthz
+      riscv_instr::P_EXTBS,              // Xpulpimg: p.extbs
+      riscv_instr::P_EXTBZ,              // Xpulpimg: p.extbz
+      riscv_instr::P_CLIP,               // Xpulpimg: p.clip
+      riscv_instr::P_CLIPU,              // Xpulpimg: p.clipu
+      riscv_instr::PV_ADD_SCI_H,         // Xpulpimg: pv.add.sci.h
+      riscv_instr::PV_ADD_SCI_B,         // Xpulpimg: pv.add.sci.b
+      riscv_instr::PV_SUB_SCI_H,         // Xpulpimg: pv.sub.sci.h
+      riscv_instr::PV_SUB_SCI_B,         // Xpulpimg: pv.sub.sci.b
+      riscv_instr::PV_AVG_SCI_H,         // Xpulpimg: pv.avg.sci.h
+      riscv_instr::PV_AVG_SCI_B,         // Xpulpimg: pv.avg.sci.b
+      riscv_instr::PV_AVGU_SCI_H,        // Xpulpimg: pv.avgu.sci.h
+      riscv_instr::PV_AVGU_SCI_B,        // Xpulpimg: pv.avgu.sci.b
+      riscv_instr::PV_MIN_SCI_H,         // Xpulpimg: pv.min.sci.h
+      riscv_instr::PV_MIN_SCI_B,         // Xpulpimg: pv.min.sci.b
+      riscv_instr::PV_MINU_SCI_H,        // Xpulpimg: pv.minu.sci.h
+      riscv_instr::PV_MINU_SCI_B,        // Xpulpimg: pv.minu.sci.b
+      riscv_instr::PV_MAX_SCI_H,         // Xpulpimg: pv.max.sci.h
+      riscv_instr::PV_MAX_SCI_B,         // Xpulpimg: pv.max.sci.b
+      riscv_instr::PV_MAXU_SCI_H,        // Xpulpimg: pv.maxu.sci.h
+      riscv_instr::PV_MAXU_SCI_B,        // Xpulpimg: pv.maxu.sci.b
+      riscv_instr::PV_SRL_SCI_H,         // Xpulpimg: pv.srl.sci.h
+      riscv_instr::PV_SRL_SCI_B,         // Xpulpimg: pv.srl.sci.b
+      riscv_instr::PV_SRA_SCI_H,         // Xpulpimg: pv.sra.sci.h
+      riscv_instr::PV_SRA_SCI_B,         // Xpulpimg: pv.sra.sci.b
+      riscv_instr::PV_SLL_SCI_H,         // Xpulpimg: pv.sll.sci.h
+      riscv_instr::PV_SLL_SCI_B,         // Xpulpimg: pv.sll.sci.b
+      riscv_instr::PV_OR_SCI_H,          // Xpulpimg: pv.or.sci.h
+      riscv_instr::PV_OR_SCI_B,          // Xpulpimg: pv.or.sci.b
+      riscv_instr::PV_XOR_SCI_H,         // Xpulpimg: pv.xor.sci.h
+      riscv_instr::PV_XOR_SCI_B,         // Xpulpimg: pv.xor.sci.b
+      riscv_instr::PV_AND_SCI_B,         // Xpulpimg: pv.and.sci.b
+      riscv_instr::PV_AND_SCI_H,         // Xpulpimg: pv.and.sci.h
+      riscv_instr::PV_ABS_H,             // Xpulpimg: pv.abs.h
+      riscv_instr::PV_ABS_B,             // Xpulpimg: pv.abs.b
+      riscv_instr::PV_EXTRACT_H,         // Xpulpimg: pv.extract.h
+      riscv_instr::PV_EXTRACT_B,         // Xpulpimg: pv.extract.b
+      riscv_instr::PV_EXTRACTU_H,        // Xpulpimg: pv.extractu.h
+      riscv_instr::PV_EXTRACTU_B,        // Xpulpimg: pv.extractu.b
+      riscv_instr::PV_DOTUP_SCI_H,       // Xpulpimg: pv.dotup.sci.h
+      riscv_instr::PV_DOTUP_SCI_B,       // Xpulpimg: pv.dotup.sci.b
+      riscv_instr::PV_DOTUSP_SCI_H,      // Xpulpimg: pv.dotusp.sci.h
+      riscv_instr::PV_DOTUSP_SCI_B,      // Xpulpimg: pv.dotusp.sci.b
+      riscv_instr::PV_DOTSP_SCI_H,       // Xpulpimg: pv.dotsp.sci.h
+      riscv_instr::PV_DOTSP_SCI_B: begin // Xpulpimg: pv.dotsp.sci.b
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0; // keeps retire_i low: the core's own retire path does not write rd
+          uses_rd = 1'b1;
+          acc_qvalid_o = valid_instr; // raise the offload request only for a valid instruction
+          opa_select = Reg; // operand A from the register file (rs1 per the group header)
+          acc_register_rd = 1'b1; // NOTE(review): presumably marks rd as awaiting the offloaded result - confirm
+        end else begin
+          illegal_inst = 1'b1; // Xpulpimg disabled: flag the encoding as illegal
+        end
+      end
+      // 2 source registers (rs1, rs2)
+      riscv_instr::P_SLET,              // Xpulpimg: p.slet
+      riscv_instr::P_SLETU,             // Xpulpimg: p.sletu
+      riscv_instr::P_MIN,               // Xpulpimg: p.min
+      riscv_instr::P_MINU,              // Xpulpimg: p.minu
+      riscv_instr::P_MAX,               // Xpulpimg: p.max
+      riscv_instr::P_MAXU,              // Xpulpimg: p.maxu
+      riscv_instr::P_CLIPR,             // Xpulpimg: p.clipr
+      riscv_instr::P_CLIPUR,            // Xpulpimg: p.clipur
+      riscv_instr::PV_ADD_H,            // Xpulpimg: pv.add.h
+      riscv_instr::PV_ADD_SC_H,         // Xpulpimg: pv.add.sc.h
+      riscv_instr::PV_ADD_B,            // Xpulpimg: pv.add.b
+      riscv_instr::PV_ADD_SC_B,         // Xpulpimg: pv.add.sc.b
+      riscv_instr::PV_SUB_H,            // Xpulpimg: pv.sub.h
+      riscv_instr::PV_SUB_SC_H,         // Xpulpimg: pv.sub.sc.h
+      riscv_instr::PV_SUB_B,            // Xpulpimg: pv.sub.b
+      riscv_instr::PV_SUB_SC_B,         // Xpulpimg: pv.sub.sc.b
+      riscv_instr::PV_AVG_H,            // Xpulpimg: pv.avg.h
+      riscv_instr::PV_AVG_SC_H,         // Xpulpimg: pv.avg.sc.h
+      riscv_instr::PV_AVG_B,            // Xpulpimg: pv.avg.b
+      riscv_instr::PV_AVG_SC_B,         // Xpulpimg: pv.avg.sc.b
+      riscv_instr::PV_AVGU_H,           // Xpulpimg: pv.avgu.h
+      riscv_instr::PV_AVGU_SC_H,        // Xpulpimg: pv.avgu.sc.h
+      riscv_instr::PV_AVGU_B,           // Xpulpimg: pv.avgu.b
+      riscv_instr::PV_AVGU_SC_B,        // Xpulpimg: pv.avgu.sc.b
+      riscv_instr::PV_MIN_H,            // Xpulpimg: pv.min.h
+      riscv_instr::PV_MIN_SC_H,         // Xpulpimg: pv.min.sc.h
+      riscv_instr::PV_MIN_B,            // Xpulpimg: pv.min.b
+      riscv_instr::PV_MIN_SC_B,         // Xpulpimg: pv.min.sc.b
+      riscv_instr::PV_MINU_H,           // Xpulpimg: pv.minu.h
+      riscv_instr::PV_MINU_SC_H,        // Xpulpimg: pv.minu.sc.h
+      riscv_instr::PV_MINU_B,           // Xpulpimg: pv.minu.b
+      riscv_instr::PV_MINU_SC_B,        // Xpulpimg: pv.minu.sc.b
+      riscv_instr::PV_MAX_H,            // Xpulpimg: pv.max.h
+      riscv_instr::PV_MAX_SC_H,         // Xpulpimg: pv.max.sc.h
+      riscv_instr::PV_MAX_B,            // Xpulpimg: pv.max.b
+      riscv_instr::PV_MAX_SC_B,         // Xpulpimg: pv.max.sc.b
+      riscv_instr::PV_MAXU_H,           // Xpulpimg: pv.maxu.h
+      riscv_instr::PV_MAXU_SC_H,        // Xpulpimg: pv.maxu.sc.h
+      riscv_instr::PV_MAXU_B,           // Xpulpimg: pv.maxu.b
+      riscv_instr::PV_MAXU_SC_B,        // Xpulpimg: pv.maxu.sc.b
+      riscv_instr::PV_SRL_H,            // Xpulpimg: pv.srl.h
+      riscv_instr::PV_SRL_SC_H,         // Xpulpimg: pv.srl.sc.h
+      riscv_instr::PV_SRL_B,            // Xpulpimg: pv.srl.b
+      riscv_instr::PV_SRL_SC_B,         // Xpulpimg: pv.srl.sc.b
+      riscv_instr::PV_SRA_H,            // Xpulpimg: pv.sra.h
+      riscv_instr::PV_SRA_SC_H,         // Xpulpimg: pv.sra.sc.h
+      riscv_instr::PV_SRA_B,            // Xpulpimg: pv.sra.b
+      riscv_instr::PV_SRA_SC_B,         // Xpulpimg: pv.sra.sc.b
+      riscv_instr::PV_SLL_H,            // Xpulpimg: pv.sll.h
+      riscv_instr::PV_SLL_SC_H,         // Xpulpimg: pv.sll.sc.h
+      riscv_instr::PV_SLL_B,            // Xpulpimg: pv.sll.b
+      riscv_instr::PV_SLL_SC_B,         // Xpulpimg: pv.sll.sc.b
+      riscv_instr::PV_OR_H,             // Xpulpimg: pv.or.h
+      riscv_instr::PV_OR_SC_H,          // Xpulpimg: pv.or.sc.h
+      riscv_instr::PV_OR_B,             // Xpulpimg: pv.or.b
+      riscv_instr::PV_OR_SC_B,          // Xpulpimg: pv.or.sc.b
+      riscv_instr::PV_XOR_H,            // Xpulpimg: pv.xor.h
+      riscv_instr::PV_XOR_SC_H,         // Xpulpimg: pv.xor.sc.h
+      riscv_instr::PV_XOR_B,            // Xpulpimg: pv.xor.b
+      riscv_instr::PV_XOR_SC_B,         // Xpulpimg: pv.xor.sc.b
+      riscv_instr::PV_AND_H,            // Xpulpimg: pv.and.h
+      riscv_instr::PV_AND_SC_H,         // Xpulpimg: pv.and.sc.h
+      riscv_instr::PV_AND_B,            // Xpulpimg: pv.and.b
+      riscv_instr::PV_AND_SC_B,         // Xpulpimg: pv.and.sc.b
+      riscv_instr::PV_DOTUP_H,          // Xpulpimg: pv.dotup.h
+      riscv_instr::PV_DOTUP_SC_H,       // Xpulpimg: pv.dotup.sc.h
+      riscv_instr::PV_DOTUP_B,          // Xpulpimg: pv.dotup.b
+      riscv_instr::PV_DOTUP_SC_B,       // Xpulpimg: pv.dotup.sc.b
+      riscv_instr::PV_DOTUSP_H,         // Xpulpimg: pv.dotusp.h
+      riscv_instr::PV_DOTUSP_SC_H,      // Xpulpimg: pv.dotusp.sc.h
+      riscv_instr::PV_DOTUSP_B,         // Xpulpimg: pv.dotusp.b
+      riscv_instr::PV_DOTUSP_SC_B,      // Xpulpimg: pv.dotusp.sc.b
+      riscv_instr::PV_DOTSP_H,          // Xpulpimg: pv.dotsp.h
+      riscv_instr::PV_DOTSP_SC_H,       // Xpulpimg: pv.dotsp.sc.h
+      riscv_instr::PV_DOTSP_B,          // Xpulpimg: pv.dotsp.b
+      riscv_instr::PV_DOTSP_SC_B: begin // Xpulpimg: pv.dotsp.sc.b
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0; // keeps retire_i low: the core's own retire path does not write rd
+          uses_rd = 1'b1;
+          acc_qvalid_o = valid_instr; // raise the offload request only for a valid instruction
+          opa_select = Reg; // operand A from the register file (rs1 per the group header)
+          opb_select = Reg; // operand B from the register file (rs2 per the group header)
+          acc_register_rd = 1'b1; // NOTE(review): presumably marks rd as awaiting the offloaded result - confirm
+        end else begin
+          illegal_inst = 1'b1; // Xpulpimg disabled: flag the encoding as illegal
+        end
+      end
+      // 2 source registers (rs1, rd)
+      riscv_instr::PV_INSERT_H,           // Xpulpimg: pv.insert.h
+      riscv_instr::PV_INSERT_B,           // Xpulpimg: pv.insert.b
+      riscv_instr::PV_SDOTUP_SCI_H,       // Xpulpimg: pv.sdotup.sci.h
+      riscv_instr::PV_SDOTUP_SCI_B,       // Xpulpimg: pv.sdotup.sci.b
+      riscv_instr::PV_SDOTUSP_SCI_H,      // Xpulpimg: pv.sdotusp.sci.h
+      riscv_instr::PV_SDOTUSP_SCI_B,      // Xpulpimg: pv.sdotusp.sci.b
+      riscv_instr::PV_SDOTSP_SCI_H,       // Xpulpimg: pv.sdotsp.sci.h
+      riscv_instr::PV_SDOTSP_SCI_B: begin // Xpulpimg: pv.sdotsp.sci.b
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0; // keeps retire_i low: the core's own retire path does not write rd
+          uses_rd = 1'b1;
+          acc_qvalid_o = valid_instr; // raise the offload request only for a valid instruction
+          opa_select = Reg; // operand A from the register file (rs1 per the group header)
+          opc_select = Reg; // operand C from the register file (rd read as a source per the group header)
+          acc_register_rd = 1'b1; // NOTE(review): presumably marks rd as awaiting the offloaded result - confirm
+        end else begin
+          illegal_inst = 1'b1; // Xpulpimg disabled: flag the encoding as illegal
+        end
+      end
+      // 3 source registers (rs1, rs2, rd)
+      riscv_instr::P_MAC,                // Xpulpimg: p.mac
+      riscv_instr::P_MSU,                // Xpulpimg: p.msu
+      riscv_instr::PV_SDOTUP_H,          // Xpulpimg: pv.sdotup.h
+      riscv_instr::PV_SDOTUP_SC_H,       // Xpulpimg: pv.sdotup.sc.h
+      riscv_instr::PV_SDOTUP_B,          // Xpulpimg: pv.sdotup.b
+      riscv_instr::PV_SDOTUP_SC_B,       // Xpulpimg: pv.sdotup.sc.b
+      riscv_instr::PV_SDOTUSP_H,         // Xpulpimg: pv.sdotusp.h
+      riscv_instr::PV_SDOTUSP_SC_H,      // Xpulpimg: pv.sdotusp.sc.h
+      riscv_instr::PV_SDOTUSP_B,         // Xpulpimg: pv.sdotusp.b
+      riscv_instr::PV_SDOTUSP_SC_B,      // Xpulpimg: pv.sdotusp.sc.b
+      riscv_instr::PV_SDOTSP_H,          // Xpulpimg: pv.sdotsp.h
+      riscv_instr::PV_SDOTSP_SC_H,       // Xpulpimg: pv.sdotsp.sc.h
+      riscv_instr::PV_SDOTSP_B,          // Xpulpimg: pv.sdotsp.b
+      riscv_instr::PV_SDOTSP_SC_B,       // Xpulpimg: pv.sdotsp.sc.b
+      riscv_instr::PV_SHUFFLE2_H,        // Xpulpimg: pv.shuffle2.h
+      riscv_instr::PV_SHUFFLE2_B: begin  // Xpulpimg: pv.shuffle2.b
+        if (snitch_pkg::XPULPIMG) begin
+          write_rd = 1'b0; // keeps retire_i low: the core's own retire path does not write rd
+          uses_rd = 1'b1;
+          acc_qvalid_o = valid_instr; // raise the offload request only for a valid instruction
+          opa_select = Reg; // operand A from the register file (rs1 per the group header)
+          opb_select = Reg; // operand B from the register file (rs2 per the group header)
+          opc_select = Reg; // operand C from the register file (rd read as a source per the group header)
+          acc_register_rd = 1'b1; // NOTE(review): presumably marks rd as awaiting the offloaded result - confirm
+        end else begin
+          illegal_inst = 1'b1; // Xpulpimg disabled: flag the encoding as illegal
+        end
+      end
 /* end of Xpulpimg extension */
 
       // TODO(zarubaf): Illegal Instructions
@@ -810,6 +1314,8 @@ module snitch #(
     if (exception) begin
      write_rd = 1'b0;
      uses_rd = 1'b0;
+     write_rs1 = 1'b0; // also cancel the post-increment write-back to rs1 (forces retire_p low)
+     uses_rs1 = 1'b0; // mirrors the uses_rd override above for the rs1 write-back path
      acc_qvalid_o = 1'b0;
      next_pc = Exception;
     end
@@ -893,12 +1399,17 @@ module snitch #(
       PC: opb = pc_q;
       CSR: opb = csr_rvalue;
       PBImmediate: opb = pbimm;
+      RegRd: opb = gpr_rdata[2];
       default: opb = '0;
     endcase
   end
 
   assign gpr_raddr[0] = rs1;
   assign gpr_raddr[1] = rs2;
+  // third read port (only if present) reads RF[rd]; its data feeds the RegRd opb selection
+  if (RegNrReadPorts >= 3) begin : gpr_raddr_2
+    assign gpr_raddr[2] = rd; // NOTE(review): gpr_rdata[2] is read even when this is not generated - confirm
+  end
 
   // --------------------
   // ALU
@@ -998,7 +1509,7 @@ module snitch #(
     .lsu_qtag_i   ( rd                    ),
     .lsu_qwrite   ( is_store              ),
     .lsu_qsigned  ( is_signed             ),
-    .lsu_qaddr_i  ( alu_result            ),
+    .lsu_qaddr_i  ( lsu_qaddr             ),
     .lsu_qdata_i  ( gpr_rdata[1]          ),
     .lsu_qsize_i  ( ls_size               ),
     .lsu_qamo_i   ( ls_amo                ),
@@ -1022,8 +1533,18 @@ module snitch #(
     .data_pready_o
   );
 
+  // LSU address: rs1 (gpr_rdata[0]) for post-increment load/stores, the ALU result otherwise
+  assign lsu_qaddr = is_postincr ? gpr_rdata[0] : alu_result;
+
   assign lsu_qvalid = valid_instr & (is_load | is_store) & ~(ld_addr_misaligned | st_addr_misaligned);
 
+  // NOTE(smazzola): a write-back to rd (from an instruction that is neither a load nor offloaded)
+  // and a write-back to rs1 (from a post-increment load/store) must never occur in the same cycle.
+  // This currently holds: rs1 is written in the very cycle the post-increment load/store issues
+  // (provided that cycle is not a stall) rather than being postponed like an offloaded result, so
+  // no other instruction writing back to rd can be issued in that cycle.
+  // Retire the post-incremented address to rs1 when write_rs1 is set, no stall, and rs1 is not x0.
+  assign retire_p = write_rs1 & ~stall & (rs1 != 0);
   // we can retire if we are not stalling and if the instruction is writing a register
   assign retire_i = write_rd & valid_instr & (rd != 0);
 
@@ -1071,7 +1592,9 @@ module snitch #(
   if (RegNrWritePorts == 1) begin
     always_comb begin
       gpr_we[0] = 1'b0;
-      gpr_waddr[0] = rd;
+      // NOTE(smazzola): this works because write-backs on rd and rs1 in the same cycle are mutually
+      // exclusive; if this should change, the following statement has to be written in another form
+      gpr_waddr[0] = retire_p ? rs1 : rd; // choose whether to writeback at RF[rs1] for post-increment load/stores
       gpr_wdata[0] = alu_writeback;
       // external interfaces
       lsu_pready = 1'b0;
@@ -1079,7 +1602,7 @@ module snitch #(
       retire_acc = 1'b0;
       retire_load = 1'b0;
 
-      if (retire_i) begin
+      if (retire_i | retire_p) begin
         gpr_we[0] = 1'b1;
       // if we are not retiring another instruction retire the load now
       end else if (lsu_pvalid) begin
@@ -1099,7 +1622,9 @@ module snitch #(
   end else if (RegNrWritePorts == 2) begin
     always_comb begin
       gpr_we[0] = 1'b0;
-      gpr_waddr[0] = rd;
+      // NOTE(smazzola): this works because write-backs on rd and rs1 in the same cycle are mutually
+      // exclusive; if this should change, the following statement has to be written in another form
+      gpr_waddr[0] = retire_p ? rs1 : rd; // choose whether to writeback at RF[rs1] for post-increment load/stores
       gpr_wdata[0] = alu_writeback;
       gpr_we[1] = 1'b0;
       gpr_waddr[1] = lsu_rd;
@@ -1110,7 +1635,7 @@ module snitch #(
       retire_acc = 1'b0;
       retire_load = 1'b0;
 
-      if (retire_i) begin
+      if (retire_i | retire_p) begin
         gpr_we[0] = 1'b1;
         if (lsu_pvalid) begin
           retire_load = 1'b1;
diff --git a/hardware/deps/snitch/src/snitch_icache/snitch_icache.sv b/hardware/deps/snitch/src/snitch_icache/snitch_icache.sv
index 8b66d80d3..9bcc4a4d6 100644
--- a/hardware/deps/snitch/src/snitch_icache/snitch_icache.sv
+++ b/hardware/deps/snitch/src/snitch_icache/snitch_icache.sv
@@ -52,7 +52,7 @@ module snitch_icache #(
     input  logic clk_d2_i,
     input  logic rst_ni,
 
-    input  logic                               enable_prefetching_i,
+    input  logic [NR_FETCH_PORTS-1:0]                              enable_prefetching_i,
     output snitch_icache_pkg::icache_events_t [NR_FETCH_PORTS-1:0] icache_events_o,
 
     input  logic flush_valid_i,
@@ -220,24 +220,24 @@ module snitch_icache #(
             .clk_i ( clk_d2_i ),
             .rst_ni,
             .flush_valid_i,
-            .enable_prefetching_i,
-            .icache_events_o ( icache_events_o [i]    ),
-            .in_addr_i       ( inst_addr_i    [i]     ),
-            .in_data_o       ( in_cache_data  [i]     ),
-            .in_error_o      ( in_cache_error [i]     ),
-            .in_valid_i      ( in_cache_valid [i]     ),
-            .in_ready_o      ( in_cache_ready [i]     ),
-
-            .out_req_addr_o  ( local_prefetch_req.addr   ),
-            .out_req_id_o    ( local_prefetch_req.id     ),
-            .out_req_valid_o ( local_prefetch_req_valid ),
-            .out_req_ready_i ( local_prefetch_req_ready ),
-
-            .out_rsp_data_i  ( local_prefetch_rsp.data   ),
-            .out_rsp_error_i ( local_prefetch_rsp.error  ),
-            .out_rsp_id_i    ( local_prefetch_rsp.id     ),
-            .out_rsp_valid_i ( local_prefetch_rsp_valid  ),
-            .out_rsp_ready_o ( local_prefetch_rsp_ready  )
+            .enable_prefetching_i ( enable_prefetching_i [i] ),
+            .icache_events_o      ( icache_events_o [i]      ),
+            .in_addr_i            ( inst_addr_i    [i]       ),
+            .in_data_o            ( in_cache_data  [i]       ),
+            .in_error_o           ( in_cache_error [i]       ),
+            .in_valid_i           ( in_cache_valid [i]       ),
+            .in_ready_o           ( in_cache_ready [i]       ),
+
+            .out_req_addr_o       ( local_prefetch_req.addr  ),
+            .out_req_id_o         ( local_prefetch_req.id    ),
+            .out_req_valid_o      ( local_prefetch_req_valid ),
+            .out_req_ready_i      ( local_prefetch_req_ready ),
+
+            .out_rsp_data_i       ( local_prefetch_rsp.data  ),
+            .out_rsp_error_i      ( local_prefetch_rsp.error ),
+            .out_rsp_id_i         ( local_prefetch_rsp.id    ),
+            .out_rsp_valid_i      ( local_prefetch_rsp_valid ),
+            .out_rsp_ready_o      ( local_prefetch_rsp_ready )
         );
 
         isochronous_spill_register  #(
diff --git a/hardware/deps/snitch/src/snitch_icache/snitch_icache_l0.sv b/hardware/deps/snitch/src/snitch_icache/snitch_icache_l0.sv
index 8544d3078..147dbf093 100644
--- a/hardware/deps/snitch/src/snitch_icache/snitch_icache_l0.sv
+++ b/hardware/deps/snitch/src/snitch_icache/snitch_icache_l0.sv
@@ -298,26 +298,28 @@ module snitch_icache_l0 import snitch_icache_pkg::*; #(
     always_comb begin
       is_branch_taken[i] = 1'b0;
       is_jal[i] = 1'b0;
-      unique casez (ins_data[i*32+:32])
-        // static prediction
-        riscv_instr::BEQ,
-        riscv_instr::BNE,
-        riscv_instr::BLT,
-        riscv_instr::BGE,
-        riscv_instr::BLTU,
-        riscv_instr::BGEU: begin
-          // look at the sign bit of the immediate field
-          // backward branches (immediate negative) taken
-          // forward branches not taken
-          is_branch_taken[i] = ins_data[i*32+31];
-        end
-        riscv_instr::JAL: begin
-          is_jal[i] = 1'b1;
-        end
-        // we can't do anything about the JALR case as we don't
-        // know the destination.
-        default:;
-      endcase
+      if (hit_early_is_onehot) begin // otherwise no prediction: both flags keep their 0 defaults
+        unique casez (ins_data[i*32+:32])
+          // static prediction
+          riscv_instr::BEQ,
+          riscv_instr::BNE,
+          riscv_instr::BLT,
+          riscv_instr::BGE,
+          riscv_instr::BLTU,
+          riscv_instr::BGEU: begin
+            // look at the sign bit of the immediate field
+            // backward branches (immediate negative) taken
+            // forward branches not taken
+            is_branch_taken[i] = ins_data[i*32+31];
+          end
+          riscv_instr::JAL: begin
+            is_jal[i] = 1'b1;
+          end
+          // we can't do anything about the JALR case as we don't
+          // know the destination.
+          default:;
+        endcase
+      end
     end
   end
 
diff --git a/hardware/deps/snitch/src/snitch_ipu.sv b/hardware/deps/snitch/src/snitch_ipu.sv
index 6f6dc2bcf..c7c72e52f 100644
--- a/hardware/deps/snitch/src/snitch_ipu.sv
+++ b/hardware/deps/snitch/src/snitch_ipu.sv
@@ -30,8 +30,10 @@ module snitch_ipu #(
   } result_t;
   // input handshake
   logic div_valid_op, div_ready_op;
+  /* verilator lint_off UNDRIVEN */
   logic mul_valid_op, mul_ready_op;
   logic dsp_valid_op, dsp_ready_op;
+  /* verilator lint_on UNDRIVEN */
   // output handshake
   logic mul_valid, mul_ready;
   logic div_valid, div_ready;
@@ -51,8 +53,13 @@ module snitch_ipu #(
       riscv_instr::MULH,
       riscv_instr::MULHSU,
       riscv_instr::MULHU: begin
-        mul_valid_op = acc_qvalid_i;
-        acc_qready_o = mul_ready_op;
+        if (snitch_pkg::XPULPIMG) begin
+          dsp_valid_op = acc_qvalid_i;
+          acc_qready_o = dsp_ready_op;
+        end else begin
+          mul_valid_op = acc_qvalid_i;
+          acc_qready_o = mul_ready_op;
+        end
       end
       riscv_instr::DIV,
       riscv_instr::DIVU,
@@ -61,21 +68,153 @@ module snitch_ipu #(
         div_valid_op = acc_qvalid_i;
         acc_qready_o = div_ready_op;
       end
-      riscv_instr::P_ABS,          // Xpulpimg: p.abs
-      riscv_instr::P_SLET,         // Xpulpimg: p.slet
-      riscv_instr::P_SLETU,        // Xpulpimg: p.sletu
-      riscv_instr::P_MIN,          // Xpulpimg: p.min
-      riscv_instr::P_MINU,         // Xpulpimg: p.minu
-      riscv_instr::P_MAX,          // Xpulpimg: p.max
-      riscv_instr::P_MAXU,         // Xpulpimg: p.maxu
-      riscv_instr::P_EXTHS,        // Xpulpimg: p.exths
-      riscv_instr::P_EXTHZ,        // Xpulpimg: p.exthz
-      riscv_instr::P_EXTBS,        // Xpulpimg: p.extbs
-      riscv_instr::P_EXTBZ,        // Xpulpimg: p.extbz
-      riscv_instr::P_CLIP,         // Xpulpimg: p.clip
-      riscv_instr::P_CLIPU,        // Xpulpimg: p.clipu
-      riscv_instr::P_CLIPR,        // Xpulpimg: p.clipr
-      riscv_instr::P_CLIPUR: begin // Xpulpimg: p.clipur
+      riscv_instr::P_ABS,                 // Xpulpimg: p.abs
+      riscv_instr::P_SLET,                // Xpulpimg: p.slet
+      riscv_instr::P_SLETU,               // Xpulpimg: p.sletu
+      riscv_instr::P_MIN,                 // Xpulpimg: p.min
+      riscv_instr::P_MINU,                // Xpulpimg: p.minu
+      riscv_instr::P_MAX,                 // Xpulpimg: p.max
+      riscv_instr::P_MAXU,                // Xpulpimg: p.maxu
+      riscv_instr::P_EXTHS,               // Xpulpimg: p.exths
+      riscv_instr::P_EXTHZ,               // Xpulpimg: p.exthz
+      riscv_instr::P_EXTBS,               // Xpulpimg: p.extbs
+      riscv_instr::P_EXTBZ,               // Xpulpimg: p.extbz
+      riscv_instr::P_CLIP,                // Xpulpimg: p.clip
+      riscv_instr::P_CLIPU,               // Xpulpimg: p.clipu
+      riscv_instr::P_CLIPR,               // Xpulpimg: p.clipr
+      riscv_instr::P_CLIPUR,              // Xpulpimg: p.clipur
+      riscv_instr::P_MAC,                 // Xpulpimg: p.mac
+      riscv_instr::P_MSU,                 // Xpulpimg: p.msu
+      riscv_instr::PV_ADD_H,              // Xpulpimg: pv.add.h
+      riscv_instr::PV_ADD_SC_H,           // Xpulpimg: pv.add.sc.h
+      riscv_instr::PV_ADD_SCI_H,          // Xpulpimg: pv.add.sci.h
+      riscv_instr::PV_ADD_B,              // Xpulpimg: pv.add.b
+      riscv_instr::PV_ADD_SC_B,           // Xpulpimg: pv.add.sc.b
+      riscv_instr::PV_ADD_SCI_B,          // Xpulpimg: pv.add.sci.b
+      riscv_instr::PV_SUB_H,              // Xpulpimg: pv.sub.h
+      riscv_instr::PV_SUB_SC_H,           // Xpulpimg: pv.sub.sc.h
+      riscv_instr::PV_SUB_SCI_H,          // Xpulpimg: pv.sub.sci.h
+      riscv_instr::PV_SUB_B,              // Xpulpimg: pv.sub.b
+      riscv_instr::PV_SUB_SC_B,           // Xpulpimg: pv.sub.sc.b
+      riscv_instr::PV_SUB_SCI_B,          // Xpulpimg: pv.sub.sci.b
+      riscv_instr::PV_AVG_H,              // Xpulpimg: pv.avg.h
+      riscv_instr::PV_AVG_SC_H,           // Xpulpimg: pv.avg.sc.h
+      riscv_instr::PV_AVG_SCI_H,          // Xpulpimg: pv.avg.sci.h
+      riscv_instr::PV_AVG_B,              // Xpulpimg: pv.avg.b
+      riscv_instr::PV_AVG_SC_B,           // Xpulpimg: pv.avg.sc.b
+      riscv_instr::PV_AVG_SCI_B,          // Xpulpimg: pv.avg.sci.b
+      riscv_instr::PV_AVGU_H,             // Xpulpimg: pv.avgu.h
+      riscv_instr::PV_AVGU_SC_H,          // Xpulpimg: pv.avgu.sc.h
+      riscv_instr::PV_AVGU_SCI_H,         // Xpulpimg: pv.avgu.sci.h
+      riscv_instr::PV_AVGU_B,             // Xpulpimg: pv.avgu.b
+      riscv_instr::PV_AVGU_SC_B,          // Xpulpimg: pv.avgu.sc.b
+      riscv_instr::PV_AVGU_SCI_B,         // Xpulpimg: pv.avgu.sci.b
+      riscv_instr::PV_MIN_H,              // Xpulpimg: pv.min.h
+      riscv_instr::PV_MIN_SC_H,           // Xpulpimg: pv.min.sc.h
+      riscv_instr::PV_MIN_SCI_H,          // Xpulpimg: pv.min.sci.h
+      riscv_instr::PV_MIN_B,              // Xpulpimg: pv.min.b
+      riscv_instr::PV_MIN_SC_B,           // Xpulpimg: pv.min.sc.b
+      riscv_instr::PV_MIN_SCI_B,          // Xpulpimg: pv.min.sci.b
+      riscv_instr::PV_MINU_H,             // Xpulpimg: pv.minu.h
+      riscv_instr::PV_MINU_SC_H,          // Xpulpimg: pv.minu.sc.h
+      riscv_instr::PV_MINU_SCI_H,         // Xpulpimg: pv.minu.sci.h
+      riscv_instr::PV_MINU_B,             // Xpulpimg: pv.minu.b
+      riscv_instr::PV_MINU_SC_B,          // Xpulpimg: pv.minu.sc.b
+      riscv_instr::PV_MINU_SCI_B,         // Xpulpimg: pv.minu.sci.b
+      riscv_instr::PV_MAX_H,              // Xpulpimg: pv.max.h
+      riscv_instr::PV_MAX_SC_H,           // Xpulpimg: pv.max.sc.h
+      riscv_instr::PV_MAX_SCI_H,          // Xpulpimg: pv.max.sci.h
+      riscv_instr::PV_MAX_B,              // Xpulpimg: pv.max.b
+      riscv_instr::PV_MAX_SC_B,           // Xpulpimg: pv.max.sc.b
+      riscv_instr::PV_MAX_SCI_B,          // Xpulpimg: pv.max.sci.b
+      riscv_instr::PV_MAXU_H,             // Xpulpimg: pv.maxu.h
+      riscv_instr::PV_MAXU_SC_H,          // Xpulpimg: pv.maxu.sc.h
+      riscv_instr::PV_MAXU_SCI_H,         // Xpulpimg: pv.maxu.sci.h
+      riscv_instr::PV_MAXU_B,             // Xpulpimg: pv.maxu.b
+      riscv_instr::PV_MAXU_SC_B,          // Xpulpimg: pv.maxu.sc.b
+      riscv_instr::PV_MAXU_SCI_B,         // Xpulpimg: pv.maxu.sci.b
+      riscv_instr::PV_SRL_H,              // Xpulpimg: pv.srl.h
+      riscv_instr::PV_SRL_SC_H,           // Xpulpimg: pv.srl.sc.h
+      riscv_instr::PV_SRL_SCI_H,          // Xpulpimg: pv.srl.sci.h
+      riscv_instr::PV_SRL_B,              // Xpulpimg: pv.srl.b
+      riscv_instr::PV_SRL_SC_B,           // Xpulpimg: pv.srl.sc.b
+      riscv_instr::PV_SRL_SCI_B,          // Xpulpimg: pv.srl.sci.b
+      riscv_instr::PV_SRA_H,              // Xpulpimg: pv.sra.h
+      riscv_instr::PV_SRA_SC_H,           // Xpulpimg: pv.sra.sc.h
+      riscv_instr::PV_SRA_SCI_H,          // Xpulpimg: pv.sra.sci.h
+      riscv_instr::PV_SRA_B,              // Xpulpimg: pv.sra.b
+      riscv_instr::PV_SRA_SC_B,           // Xpulpimg: pv.sra.sc.b
+      riscv_instr::PV_SRA_SCI_B,          // Xpulpimg: pv.sra.sci.b
+      riscv_instr::PV_SLL_H,              // Xpulpimg: pv.sll.h
+      riscv_instr::PV_SLL_SC_H,           // Xpulpimg: pv.sll.sc.h
+      riscv_instr::PV_SLL_SCI_H,          // Xpulpimg: pv.sll.sci.h
+      riscv_instr::PV_SLL_B,              // Xpulpimg: pv.sll.b
+      riscv_instr::PV_SLL_SC_B,           // Xpulpimg: pv.sll.sc.b
+      riscv_instr::PV_SLL_SCI_B,          // Xpulpimg: pv.sll.sci.b
+      riscv_instr::PV_OR_H,               // Xpulpimg: pv.or.h
+      riscv_instr::PV_OR_SC_H,            // Xpulpimg: pv.or.sc.h
+      riscv_instr::PV_OR_SCI_H,           // Xpulpimg: pv.or.sci.h
+      riscv_instr::PV_OR_B,               // Xpulpimg: pv.or.b
+      riscv_instr::PV_OR_SC_B,            // Xpulpimg: pv.or.sc.b
+      riscv_instr::PV_OR_SCI_B,           // Xpulpimg: pv.or.sci.b
+      riscv_instr::PV_XOR_H,              // Xpulpimg: pv.xor.h
+      riscv_instr::PV_XOR_SC_H,           // Xpulpimg: pv.xor.sc.h
+      riscv_instr::PV_XOR_SCI_H,          // Xpulpimg: pv.xor.sci.h
+      riscv_instr::PV_XOR_B,              // Xpulpimg: pv.xor.b
+      riscv_instr::PV_XOR_SC_B,           // Xpulpimg: pv.xor.sc.b
+      riscv_instr::PV_XOR_SCI_B,          // Xpulpimg: pv.xor.sci.b
+      riscv_instr::PV_AND_H,              // Xpulpimg: pv.and.h
+      riscv_instr::PV_AND_SC_H,           // Xpulpimg: pv.and.sc.h
+      riscv_instr::PV_AND_SCI_H,          // Xpulpimg: pv.and.sci.h
+      riscv_instr::PV_AND_B,              // Xpulpimg: pv.and.b
+      riscv_instr::PV_AND_SC_B,           // Xpulpimg: pv.and.sc.b
+      riscv_instr::PV_AND_SCI_B,          // Xpulpimg: pv.and.sci.b
+      riscv_instr::PV_ABS_H,              // Xpulpimg: pv.abs.h
+      riscv_instr::PV_ABS_B,              // Xpulpimg: pv.abs.b
+      riscv_instr::PV_EXTRACT_H,          // Xpulpimg: pv.extract.h
+      riscv_instr::PV_EXTRACT_B,          // Xpulpimg: pv.extract.b
+      riscv_instr::PV_EXTRACTU_H,         // Xpulpimg: pv.extractu.h
+      riscv_instr::PV_EXTRACTU_B,         // Xpulpimg: pv.extractu.b
+      riscv_instr::PV_INSERT_H,           // Xpulpimg: pv.insert.h
+      riscv_instr::PV_INSERT_B,           // Xpulpimg: pv.insert.b
+      riscv_instr::PV_DOTUP_H,            // Xpulpimg: pv.dotup.h
+      riscv_instr::PV_DOTUP_SC_H,         // Xpulpimg: pv.dotup.sc.h
+      riscv_instr::PV_DOTUP_SCI_H,        // Xpulpimg: pv.dotup.sci.h
+      riscv_instr::PV_DOTUP_B,            // Xpulpimg: pv.dotup.b
+      riscv_instr::PV_DOTUP_SC_B,         // Xpulpimg: pv.dotup.sc.b
+      riscv_instr::PV_DOTUP_SCI_B,        // Xpulpimg: pv.dotup.sci.b
+      riscv_instr::PV_DOTUSP_H,           // Xpulpimg: pv.dotusp.h
+      riscv_instr::PV_DOTUSP_SC_H,        // Xpulpimg: pv.dotusp.sc.h
+      riscv_instr::PV_DOTUSP_SCI_H,       // Xpulpimg: pv.dotusp.sci.h
+      riscv_instr::PV_DOTUSP_B,           // Xpulpimg: pv.dotusp.b
+      riscv_instr::PV_DOTUSP_SC_B,        // Xpulpimg: pv.dotusp.sc.b
+      riscv_instr::PV_DOTUSP_SCI_B,       // Xpulpimg: pv.dotusp.sci.b
+      riscv_instr::PV_DOTSP_H,            // Xpulpimg: pv.dotsp.h
+      riscv_instr::PV_DOTSP_SC_H,         // Xpulpimg: pv.dotsp.sc.h
+      riscv_instr::PV_DOTSP_SCI_H,        // Xpulpimg: pv.dotsp.sci.h
+      riscv_instr::PV_DOTSP_B,            // Xpulpimg: pv.dotsp.b
+      riscv_instr::PV_DOTSP_SC_B,         // Xpulpimg: pv.dotsp.sc.b
+      riscv_instr::PV_DOTSP_SCI_B,        // Xpulpimg: pv.dotsp.sci.b
+      riscv_instr::PV_SDOTUP_H,           // Xpulpimg: pv.sdotup.h
+      riscv_instr::PV_SDOTUP_SC_H,        // Xpulpimg: pv.sdotup.sc.h
+      riscv_instr::PV_SDOTUP_SCI_H,       // Xpulpimg: pv.sdotup.sci.h
+      riscv_instr::PV_SDOTUP_B,           // Xpulpimg: pv.sdotup.b
+      riscv_instr::PV_SDOTUP_SC_B,        // Xpulpimg: pv.sdotup.sc.b
+      riscv_instr::PV_SDOTUP_SCI_B,       // Xpulpimg: pv.sdotup.sci.b
+      riscv_instr::PV_SDOTUSP_H,          // Xpulpimg: pv.sdotusp.h
+      riscv_instr::PV_SDOTUSP_SC_H,       // Xpulpimg: pv.sdotusp.sc.h
+      riscv_instr::PV_SDOTUSP_SCI_H,      // Xpulpimg: pv.sdotusp.sci.h
+      riscv_instr::PV_SDOTUSP_B,          // Xpulpimg: pv.sdotusp.b
+      riscv_instr::PV_SDOTUSP_SC_B,       // Xpulpimg: pv.sdotusp.sc.b
+      riscv_instr::PV_SDOTUSP_SCI_B,      // Xpulpimg: pv.sdotusp.sci.b
+      riscv_instr::PV_SDOTSP_H,           // Xpulpimg: pv.sdotsp.h
+      riscv_instr::PV_SDOTSP_SC_H,        // Xpulpimg: pv.sdotsp.sc.h
+      riscv_instr::PV_SDOTSP_SCI_H,       // Xpulpimg: pv.sdotsp.sci.h
+      riscv_instr::PV_SDOTSP_B,           // Xpulpimg: pv.sdotsp.b
+      riscv_instr::PV_SDOTSP_SC_B,        // Xpulpimg: pv.sdotsp.sc.b
+      riscv_instr::PV_SDOTSP_SCI_B,       // Xpulpimg: pv.sdotsp.sci.b
+      riscv_instr::PV_SHUFFLE2_H,         // Xpulpimg: pv.shuffle2.h
+      riscv_instr::PV_SHUFFLE2_B: begin   // Xpulpimg: pv.shuffle2.b
         if (snitch_pkg::XPULPIMG) begin
           dsp_valid_op = acc_qvalid_i;
           acc_qready_o = dsp_ready_op;
@@ -87,24 +226,6 @@ module snitch_ipu #(
     endcase
   end
 
-  // Multiplication
-  multiplier #(
-    .Width    ( 32      ),
-    .IdWidth  ( IdWidth )
-  ) i_multiplier (
-    .clk_i,
-    .rst_i,
-    .id_i        ( acc_qid_i              ),
-    .operator_i  ( acc_qdata_op_i         ),
-    .operand_a_i ( acc_qdata_arga_i       ),
-    .operand_b_i ( acc_qdata_argb_i       ),
-    .valid_i     ( mul_valid_op           ),
-    .ready_o     ( mul_ready_op           ),
-    .result_o    ( mul.result             ),
-    .valid_o     ( mul_valid              ),
-    .ready_i     ( mul_ready              ),
-    .id_o        ( mul.id                 )
-  );
   // Serial Divider
   serdiv #(
       .WIDTH       ( 32      ),
@@ -123,42 +244,61 @@ module snitch_ipu #(
       .id_o        ( div.id                 ),
       .res_o       ( div.result             )
   );
-  if (snitch_pkg::XPULPIMG) begin : gen_dspu
+
+  if (snitch_pkg::XPULPIMG) begin : gen_xpulpimg
     // DSP Unit
     dspu #(
         .Width    ( 32      ),
         .IdWidth  ( IdWidth )
     ) i_dspu (
-        .clk_i       ( clk_i                  ),
-        .rst_i       ( rst_i                  ),
-        .id_i        ( acc_qid_i              ),
-        .operator_i  ( acc_qdata_op_i         ),
-        .op_a_i      ( acc_qdata_arga_i       ),
-        .op_b_i      ( acc_qdata_argb_i       ),
-        .in_valid_i  ( dsp_valid_op           ),
-        .in_ready_o  ( dsp_ready_op           ),
-        .out_valid_o ( dsp_valid              ),
-        .out_ready_i ( dsp_ready              ),
-        .id_o        ( dsp.id                 ),
-        .result_o    ( dsp.result             )
+        .clk_i       ( clk_i                ),
+        .rst_i       ( rst_i                ),
+        .id_i        ( acc_qid_i            ),
+        .operator_i  ( acc_qdata_op_i       ),
+        .op_a_i      ( acc_qdata_arga_i     ),
+        .op_b_i      ( acc_qdata_argb_i     ),
+        .op_c_i      ( acc_qdata_argc_i     ),
+        .in_valid_i  ( dsp_valid_op         ),
+        .in_ready_o  ( dsp_ready_op         ),
+        .out_valid_o ( dsp_valid            ),
+        .out_ready_i ( dsp_ready            ),
+        .id_o        ( dsp.id               ),
+        .result_o    ( dsp.result           )
     );
-  end
-  // Output Arbitration
-  if (snitch_pkg::XPULPIMG) begin : gen_3inputs
+    // Output Arbitration
     stream_arbiter #(
       .DATA_T ( result_t ),
-      .N_INP  ( 3        )
+      .N_INP  ( 2        )
     ) i_stream_arbiter (
       .clk_i,
-      .rst_ni      ( ~rst_i                            ),
-      .inp_data_i  ( {div, mul, dsp}                   ),
-      .inp_valid_i ( {div_valid, mul_valid, dsp_valid} ),
-      .inp_ready_o ( {div_ready, mul_ready, dsp_ready} ),
-      .oup_data_o  ( oup                               ),
-      .oup_valid_o ( acc_pvalid_o                      ),
-      .oup_ready_i ( acc_pready_i                      )
+      .rst_ni      ( ~rst_i                 ),
+      .inp_data_i  ( {div, dsp}             ),
+      .inp_valid_i ( {div_valid, dsp_valid} ),
+      .inp_ready_o ( {div_ready, dsp_ready} ),
+      .oup_data_o  ( oup                    ),
+      .oup_valid_o ( acc_pvalid_o           ),
+      .oup_ready_i ( acc_pready_i           )
+    );
+  end else begin : gen_vanilla
+    // Multiplication
+    multiplier #(
+      .Width    ( 32      ),
+      .IdWidth  ( IdWidth )
+    ) i_multiplier (
+      .clk_i,
+      .rst_i,
+      .id_i        ( acc_qid_i              ),
+      .operator_i  ( acc_qdata_op_i         ),
+      .operand_a_i ( acc_qdata_arga_i       ),
+      .operand_b_i ( acc_qdata_argb_i       ),
+      .valid_i     ( mul_valid_op           ),
+      .ready_o     ( mul_ready_op           ),
+      .result_o    ( mul.result             ),
+      .valid_o     ( mul_valid              ),
+      .ready_i     ( mul_ready              ),
+      .id_o        ( mul.id                 )
     );
-  end else begin : gen_2inputs
+    // Output Arbitration
     stream_arbiter #(
       .DATA_T ( result_t ),
       .N_INP  ( 2        )
@@ -173,6 +313,7 @@ module snitch_ipu #(
       .oup_ready_i ( acc_pready_i           )
     );
   end
+
   assign acc_pdata_o = oup.result;
   assign acc_pid_o = oup.id;
 endmodule
@@ -188,6 +329,7 @@ module dspu #(
     input  logic [31:0]        operator_i,
     input  logic [Width-1:0]   op_a_i,
     input  logic [Width-1:0]   op_b_i,
+    input  logic [Width-1:0]   op_c_i,
     input  logic               in_valid_i,
     output logic               in_ready_o,
     output logic               out_valid_o,
@@ -202,19 +344,42 @@ module dspu #(
   assign id_o = id_i;
 
   // Decoded fields
-  logic [4:0] ximm;
-  assign ximm = operator_i[24:20];
+  logic [4:0] imm5; // 5-bit immediate: operator_i[24:20]
+  logic [5:0] imm6; // 6-bit immediate: operator_i[24:20] with operator_i[25] appended as LSB
+  assign imm5 = operator_i[24:20];
+  assign imm6 = {operator_i[24:20], operator_i[25]}; // operator_i[25] ends up in imm6[0]
 
   // Internal control signals
-  logic cmp_signed;     // comparator operation is signed
+  logic cmp_signed;            // comparator operation is signed
   enum logic [1:0] {
-    Reg, Zero, ClipBound
-  } cmp_op_b_sel;       // selection of shared comparator operands
-  logic clip_unsigned;  // clip operation has "0" as lower bound
-  logic clip_register;  // if 1 clip operation uses rs2, else ximm
+    None, Reg, Zero, ClipBound
+  } cmp_op_b_sel;              // selection of shared comparator operands
+  logic clip_unsigned;         // clip operation has "0" as lower bound
+  logic clip_register;         // if 1 clip operation uses rs2, else imm5
+  enum logic [1:0] {
+    NoMul, MulLow, MulHigh, MulMac // none, low word of product, high word of product, op_c_i + low word
+  } mul_op;                    // type of multiplication operation (decoder default: NoMul)
+  logic mac_msu;               // multiplication operation is MSU: operand a is negated before multiplying
+  logic mul_op_a_sign;         // 1: multiplier operand a is treated as signed, 0: as unsigned
+  logic mac_op_b_sign;         // 1: multiplier operand b is treated as signed, 0: as unsigned
   enum logic [3:0] {
-    Abs, Sle, Min, Max, Exths, Exthz, Extbs, Extbz, Clip
-  } res_sel;            // result selection
+    Nop, Abs, Sle, Min, Max, Exths, Exthz, Extbs, Extbz, Clip, Mac, Simd
+  } res_sel;                   // result selection
+
+  enum logic [4:0] {
+    SimdNop, SimdAdd, SimdSub, SimdAvg, SimdMin, SimdMax, SimdSrl, SimdSra, SimdSll, SimdOr,
+    SimdXor, SimdAnd, SimdAbs, SimdExt, SimdIns, SimdDotp, SimdShuffle
+  } simd_op;                   // SIMD operation (decoder default: SimdNop)
+  enum logic {
+    HalfWord, Byte             // HalfWord is the decoder default; the *.b instructions select Byte
+  } simd_size;                 // SIMD granularity
+  enum logic [1:0] {
+    Vect, Sc, Sci              // operand B: op_b_i elements, replicated op_b_i scalar, replicated immediate
+  } simd_mode;                 // SIMD mode (decoder default: Vect)
+  logic simd_signed;           // SIMD operation is signed and uses sign-extended imm6 (decoder default: 1)
+  logic simd_dotp_op_a_signed; // signedness of SIMD dotp operand a (1: signed)
+  logic simd_dotp_op_b_signed; // signedness of SIMD dotp operand b (1: signed)
+  logic simd_dotp_acc;         // accumulate result of SIMD dotp on destination reg (set by pv.sdot*)
 
   // --------------------
   // Decoder
@@ -222,46 +387,90 @@ module dspu #(
 
   always_comb begin
     cmp_signed = 1'b1;
-    cmp_op_b_sel = Reg;
+    cmp_op_b_sel = None;
     clip_unsigned = 1'b0;
     clip_register = 1'b0;
-    res_sel = Abs;
+    mul_op = NoMul;
+    mac_msu = 1'b0;
+    mul_op_a_sign = 1'b0;
+    mac_op_b_sign = 1'b0;
+    res_sel = Nop;
+    simd_op = SimdNop;
+    simd_size = HalfWord;
+    simd_mode = Vect;
+    simd_signed = 1;
+    simd_dotp_op_a_signed = 1;
+    simd_dotp_op_b_signed = 1;
+    simd_dotp_acc = 0;
     unique casez (operator_i)
+      // Multiplications from M extension
+      riscv_instr::MUL: begin
+        mul_op = MulLow;
+        mul_op_a_sign = 1'b1;
+        mac_op_b_sign = 1'b1;
+        res_sel = Mac;
+      end
+      riscv_instr::MULH: begin
+        mul_op = MulHigh;
+        mul_op_a_sign = 1'b1;
+        mac_op_b_sign = 1'b1;
+        res_sel = Mac;
+      end
+      riscv_instr::MULHSU: begin
+        mul_op = MulHigh;
+        mul_op_a_sign = 1'b1;
+        res_sel = Mac;
+      end
+      riscv_instr::MULHU: begin
+        mul_op = MulHigh;
+        res_sel = Mac;
+      end
+      // Instructions from Xpulpimg
       riscv_instr::P_ABS: begin
         cmp_op_b_sel = Zero;
         res_sel = Abs;
       end
       riscv_instr::P_SLET: begin
+        cmp_op_b_sel = Reg;
         res_sel = Sle;
       end
       riscv_instr::P_SLETU: begin
         cmp_signed = 1'b0;
+        cmp_op_b_sel = Reg;
         res_sel = Sle;
       end
       riscv_instr::P_MIN: begin
+        cmp_op_b_sel = Reg;
         res_sel = Min;
       end
       riscv_instr::P_MINU: begin
         cmp_signed = 1'b0;
+        cmp_op_b_sel = Reg;
         res_sel = Min;
       end
       riscv_instr::P_MAX: begin
+        cmp_op_b_sel = Reg;
         res_sel = Max;
       end
       riscv_instr::P_MAXU: begin
         cmp_signed = 1'b0;
+        cmp_op_b_sel = Reg;
         res_sel = Max;
       end
       riscv_instr::P_EXTHS: begin
+        cmp_op_b_sel = Reg;
         res_sel = Exths;
       end
       riscv_instr::P_EXTHZ: begin
+        cmp_op_b_sel = Reg;
         res_sel = Exthz;
       end
       riscv_instr::P_EXTBS: begin
+        cmp_op_b_sel = Reg;
         res_sel = Extbs;
       end
       riscv_instr::P_EXTBZ: begin
+        cmp_op_b_sel = Reg;
         res_sel = Extbz;
       end
       riscv_instr::P_CLIP: begin
@@ -284,6 +493,770 @@ module dspu #(
         cmp_op_b_sel = ClipBound;
         res_sel = Clip;
       end
+      riscv_instr::P_MAC: begin
+        mul_op = MulMac;
+        mul_op_a_sign = 1'b1;
+        mac_op_b_sign = 1'b1;
+        res_sel = Mac;
+      end
+      riscv_instr::P_MSU: begin
+        mul_op = MulMac;
+        mac_msu = 1'b1;
+        mul_op_a_sign = 1'b1;
+        mac_op_b_sign = 1'b1;
+        res_sel = Mac;
+      end
+      riscv_instr::PV_ADD_H: begin
+        simd_op = SimdAdd;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ADD_SC_H: begin
+        simd_op = SimdAdd;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ADD_SCI_H: begin
+        simd_op = SimdAdd;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ADD_B: begin
+        simd_op = SimdAdd;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ADD_SC_B: begin
+        simd_op = SimdAdd;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ADD_SCI_B: begin
+        simd_op = SimdAdd;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_H: begin
+        simd_op = SimdSub;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_SC_H: begin
+        simd_op = SimdSub;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_SCI_H: begin
+        simd_op = SimdSub;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_B: begin
+        simd_op = SimdSub;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_SC_B: begin
+        simd_op = SimdSub;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SUB_SCI_B: begin
+        simd_op = SimdSub;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_H: begin
+        simd_op = SimdAvg;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_SC_H: begin
+        simd_op = SimdAvg;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_SCI_H: begin
+        simd_op = SimdAvg;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_SC_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVG_SCI_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_H: begin
+        simd_op = SimdAvg;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_SC_H: begin
+        simd_op = SimdAvg;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_SCI_H: begin
+        simd_op = SimdAvg;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_SC_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AVGU_SCI_B: begin
+        simd_op = SimdAvg;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_H: begin
+        simd_op = SimdMin;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_SC_H: begin
+        simd_op = SimdMin;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_SCI_H: begin
+        simd_op = SimdMin;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_SC_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MIN_SCI_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_H: begin
+        simd_op = SimdMin;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_SC_H: begin
+        simd_op = SimdMin;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_SCI_H: begin
+        simd_op = SimdMin;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_SC_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MINU_SCI_B: begin
+        simd_op = SimdMin;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_H: begin
+        simd_op = SimdMax;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_SC_H: begin
+        simd_op = SimdMax;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_SCI_H: begin
+        simd_op = SimdMax;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_SC_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAX_SCI_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_H: begin
+        simd_op = SimdMax;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_SC_H: begin
+        simd_op = SimdMax;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_SCI_H: begin
+        simd_op = SimdMax;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_SC_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_MAXU_SCI_B: begin
+        simd_op = SimdMax;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_H: begin
+        simd_op = SimdSrl;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_SC_H: begin
+        simd_op = SimdSrl;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_SCI_H: begin
+        simd_op = SimdSrl;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_B: begin
+        simd_op = SimdSrl;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_SC_B: begin
+        simd_op = SimdSrl;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRL_SCI_B: begin
+        simd_op = SimdSrl;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_H: begin
+        simd_op = SimdSra;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_SC_H: begin
+        simd_op = SimdSra;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_SCI_H: begin
+        simd_op = SimdSra;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_B: begin
+        simd_op = SimdSra;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_SC_B: begin
+        simd_op = SimdSra;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SRA_SCI_B: begin
+        simd_op = SimdSra;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_H: begin
+        simd_op = SimdSll;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_SC_H: begin
+        simd_op = SimdSll;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_SCI_H: begin
+        simd_op = SimdSll;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_B: begin
+        simd_op = SimdSll;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_SC_B: begin
+        simd_op = SimdSll;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SLL_SCI_B: begin
+        simd_op = SimdSll;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_H: begin
+        simd_op = SimdOr;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_SC_H: begin
+        simd_op = SimdOr;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_SCI_H: begin
+        simd_op = SimdOr;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_B: begin
+        simd_op = SimdOr;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_SC_B: begin
+        simd_op = SimdOr;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_OR_SCI_B: begin
+        simd_op = SimdOr;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_H: begin
+        simd_op = SimdXor;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_SC_H: begin
+        simd_op = SimdXor;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_SCI_H: begin
+        simd_op = SimdXor;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_B: begin
+        simd_op = SimdXor;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_SC_B: begin
+        simd_op = SimdXor;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_XOR_SCI_B: begin
+        simd_op = SimdXor;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_H: begin
+        simd_op = SimdAnd;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_SC_H: begin
+        simd_op = SimdAnd;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_SCI_H: begin
+        simd_op = SimdAnd;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_B: begin
+        simd_op = SimdAnd;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_SC_B: begin
+        simd_op = SimdAnd;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_AND_SCI_B: begin
+        simd_op = SimdAnd;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ABS_H: begin
+        simd_op = SimdAbs;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_ABS_B: begin
+        simd_op = SimdAbs;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_EXTRACT_H: begin
+        simd_op = SimdExt;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_EXTRACT_B: begin
+        simd_op = SimdExt;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_EXTRACTU_H: begin
+        simd_op = SimdExt;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_EXTRACTU_B: begin
+        simd_op = SimdExt;
+        simd_size = Byte;
+        simd_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_INSERT_H: begin
+        simd_op = SimdIns;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_INSERT_B: begin
+        simd_op = SimdIns;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_H: begin
+        simd_op = SimdDotp;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_H: begin
+        simd_op = SimdDotp;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTUSP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_dotp_op_a_signed = 0;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_H: begin
+        simd_op = SimdDotp;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_DOTSP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_H: begin
+        simd_op = SimdDotp;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_signed = 0;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_op_b_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_H: begin
+        simd_op = SimdDotp;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTUSP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_dotp_op_a_signed = 0;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_H: begin
+        simd_op = SimdDotp;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_SC_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sc;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_SCI_H: begin
+        simd_op = SimdDotp;
+        simd_mode = Sci;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_SC_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sc;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SDOTSP_SCI_B: begin
+        simd_op = SimdDotp;
+        simd_size = Byte;
+        simd_mode = Sci;
+        simd_dotp_acc = 1;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SHUFFLE2_H: begin
+        simd_op = SimdShuffle;
+        res_sel = Simd;
+      end
+      riscv_instr::PV_SHUFFLE2_B: begin
+        simd_op = SimdShuffle;
+        simd_size = Byte;
+        res_sel = Simd;
+      end
       default: ;
     endcase
   end
@@ -294,8 +1267,6 @@ module dspu #(
   // |___//_/ \_\|_|/_/ \_\|_| /_/ \_\|_|  |_||_|
   //
 
-  logic cmp_result;
-
   // --------------------
   // Clips
   // --------------------
@@ -304,8 +1275,8 @@ module dspu #(
   logic [Width-1:0] clip_lower;
   logic [Width-1:0] clip_comp;
 
-  // Generate -2^(ximm-1), 2^(ximm-1)-1 for clip/clipu and -rs2-1, rs2 for clipr, clipur
-  assign clip_lower = ({(Width+1){1'b1}} << $unsigned(ximm)) >> 1;
+  // Generate -2^(imm5-1), 2^(imm5-1)-1 for clip/clipu and -rs2-1, rs2 for clipr, clipur
+  assign clip_lower = ({(Width+1){1'b1}} << $unsigned(imm5)) >> 1;
   assign clip_op_b_n = clip_unsigned ? 'b0 : (clip_register ? ~op_b_i : clip_lower);
   assign clip_op_b = clip_register ? op_b_i : ~clip_lower;
 
@@ -315,13 +1286,14 @@ module dspu #(
   // Select operand to use in comparison for clip operations: clips would need two comparisons
   // to clamp the result between the two bounds; but one comparison is enough if we select the
   // second operand basing on op_a and clip_op_b signs (i.e. rs1 and clip upper bound, being
-  // either rs2 or 2^(ximm-1)-1)
+  // either rs2 or 2^(imm5-1)-1)
   assign clip_comp = clip_use_n_bound ? clip_op_b_n : clip_op_b;
 
   // --------------------
   // Shared comparator
   // --------------------
   logic [Width-1:0] cmp_op_a, cmp_op_b;
+  logic cmp_result;
 
   // Comparator operand A assignment
   assign cmp_op_a = op_a_i;
@@ -338,6 +1310,197 @@ module dspu #(
   // Instantiate comparator
   assign cmp_result = $signed({cmp_op_a[Width-1] & cmp_signed, cmp_op_a}) <= $signed({cmp_op_b[Width-1] & cmp_signed, cmp_op_b});
 
+  // --------------------
+  // Multiplier & acc
+  // --------------------
+
+  // Width x Width multiplier with optional accumulation; mul_op selects which Width bits of the product reach mac_result
+  logic [Width-1:0] mul_op_a; // operand A after the optional negation used for multiply-subtract
+  logic [2*Width-1:0] mul_result; // full double-width product
+  logic [Width-1:0] mac_result; // multiplier/MAC output, forwarded to result_o by the Mac entry of the result mux
+
+  assign mul_op_a = mac_msu ? -op_a_i : op_a_i; // op_a_i is sign-inverted if mac_msu=1, to have -op_a*op_b
+
+  // Each operand gets one extra MSB that copies its sign bit only when its *_sign flag is set, so one signed multiplier serves signed and unsigned operands
+  assign mul_result = $signed({mul_op_a[Width-1] & mul_op_a_sign, mul_op_a}) * $signed({op_b_i[Width-1] & mac_op_b_sign, op_b_i});
+
+  always_comb begin
+    unique case (mul_op)
+      MulLow: mac_result = mul_result[Width-1:0]; // mul, take lowest 32 bits
+      MulHigh: mac_result = mul_result[2*Width-1:Width]; // mul high, take highest 32 bits
+      MulMac: mac_result = op_c_i + mul_result[Width-1:0]; // accumulate
+      default: mac_result = '0; // any other mul_op value drives zero
+    endcase
+  end
+
+  // --------------------
+  // SIMD operations
+  // --------------------
+
+  logic [3:0][7:0] simd_op_a, simd_op_b, simd_op_c; // operands as four byte lanes; a half-word is two adjacent lanes. NOTE(review): fixed 4x8 layout matches Width = 32 only
+  logic [1:0][7:0] simd_imm; // imm6 extended to 16 bits; byte mode uses only simd_imm[0]
+  logic [3:0][7:0] simd_result; // lane-packed result, forwarded to result_o by the Simd entry of the result mux
+
+  // Extend imm6 to 16 bits: signed cast when simd_signed is set, unsigned otherwise (sign extension presumes imm6 is declared as a plain 6-bit vector - confirm at its declaration)
+  always_comb
+    if(simd_signed) simd_imm = $signed(imm6);
+    else simd_imm = $unsigned(imm6);
+
+  // SIMD operands composition: slice op_a_i/op_b_i/op_c_i into lanes; operand B depends on simd_mode (Vect: per lane, Sc: lowest lane replicated, otherwise: immediate replicated)
+  always_comb begin
+    simd_op_a = 'b0; // defaults keep the operands at zero for any simd_size not handled below
+    simd_op_b = 'b0;
+    simd_op_c = 'b0;
+    unique case (simd_size)
+      // half-word granularity
+      HalfWord:
+        for (int i = 0; i < Width/16; i++) begin
+          simd_op_a[2*i +: 2] = op_a_i[16*i +: 16]; // operands A are the half-words of op_a_i
+          // operands B are the half-words of op_b_i, replicated lowest half-word of op_b_i or replicated 6-bit immediate
+          simd_op_b[2*i +: 2] = (simd_mode == Vect) ? op_b_i[16*i +: 16] : ((simd_mode == Sc) ? op_b_i[15:0] : simd_imm);
+          simd_op_c[2*i +: 2] = op_c_i[16*i +: 16]; // operands C are the half-words of op_c_i
+        end
+      // byte granularity
+      Byte:
+        for (int i = 0; i < Width/8; i++) begin
+          simd_op_a[i] = op_a_i[8*i +: 8]; // operands A are the bytes of op_a_i
+          // operands B are the bytes of op_b_i, replicated lowest byte of op_b_i or replicated 6-bit immediate
+          simd_op_b[i] = (simd_mode == Vect) ? op_b_i[8*i +: 8] : ((simd_mode == Sc) ? op_b_i[7:0] : simd_imm[0]);
+          simd_op_c[i] = op_c_i[8*i +: 8]; // operands C are the bytes of op_c_i
+        end
+      default: ;
+    endcase
+  end
+
+  // SIMD unit: computes simd_result from the lane operands; simd_size selects the lane width and simd_op the operation
+  always_comb begin
+    simd_result = 'b0; // default, kept for any simd_size/simd_op combination not handled below
+    unique case (simd_size)
+      // half-word granularity
+      HalfWord: begin
+        unique case (simd_op)
+          SimdAdd: // lane-wise add, each sum truncated to 16 bits
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed(simd_op_a[2*i +: 2]) + $signed(simd_op_b[2*i +: 2]);
+          SimdSub: // lane-wise subtract, each difference truncated to 16 bits
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed(simd_op_a[2*i +: 2]) - $signed(simd_op_b[2*i +: 2]);
+          SimdAvg: // 16-bit truncated sum, then shifted right by one with sign fill (simd_signed) or zero fill
+            for (int i = 0; i < Width/16; i++) begin
+              simd_result[2*i +: 2] = $signed(simd_op_a[2*i +: 2]) + $signed(simd_op_b[2*i +: 2]);
+              simd_result[2*i +: 2] = {simd_result[2*i+1][7] & simd_signed, simd_result[2*i +: 2]} >> 1;
+            end
+          SimdMin: // operands widened by one bit so a single signed compare covers signed and unsigned lanes
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed({simd_op_a[2*i+1][7] & simd_signed, simd_op_a[2*i +: 2]}) <=
+                                       $signed({simd_op_b[2*i+1][7] & simd_signed, simd_op_b[2*i +: 2]}) ?
+                                       simd_op_a[2*i +: 2] : simd_op_b[2*i +: 2];
+          SimdMax: // same widening as SimdMin, keeping the larger lane
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed({simd_op_a[2*i+1][7] & simd_signed, simd_op_a[2*i +: 2]}) >
+                                       $signed({simd_op_b[2*i+1][7] & simd_signed, simd_op_b[2*i +: 2]}) ?
+                                       simd_op_a[2*i +: 2] : simd_op_b[2*i +: 2];
+          SimdSrl: // shift amount is the low 4 bits of each operand-B half-word
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $unsigned(simd_op_a[2*i +: 2]) >> simd_op_b[2*i][3:0];
+          SimdSra: // arithmetic right shift, same shift-amount field as SimdSrl
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed(simd_op_a[2*i +: 2]) >>> simd_op_b[2*i][3:0];
+          SimdSll: // left shift, same shift-amount field as SimdSrl
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $unsigned(simd_op_a[2*i +: 2]) << simd_op_b[2*i][3:0];
+          SimdOr: simd_result = simd_op_a | simd_op_b; // bitwise operations act on all lanes at once
+          SimdXor: simd_result = simd_op_a ^ simd_op_b;
+          SimdAnd: simd_result = simd_op_a & simd_op_b;
+          SimdAbs: // keep strictly positive lanes, negate the others
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = $signed(simd_op_a[2*i +: 2]) > 0 ? simd_op_a[2*i +: 2] : -$signed(simd_op_a[2*i +: 2]);
+          SimdExt: begin // imm6[0] selects the half-word of operand A moved to the low half of the result
+            simd_result[1:0] = simd_op_a[2*imm6[0] +: 2];
+            // sign- or zero-extend
+            simd_result[3:2] = {16{simd_op_a[2*imm6[0]+1][7] & simd_signed}};
+          end
+          SimdIns: begin // start from op_c_i and overwrite the half-word selected by imm6[0] with the low half-word of operand A
+            simd_result = op_c_i;
+            simd_result[2*imm6[0] +: 2] = simd_op_a[1:0];
+          end
+          SimdDotp: begin // sum of lane products; each operand is sign- or zero-extended by its own *_signed flag
+            simd_result = op_c_i & {(Width){simd_dotp_acc}}; // accumulate on rd or start from zero
+            for (int i = 0; i < Width/16; i++) begin
+              simd_result = $signed(simd_result) + $signed({simd_op_a[2*i+1][7] & simd_dotp_op_a_signed, simd_op_a[2*i +: 2]}) *
+                                                   $signed({simd_op_b[2*i+1][7] & simd_dotp_op_b_signed, simd_op_b[2*i +: 2]});
+            end
+          end
+          SimdShuffle: // per lane: selector bit 1 picks operand A or C as source, bit 0 picks the source half-word
+            for (int i = 0; i < Width/16; i++)
+              simd_result[2*i +: 2] = simd_op_b[2*i][1] ? simd_op_a[2*simd_op_b[2*i][0] +: 2] : simd_op_c[2*simd_op_b[2*i][0] +: 2];
+          default: ;
+        endcase
+      end
+      // byte granularity
+      Byte: begin
+        unique case (simd_op)
+          SimdAdd: // lane-wise add, each sum truncated to 8 bits
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed(simd_op_a[i]) + $signed(simd_op_b[i]);
+          SimdSub: // lane-wise subtract, each difference truncated to 8 bits
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed(simd_op_a[i]) - $signed(simd_op_b[i]);
+          SimdAvg: // 8-bit truncated sum, then shifted right by one with sign fill (simd_signed) or zero fill
+            for (int i = 0; i < Width/8; i++) begin
+              simd_result[i] = $signed(simd_op_a[i]) + $signed(simd_op_b[i]);
+              simd_result[i] = {simd_result[i][7] & simd_signed, simd_result[i]} >> 1;
+            end
+          SimdMin: // operands widened by one bit so a single signed compare covers signed and unsigned lanes
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed({simd_op_a[i][7] & simd_signed, simd_op_a[i]}) <=
+                               $signed({simd_op_b[i][7] & simd_signed, simd_op_b[i]}) ?
+                               simd_op_a[i] : simd_op_b[i];
+          SimdMax: // same widening as SimdMin, keeping the larger lane
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed({simd_op_a[i][7] & simd_signed, simd_op_a[i]}) >
+                               $signed({simd_op_b[i][7] & simd_signed, simd_op_b[i]}) ?
+                               simd_op_a[i] : simd_op_b[i];
+          SimdSrl: // shift amount is the low 3 bits of each operand-B byte
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $unsigned(simd_op_a[i]) >> simd_op_b[i][2:0];
+          SimdSra: // arithmetic right shift, same shift-amount field as SimdSrl
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed(simd_op_a[i]) >>> simd_op_b[i][2:0];
+          SimdSll: // left shift, same shift-amount field as SimdSrl
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $unsigned(simd_op_a[i]) << simd_op_b[i][2:0];
+          SimdOr: simd_result = simd_op_a | simd_op_b; // bitwise operations act on all lanes at once
+          SimdXor: simd_result = simd_op_a ^ simd_op_b;
+          SimdAnd: simd_result = simd_op_a & simd_op_b;
+          SimdAbs: // keep strictly positive lanes, negate the others
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = $signed(simd_op_a[i]) > 0 ? simd_op_a[i] : -$signed(simd_op_a[i]);
+          SimdExt: begin // imm6[1:0] selects the byte of operand A moved to the lowest byte of the result
+            simd_result[0] = simd_op_a[imm6[1:0]];
+            // sign- or zero-extend
+            simd_result[3:1] = {24{simd_op_a[imm6[1:0]][7] & simd_signed}};
+          end
+          SimdIns: begin // start from op_c_i and overwrite the byte selected by imm6[1:0] with the lowest byte of operand A
+            simd_result = op_c_i;
+            simd_result[imm6[1:0]] = simd_op_a[0];
+          end
+          SimdDotp: begin // sum of lane products; each operand is sign- or zero-extended by its own *_signed flag
+            simd_result = op_c_i & {(Width){simd_dotp_acc}}; // accumulate on rd or start from zero
+            for (int i = 0; i < Width/8; i++)
+              simd_result = $signed(simd_result) + $signed({simd_op_a[i][7] & simd_dotp_op_a_signed, simd_op_a[i]}) *
+                                                   $signed({simd_op_b[i][7] & simd_dotp_op_b_signed, simd_op_b[i]});
+          end
+          SimdShuffle: // per lane: selector bit 2 picks operand A or C as source, bits 1:0 pick the source byte
+            for (int i = 0; i < Width/8; i++)
+              simd_result[i] = simd_op_b[i][2] ? simd_op_a[simd_op_b[i][1:0]] : simd_op_c[simd_op_b[i][1:0]];
+          default: ;
+        endcase
+      end
+      default: ;
+    endcase
+  end
+
   // --------------------
   // Result generation
   // --------------------
@@ -368,6 +1531,8 @@ module dspu #(
       //     + if clip_op_b >= 0: clip_comp=clip_op_b (i.e. rs1>=0 and clip_op_b>=0) and the result must
       //       be clipped to the upper bound since rs1 > clip_op_b
       Clip: result_o = cmp_result ? (clip_use_n_bound ? clip_op_b_n : op_a_i) : (op_a_i[Width-1] ? op_a_i : clip_op_b);
+      Mac: result_o = mac_result;
+      Simd: result_o = simd_result;
       default: result_o = '0;
     endcase
   end
diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv
index 79e5546a2..5a20ccf51 100644
--- a/hardware/src/mempool_tile.sv
+++ b/hardware/src/mempool_tile.sv
@@ -173,7 +173,7 @@ module mempool_tile
       .clk_i                (clk_i                   ),
       .clk_d2_i             (clk_i                   ),
       .rst_ni               (rst_ni                  ),
-      .enable_prefetching_i (1'b1                    ),
+      .enable_prefetching_i (snitch_inst_valid[c]    ),
       .icache_events_o      (/* Unused */            ),
       .flush_valid_i        (1'b0                    ),
       .flush_ready_o        (/* Unused */            ),
diff --git a/toolchain/riscv-gnu-toolchain b/toolchain/riscv-gnu-toolchain
index 42e484f35..0c46580ac 160000
--- a/toolchain/riscv-gnu-toolchain
+++ b/toolchain/riscv-gnu-toolchain
@@ -1 +1 @@
-Subproject commit 42e484f35b7832ae0f67eb85bf12c7844f64f089
+Subproject commit 0c46580ac5e0cb6eca97e469d61751dda3bdcabb
diff --git a/toolchain/riscv-isa-sim/disasm/disasm.cc b/toolchain/riscv-isa-sim/disasm/disasm.cc
index 9ee83b575..3a73ddfe3 100644
--- a/toolchain/riscv-isa-sim/disasm/disasm.cc
+++ b/toolchain/riscv-isa-sim/disasm/disasm.cc
@@ -367,6 +367,55 @@ struct : public arg_t {
   }
 } p_simm5;
 
+struct : public arg_t {  // Xpulpimg operand printer: value of insn.p_zimm6() as an unsigned decimal
+  std::string to_string(insn_t insn) const {
+    return std::to_string((uint32_t)insn.p_zimm6());
+  }
+} p_zimm6;
+
+struct : public arg_t {  // Xpulpimg operand printer: value of insn.p_simm6() as a signed decimal
+  std::string to_string(insn_t insn) const {
+    return std::to_string((int)insn.p_simm6());
+  }
+} p_simm6;
+
+struct : public arg_t {  // prints "imm(rs1!)": I-type immediate, then the base register; the '!' suffix is used by the post-increment forms
+  std::string to_string(insn_t insn) const {
+    return std::to_string((int)insn.i_imm()) + '(' + xpr_name[insn.rs1()] + "!)";
+  }
+} load_address_irpost;
+
+struct : public arg_t {  // prints "rs2(rs1!)": offset register rs2, then the base register with the post-increment '!' suffix
+  std::string to_string(insn_t insn) const {
+    return std::string(xpr_name[insn.rs2()]) + '(' + xpr_name[insn.rs1()] + "!)";
+  }
+} load_address_rrpost;
+
+struct : public arg_t {  // prints "rs2(rs1)": offset register rs2 and base register rs1, no post-increment marker
+  std::string to_string(insn_t insn) const {
+    return std::string(xpr_name[insn.rs2()]) + '(' + xpr_name[insn.rs1()] + ')';
+  }
+} load_address_rr;
+
+struct : public arg_t {  // prints "imm(rs1!)": S-type immediate, then the base register with the post-increment '!' suffix
+  std::string to_string(insn_t insn) const {
+    return std::to_string((int)insn.s_imm()) + '(' + xpr_name[insn.rs1()] + "!)";
+  }
+} store_address_irpost;
+
+struct : public arg_t {  // prints "rs3(rs1!)": offset register named by insn.p_rs3(), then the base register with the '!' suffix
+  std::string to_string(insn_t insn) const {
+    return std::string(xpr_name[insn.p_rs3()]) + '(' + xpr_name[insn.rs1()] + "!)";
+  }
+} store_address_rrpost;
+
+struct : public arg_t {  // prints "rs3(rs1)": offset register named by insn.p_rs3() and base register rs1, no post-increment marker
+  std::string to_string(insn_t insn) const {
+    return std::string(xpr_name[insn.p_rs3()]) + '(' + xpr_name[insn.rs1()] + ')';
+  }
+} store_address_rr;
+
+
 typedef struct {
   reg_t match;
   reg_t mask;
@@ -434,7 +483,15 @@ disassembler_t::disassembler_t(int xlen)
   #define DEFINE_XFTYPE(code) DISASM_INSN(#code, code, 0, {&frd, &xrs1})
   #define DEFINE_SFENCE_TYPE(code) DISASM_INSN(#code, code, 0, {&xrs1, &xrs2})
   // Xpulpimg
-  #define DEFINE_PITYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_zimm5})
+  #define DEFINE_PLOAD_IRPOST(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_irpost})
+  #define DEFINE_PLOAD_RRPOST(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_rrpost})
+  #define DEFINE_PLOAD_RR(code) DISASM_INSN(#code, code, 0, {&xrd, &load_address_rr})
+  #define DEFINE_PSTORE_IRPOST(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_irpost})
+  #define DEFINE_PSTORE_RRPOST(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_rrpost})
+  #define DEFINE_PSTORE_RR(code) DISASM_INSN(#code, code, 0, {&xrs2, &store_address_rr})
+  #define DEFINE_PI0TYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_zimm5})
+  #define DEFINE_PI1ZTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_zimm6})
+  #define DEFINE_PI1STYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_simm6})
   #define DEFINE_PBTYPE(code) DISASM_INSN(#code, code, 0, {&xrd, &xrs1, &p_simm5, &branch_target})
 
   DEFINE_XLOAD(lb)
@@ -752,7 +809,7 @@ disassembler_t::disassembler_t(int xlen)
   DISASM_INSN("c.fsdsp", c_fsdsp, 0, {&rvc_fp_rs2, &rvc_sdsp_address});
 
   DISASM_INSN("vsetvli", vsetvli, 0, {&xrd, &xrs1, &v_vtype});
-  DISASM_INSN("vsetvl", vsetvl, 0, {&xrd, &xrs1, &xrs2});
+  //DISASM_INSN("vsetvl", vsetvl, 0, {&xrd, &xrs1, &xrs2});
 
   #define DISASM_VMEM_INSN(name, fmt, ff) \
     add_insn(new disasm_insn_t(#name "8"    #ff ".v",  match_##name##8##ff##_v,     mask_##name##8##ff##_v    | mask_nf, fmt)); \
@@ -963,9 +1020,9 @@ disassembler_t::disassembler_t(int xlen)
   DISASM_OPIV__XI_INSN(vslidedown,   0);
 
   //0b01_0000
-  DISASM_OPIV_VXIM_INSN(vadc,    1, 0);
+  //DISASM_OPIV_VXIM_INSN(vadc,    1, 0);
   DISASM_OPIV_VXIM_INSN(vmadc,   1, 1);
-  DISASM_OPIV_VX_M_INSN(vsbc,    1, 0);
+  //DISASM_OPIV_VX_M_INSN(vsbc,    1, 0);
   DISASM_OPIV_VX_M_INSN(vmsbc,   1, 1);
   DISASM_OPIV_VXIM_INSN(vmerge,  1, 0);
   DISASM_INSN("vmv.v.i", vmv_v_i, 0, {&vd, &v_simm5});
@@ -1008,9 +1065,9 @@ disassembler_t::disassembler_t(int xlen)
 
   //OPMVV/OPMVX
   //0b00_0000
-  DISASM_OPIV_VX__INSN(vaaddu,    0);
+  //DISASM_OPIV_VX__INSN(vaaddu,    0);
   DISASM_OPIV_VX__INSN(vaadd,     0);
-  DISASM_OPIV_VX__INSN(vasubu,    0);
+  //DISASM_OPIV_VX__INSN(vasubu,    0);
   DISASM_OPIV_VX__INSN(vasub,     0);
 
   DISASM_OPIV_S___INSN(vredsum,   1);
@@ -1021,7 +1078,7 @@ disassembler_t::disassembler_t(int xlen)
   DISASM_OPIV_S___INSN(vredmin,   1);
   DISASM_OPIV_S___INSN(vredmaxu,  0);
   DISASM_OPIV_S___INSN(vredmax,   1);
-  DISASM_OPIV__X__INSN(vslide1up,  1);
+  //DISASM_OPIV__X__INSN(vslide1up,  1);
   DISASM_OPIV__X__INSN(vslide1down,1);
 
   //0b01_0000
@@ -1062,13 +1119,13 @@ disassembler_t::disassembler_t(int xlen)
   DISASM_OPIV_M___INSN(vmxnor,    1);
 
   //0b10_0000
-  DISASM_OPIV_VX__INSN(vdivu,     0);
+  //DISASM_OPIV_VX__INSN(vdivu,     0);
   DISASM_OPIV_VX__INSN(vdiv,      1);
   DISASM_OPIV_VX__INSN(vremu,     0);
   DISASM_OPIV_VX__INSN(vrem,      1);
-  DISASM_OPIV_VX__INSN(vmulhu,    0);
+  //DISASM_OPIV_VX__INSN(vmulhu,    0);
   DISASM_OPIV_VX__INSN(vmul,      1);
-  DISASM_OPIV_VX__INSN(vmulhsu,   0);
+  //DISASM_OPIV_VX__INSN(vmulhsu,   0);
   DISASM_OPIV_VX__INSN(vmulh,     1);
   DISASM_OPIV_VX__INSN(vmadd,     1);
   DISASM_OPIV_VX__INSN(vnmsub,    1);
@@ -1180,7 +1237,7 @@ disassembler_t::disassembler_t(int xlen)
   DISASM_OPIV__F_INSN(vfrdiv);
 
   //vfunary0
-  DISASM_VFUNARY0_INSN(vf,  v);
+  //DISASM_VFUNARY0_INSN(vf,  v);
 
   DISASM_VFUNARY0_INSN(vfw, v);
   DISASM_INSN("vfwcvt.f.f.v", vfwcvt_f_f_v, 0, {&vd, &vs2, &opt, &vm});
@@ -1277,6 +1334,30 @@ disassembler_t::disassembler_t(int xlen)
   }
 
   // Xpulpimg extension
+  DEFINE_PLOAD_IRPOST(p_lb_irpost);
+  DEFINE_PLOAD_IRPOST(p_lbu_irpost);
+  DEFINE_PLOAD_IRPOST(p_lh_irpost);
+  DEFINE_PLOAD_IRPOST(p_lhu_irpost);
+  DEFINE_PLOAD_IRPOST(p_lw_irpost);
+  DEFINE_PLOAD_RRPOST(p_lb_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lbu_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lh_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lhu_rrpost);
+  DEFINE_PLOAD_RRPOST(p_lw_rrpost);
+  DEFINE_PLOAD_RR(p_lb_rr);
+  DEFINE_PLOAD_RR(p_lbu_rr);
+  DEFINE_PLOAD_RR(p_lh_rr);
+  DEFINE_PLOAD_RR(p_lhu_rr);
+  DEFINE_PLOAD_RR(p_lw_rr);
+  DEFINE_PSTORE_IRPOST(p_sb_irpost);
+  DEFINE_PSTORE_IRPOST(p_sh_irpost);
+  DEFINE_PSTORE_IRPOST(p_sw_irpost);
+  DEFINE_PSTORE_RRPOST(p_sb_rrpost);
+  DEFINE_PSTORE_RRPOST(p_sh_rrpost);
+  DEFINE_PSTORE_RRPOST(p_sw_rrpost);
+  DEFINE_PSTORE_RR(p_sb_rr);
+  DEFINE_PSTORE_RR(p_sh_rr);
+  DEFINE_PSTORE_RR(p_sw_rr);
   DEFINE_R1TYPE(p_abs);
   DEFINE_RTYPE(p_slet);
   DEFINE_RTYPE(p_sletu);
@@ -1288,12 +1369,147 @@ disassembler_t::disassembler_t(int xlen)
   DEFINE_R1TYPE(p_exthz);
   DEFINE_R1TYPE(p_extbs);
   DEFINE_R1TYPE(p_extbz);
-  DEFINE_PITYPE(p_clip);
-  DEFINE_PITYPE(p_clipu);
+  DEFINE_PI0TYPE(p_clip);
+  DEFINE_PI0TYPE(p_clipu);
   DEFINE_RTYPE(p_clipr);
   DEFINE_RTYPE(p_clipur);
   DEFINE_PBTYPE(p_beqimm);
   DEFINE_PBTYPE(p_bneimm);
+  DEFINE_RTYPE(p_mac);
+  DEFINE_RTYPE(p_msu);
+
+  DEFINE_RTYPE(pv_add_h);
+  DEFINE_RTYPE(pv_add_sc_h);
+  DEFINE_PI1STYPE(pv_add_sci_h);
+  DEFINE_RTYPE(pv_add_b);
+  DEFINE_RTYPE(pv_add_sc_b);
+  DEFINE_PI1STYPE(pv_add_sci_b);
+  DEFINE_RTYPE(pv_sub_h);
+  DEFINE_RTYPE(pv_sub_sc_h);
+  DEFINE_PI1STYPE(pv_sub_sci_h);
+  DEFINE_RTYPE(pv_sub_b);
+  DEFINE_RTYPE(pv_sub_sc_b);
+  DEFINE_PI1STYPE(pv_sub_sci_b);
+  DEFINE_RTYPE(pv_avg_h);
+  DEFINE_RTYPE(pv_avg_sc_h);
+  DEFINE_PI1STYPE(pv_avg_sci_h);
+  DEFINE_RTYPE(pv_avg_b);
+  DEFINE_RTYPE(pv_avg_sc_b);
+  DEFINE_PI1STYPE(pv_avg_sci_b);
+  DEFINE_RTYPE(pv_avgu_h);
+  DEFINE_RTYPE(pv_avgu_sc_h);
+  DEFINE_PI1ZTYPE(pv_avgu_sci_h);
+  DEFINE_RTYPE(pv_avgu_b);
+  DEFINE_RTYPE(pv_avgu_sc_b);
+  DEFINE_PI1ZTYPE(pv_avgu_sci_b);
+  DEFINE_RTYPE(pv_min_h);
+  DEFINE_RTYPE(pv_min_sc_h);
+  DEFINE_PI1STYPE(pv_min_sci_h);
+  DEFINE_RTYPE(pv_min_b);
+  DEFINE_RTYPE(pv_min_sc_b);
+  DEFINE_PI1STYPE(pv_min_sci_b);
+  DEFINE_RTYPE(pv_minu_h);
+  DEFINE_RTYPE(pv_minu_sc_h);
+  DEFINE_PI1ZTYPE(pv_minu_sci_h);
+  DEFINE_RTYPE(pv_minu_b);
+  DEFINE_RTYPE(pv_minu_sc_b);
+  DEFINE_PI1ZTYPE(pv_minu_sci_b);
+  DEFINE_RTYPE(pv_max_h);
+  DEFINE_RTYPE(pv_max_sc_h);
+  DEFINE_PI1STYPE(pv_max_sci_h);
+  DEFINE_RTYPE(pv_max_b);
+  DEFINE_RTYPE(pv_max_sc_b);
+  DEFINE_PI1STYPE(pv_max_sci_b);
+  DEFINE_RTYPE(pv_maxu_h);
+  DEFINE_RTYPE(pv_maxu_sc_h);
+  DEFINE_PI1ZTYPE(pv_maxu_sci_h);
+  DEFINE_RTYPE(pv_maxu_b);
+  DEFINE_RTYPE(pv_maxu_sc_b);
+  DEFINE_PI1ZTYPE(pv_maxu_sci_b);
+  DEFINE_RTYPE(pv_srl_h);
+  DEFINE_RTYPE(pv_srl_sc_h);
+  DEFINE_PI1ZTYPE(pv_srl_sci_h);
+  DEFINE_RTYPE(pv_srl_b);
+  DEFINE_RTYPE(pv_srl_sc_b);
+  DEFINE_PI1ZTYPE(pv_srl_sci_b);
+  DEFINE_RTYPE(pv_sra_h);
+  DEFINE_RTYPE(pv_sra_sc_h);
+  DEFINE_PI1ZTYPE(pv_sra_sci_h);
+  DEFINE_RTYPE(pv_sra_b);
+  DEFINE_RTYPE(pv_sra_sc_b);
+  DEFINE_PI1ZTYPE(pv_sra_sci_b);
+  DEFINE_RTYPE(pv_sll_h);
+  DEFINE_RTYPE(pv_sll_sc_h);
+  DEFINE_PI1ZTYPE(pv_sll_sci_h);
+  DEFINE_RTYPE(pv_sll_b);
+  DEFINE_RTYPE(pv_sll_sc_b);
+  DEFINE_PI1ZTYPE(pv_sll_sci_b);
+  DEFINE_RTYPE(pv_or_h);
+  DEFINE_RTYPE(pv_or_sc_h);
+  DEFINE_PI1ZTYPE(pv_or_sci_h);
+  DEFINE_RTYPE(pv_or_b);
+  DEFINE_RTYPE(pv_or_sc_b);
+  DEFINE_PI1ZTYPE(pv_or_sci_b);
+  DEFINE_RTYPE(pv_xor_h);
+  DEFINE_RTYPE(pv_xor_sc_h);
+  DEFINE_PI1ZTYPE(pv_xor_sci_h);
+  DEFINE_RTYPE(pv_xor_b);
+  DEFINE_RTYPE(pv_xor_sc_b);
+  DEFINE_PI1ZTYPE(pv_xor_sci_b);
+  DEFINE_RTYPE(pv_and_h);
+  DEFINE_RTYPE(pv_and_sc_h);
+  DEFINE_PI1ZTYPE(pv_and_sci_h);
+  DEFINE_RTYPE(pv_and_b);
+  DEFINE_RTYPE(pv_and_sc_b);
+  DEFINE_PI1ZTYPE(pv_and_sci_b);
+  DEFINE_R1TYPE(pv_abs_h);
+  DEFINE_R1TYPE(pv_abs_b);
+  DEFINE_PI1ZTYPE(pv_extract_h);
+  DEFINE_PI1ZTYPE(pv_extract_b);
+  DEFINE_PI1ZTYPE(pv_extractu_h);
+  DEFINE_PI1ZTYPE(pv_extractu_b);
+  DEFINE_PI1ZTYPE(pv_insert_h);
+  DEFINE_PI1ZTYPE(pv_insert_b);
+
+  DEFINE_RTYPE(pv_dotup_h);
+  DEFINE_RTYPE(pv_dotup_sc_h);
+  DEFINE_PI1ZTYPE(pv_dotup_sci_h);
+  DEFINE_RTYPE(pv_dotup_b);
+  DEFINE_RTYPE(pv_dotup_sc_b);
+  DEFINE_PI1ZTYPE(pv_dotup_sci_b);
+  DEFINE_RTYPE(pv_dotusp_h);
+  DEFINE_RTYPE(pv_dotusp_sc_h);
+  DEFINE_PI1ZTYPE(pv_dotusp_sci_h);
+  DEFINE_RTYPE(pv_dotusp_b);
+  DEFINE_RTYPE(pv_dotusp_sc_b);
+  DEFINE_PI1ZTYPE(pv_dotusp_sci_b);
+  DEFINE_RTYPE(pv_dotsp_h);
+  DEFINE_RTYPE(pv_dotsp_sc_h);
+  DEFINE_PI1ZTYPE(pv_dotsp_sci_h);
+  DEFINE_RTYPE(pv_dotsp_b);
+  DEFINE_RTYPE(pv_dotsp_sc_b);
+  DEFINE_PI1ZTYPE(pv_dotsp_sci_b);
+  DEFINE_RTYPE(pv_sdotup_h);
+  DEFINE_RTYPE(pv_sdotup_sc_h);
+  DEFINE_PI1ZTYPE(pv_sdotup_sci_h);
+  DEFINE_RTYPE(pv_sdotup_b);
+  DEFINE_RTYPE(pv_sdotup_sc_b);
+  DEFINE_PI1ZTYPE(pv_sdotup_sci_b);
+  DEFINE_RTYPE(pv_sdotusp_h);
+  DEFINE_RTYPE(pv_sdotusp_sc_h);
+  DEFINE_PI1ZTYPE(pv_sdotusp_sci_h);
+  DEFINE_RTYPE(pv_sdotusp_b);
+  DEFINE_RTYPE(pv_sdotusp_sc_b);
+  DEFINE_PI1ZTYPE(pv_sdotusp_sci_b);
+  DEFINE_RTYPE(pv_sdotsp_h);
+  DEFINE_RTYPE(pv_sdotsp_sc_h);
+  DEFINE_PI1ZTYPE(pv_sdotsp_sci_h);
+  DEFINE_RTYPE(pv_sdotsp_b);
+  DEFINE_RTYPE(pv_sdotsp_sc_b);
+  DEFINE_PI1ZTYPE(pv_sdotsp_sci_b);
+
+  DEFINE_RTYPE(pv_shuffle2_h);
+  DEFINE_RTYPE(pv_shuffle2_b);
 
   // provide a default disassembly for all instructions as a fallback
   #define DECLARE_INSN(code, match, mask) \
diff --git a/toolchain/riscv-isa-sim/riscv/decode.h b/toolchain/riscv-isa-sim/riscv/decode.h
index 63b1e2676..d6d270af8 100644
--- a/toolchain/riscv-isa-sim/riscv/decode.h
+++ b/toolchain/riscv-isa-sim/riscv/decode.h
@@ -131,6 +131,10 @@ class insn_t
   // Xpulpimg
   uint64_t p_zimm5() { return x(20, 5); }
   int64_t p_simm5() { return xs(20, 5); }
+  uint64_t p_rs3() { return x(7, 5); } // third source register index; presumably x(lo, len) extracts len bits starting at bit lo, i.e. the rd field - confirm
+  uint64_t p_zimm6() { return x(25,1) + (x(20, 5) << 1); } // unsigned 6-bit immediate: the bit taken at position 25 is the LSB, the 5-bit field at position 20 forms the upper bits
+  int64_t p_simm6() { return x(25,1) + (xs(20, 5) << 1); } // same layout as p_zimm6, upper field read through xs() (presumably the sign-extending variant of x() - confirm)
+
 
 private:
   insn_bits_t b;
@@ -284,6 +288,16 @@ class regfile_t
 #define sext8(x)  ((sreg_t)(int8_t)(x))
 #define zext8(x)  ((reg_t)(uint8_t)(x))
 
+#define P_RS3 READ_REG(insn.p_rs3()) /* value of the register named by p_rs3() (same field as RD), read as a source operand */
+#define WRITE_RS1(value) WRITE_REG(insn.rs1(), value) /* write back to rs1, used by the post-increment loads and stores */
+
+#define RS1_H(i) ((RS1 >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rs1 half: i should only be 0 or 1; argument parenthesised so expressions are safe */
+#define RS1_B(i) ((RS1 >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rs1 byte: i should only be from 0 to 3 */
+#define RS2_H(i) ((RS2 >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rs2 half: i should only be 0 or 1 */
+#define RS2_B(i) ((RS2 >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rs2 byte: i should only be from 0 to 3 */
+#define RD_H(i) ((RD >> ((xlen >> 1) * ((i) & 0x1))) & 0xFFFF) /* select rd half: i should only be 0 or 1 */
+#define RD_B(i) ((RD >> ((xlen >> 2) * ((i) & 0x3))) & 0xFF) /* select rd byte: i should only be from 0 to 3; shift step equals the lane width only for xlen == 32 */
+
 
 #define sext32(x) ((sreg_t)(int32_t)(x))
 #define zext32(x) ((reg_t)(uint32_t)(x))
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lb_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lb_irpost.h
new file mode 100644
index 000000000..ed17db162
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lb_irpost.h
@@ -0,0 +1,6 @@
+// Post-increment load: sample rs1 and fetch the data before any register write,
+// then update rs1 and write rd last so the loaded value wins when rd == rs1.
+const auto base = RS1;
+const auto data = MMU.load_int8(base);
+WRITE_RS1(base + insn.i_imm());
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lb_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_lb_rr.h
new file mode 100644
index 000000000..c32237fe1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lb_rr.h
@@ -0,0 +1,3 @@
+// Register-register load via load_int8: the address is rs1 plus rs2 taken as a signed offset.
+const auto ea = RS1 + sreg_t(RS2);
+WRITE_RD(MMU.load_int8(ea));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lb_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lb_rrpost.h
new file mode 100644
index 000000000..9dc2bd93d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lb_rrpost.h
@@ -0,0 +1,7 @@
+// Post-increment load: sample rs1/rs2 and fetch the data before any register
+// write (rd may alias rs1 or rs2); rd is written last so the loaded value wins.
+const auto base = RS1;
+const auto step = sreg_t(RS2);
+const auto data = MMU.load_int8(base);
+WRITE_RS1(base + step);
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lbu_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_irpost.h
new file mode 100644
index 000000000..0f015c376
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_irpost.h
@@ -0,0 +1,6 @@
+// Post-increment load: sample rs1 and fetch the data before any register write,
+// then update rs1 and write rd last so the loaded value wins when rd == rs1.
+const auto base = RS1;
+const auto data = MMU.load_uint8(base);
+WRITE_RS1(base + insn.i_imm());
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rr.h
new file mode 100644
index 000000000..a95ca2a9a
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rr.h
@@ -0,0 +1,3 @@
+// Register-register load via load_uint8: the address is rs1 plus rs2 taken as a signed offset.
+const auto ea = RS1 + sreg_t(RS2);
+WRITE_RD(MMU.load_uint8(ea));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rrpost.h
new file mode 100644
index 000000000..3456c8aec
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lbu_rrpost.h
@@ -0,0 +1,7 @@
+// Post-increment load: sample rs1/rs2 and fetch the data before any register
+// write (rd may alias rs1 or rs2); rd is written last so the loaded value wins.
+const auto base = RS1;
+const auto step = sreg_t(RS2);
+const auto data = MMU.load_uint8(base);
+WRITE_RS1(base + step);
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lh_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lh_irpost.h
new file mode 100644
index 000000000..3fea47c18
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lh_irpost.h
@@ -0,0 +1,6 @@
+// Post-increment load: sample rs1 and fetch the data before any register write,
+// then update rs1 and write rd last so the loaded value wins when rd == rs1.
+const auto base = RS1;
+const auto data = MMU.load_int16(base);
+WRITE_RS1(base + insn.i_imm());
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lh_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_lh_rr.h
new file mode 100644
index 000000000..cd5bf8219
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lh_rr.h
@@ -0,0 +1,3 @@
+// Register-register load via load_int16: the address is rs1 plus rs2 taken as a signed offset.
+const auto ea = RS1 + sreg_t(RS2);
+WRITE_RD(MMU.load_int16(ea));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lh_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lh_rrpost.h
new file mode 100644
index 000000000..60353fd3e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lh_rrpost.h
@@ -0,0 +1,7 @@
+// Post-increment load: sample rs1/rs2 and fetch the data before any register
+// write (rd may alias rs1 or rs2); rd is written last so the loaded value wins.
+const auto base = RS1;
+const auto step = sreg_t(RS2);
+const auto data = MMU.load_int16(base);
+WRITE_RS1(base + step);
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lhu_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_irpost.h
new file mode 100644
index 000000000..8e7cfb6be
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_irpost.h
@@ -0,0 +1,6 @@
+// Post-increment load: sample rs1 and fetch the data before any register write,
+// then update rs1 and write rd last so the loaded value wins when rd == rs1.
+const auto base = RS1;
+const auto data = MMU.load_uint16(base);
+WRITE_RS1(base + insn.i_imm());
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rr.h
new file mode 100644
index 000000000..6568736a7
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rr.h
@@ -0,0 +1,3 @@
+// Register-register load via load_uint16: the address is rs1 plus rs2 taken as a signed offset.
+const auto ea = RS1 + sreg_t(RS2);
+WRITE_RD(MMU.load_uint16(ea));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rrpost.h
new file mode 100644
index 000000000..195222ac0
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lhu_rrpost.h
@@ -0,0 +1,7 @@
+// Post-increment load: sample rs1/rs2 and fetch the data before any register
+// write (rd may alias rs1 or rs2); rd is written last so the loaded value wins.
+const auto base = RS1;
+const auto step = sreg_t(RS2);
+const auto data = MMU.load_uint16(base);
+WRITE_RS1(base + step);
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lw_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lw_irpost.h
new file mode 100644
index 000000000..fb77d8723
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lw_irpost.h
@@ -0,0 +1,6 @@
+// Post-increment load: sample rs1 and fetch the data before any register write,
+// then update rs1 and write rd last so the loaded value wins when rd == rs1.
+const auto base = RS1;
+const auto data = MMU.load_int32(base);
+WRITE_RS1(base + insn.i_imm());
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lw_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_lw_rr.h
new file mode 100644
index 000000000..78fa33231
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lw_rr.h
@@ -0,0 +1,3 @@
+// Register-register load via load_int32: the address is rs1 plus rs2 taken as a signed offset.
+const auto ea = RS1 + sreg_t(RS2);
+WRITE_RD(MMU.load_int32(ea));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_lw_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_lw_rrpost.h
new file mode 100644
index 000000000..e315c5dfe
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_lw_rrpost.h
@@ -0,0 +1,7 @@
+// Post-increment load: sample rs1/rs2 and fetch the data before any register
+// write (rd may alias rs1 or rs2); rd is written last so the loaded value wins.
+const auto base = RS1;
+const auto step = sreg_t(RS2);
+const auto data = MMU.load_int32(base);
+WRITE_RS1(base + step);
+WRITE_RD(data);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_mac.h b/toolchain/riscv-isa-sim/riscv/insns/p_mac.h
new file mode 100644
index 000000000..bf5c77a14
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_mac.h
@@ -0,0 +1,3 @@
+// Multiply-accumulate: rd <- rd + rs1 * rs2; the product and the sum are each passed through sext_xlen.
+const auto product = sext_xlen(sreg_t(RS1) * sreg_t(RS2));
+WRITE_RD(sext_xlen(sreg_t(RD) + product));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_msu.h b/toolchain/riscv-isa-sim/riscv/insns/p_msu.h
new file mode 100644
index 000000000..2a42cf05e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_msu.h
@@ -0,0 +1,3 @@
+// Multiply-subtract: rd <- rd - rs1 * rs2; the product and the difference are each passed through sext_xlen.
+const auto product = sext_xlen(sreg_t(RS1) * sreg_t(RS2));
+WRITE_RD(sext_xlen(sreg_t(RD) - product));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sb_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sb_irpost.h
new file mode 100644
index 000000000..9339bc9ca
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sb_irpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint8 at address rs1, then advance rs1 by the S-type immediate.
+const auto base = RS1;
+MMU.store_uint8(base, RS2);
+WRITE_RS1(base + insn.s_imm());
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sb_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_sb_rr.h
new file mode 100644
index 000000000..73e49727c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sb_rr.h
@@ -0,0 +1,3 @@
+// Register-register store through store_uint8: the address is rs1 plus P_RS3 taken as a signed offset.
+const auto ea = RS1 + sreg_t(P_RS3);
+MMU.store_uint8(ea, RS2);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sb_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sb_rrpost.h
new file mode 100644
index 000000000..044255174
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sb_rrpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint8 at address rs1, then advance rs1 by P_RS3 taken as a signed offset.
+const auto base = RS1;
+MMU.store_uint8(base, RS2);
+WRITE_RS1(base + sreg_t(P_RS3));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sh_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sh_irpost.h
new file mode 100644
index 000000000..f915c518d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sh_irpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint16 at address rs1, then advance rs1 by the S-type immediate.
+const auto base = RS1;
+MMU.store_uint16(base, RS2);
+WRITE_RS1(base + insn.s_imm());
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sh_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_sh_rr.h
new file mode 100644
index 000000000..f3270bd56
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sh_rr.h
@@ -0,0 +1,3 @@
+// Register-register store through store_uint16: the address is rs1 plus P_RS3 taken as a signed offset.
+const auto ea = RS1 + sreg_t(P_RS3);
+MMU.store_uint16(ea, RS2);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sh_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sh_rrpost.h
new file mode 100644
index 000000000..5043c6287
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sh_rrpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint16 at address rs1, then advance rs1 by P_RS3 taken as a signed offset.
+const auto base = RS1;
+MMU.store_uint16(base, RS2);
+WRITE_RS1(base + sreg_t(P_RS3));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sw_irpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sw_irpost.h
new file mode 100644
index 000000000..7ff0406fe
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sw_irpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint32 at address rs1, then advance rs1 by the S-type immediate.
+const auto base = RS1;
+MMU.store_uint32(base, RS2);
+WRITE_RS1(base + insn.s_imm());
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sw_rr.h b/toolchain/riscv-isa-sim/riscv/insns/p_sw_rr.h
new file mode 100644
index 000000000..6bef97f73
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sw_rr.h
@@ -0,0 +1,3 @@
+// Register-register store through store_uint32: the address is rs1 plus P_RS3 taken as a signed offset.
+const auto ea = RS1 + sreg_t(P_RS3);
+MMU.store_uint32(ea, RS2);
diff --git a/toolchain/riscv-isa-sim/riscv/insns/p_sw_rrpost.h b/toolchain/riscv-isa-sim/riscv/insns/p_sw_rrpost.h
new file mode 100644
index 000000000..6382d6d80
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/p_sw_rrpost.h
@@ -0,0 +1,4 @@
+// Post-increment store: store rs2 through store_uint32 at address rs1, then advance rs1 by P_RS3 taken as a signed offset.
+const auto base = RS1;
+MMU.store_uint32(base, RS2);
+WRITE_RS1(base + sreg_t(P_RS3));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_abs_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_abs_b.h
new file mode 100644
index 000000000..c0bc089cc
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_abs_b.h
@@ -0,0 +1,9 @@
+// Per-byte absolute value of rs1; lanes are packed from the highest one down.
+uint32_t packed = 0;
+
+for (int lane = xlen/8 - 1; lane >= 0; --lane) {
+  const sreg_t elem = sext8(RS1_B(lane));
+  const int8_t magnitude = elem > 0 ? elem : -elem;  // wraps to 8 bits
+  packed = (packed << 8) | (uint8_t)magnitude;
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_abs_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_abs_h.h
new file mode 100644
index 000000000..42ca4ff3c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_abs_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: keep RS1_H(k) when its sext16 value is positive, otherwise store the negated sext16 value.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) > 0 ? RS1_H(k) : -sext16(RS1_H(k));
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_b.h
new file mode 100644
index 000000000..ecae63a04
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + sext8(RS2_B(k)), narrowed to int8_t and packed into RD.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) + sext8(RS2_B(k));
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_h.h
new file mode 100644
index 000000000..0a78665af
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + sext16(RS2_H(k)), narrowed to int16_t and packed into RD.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) + sext16(RS2_H(k));
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_b.h
new file mode 100644
index 000000000..572b61c07
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + sext8(RS2_B(0)); RS2_B(0) is reused as the second operand of every lane.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) + sext8(RS2_B(0));
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_h.h
new file mode 100644
index 000000000..734a911b8
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + sext16(RS2_H(0)); RS2_H(0) is reused as the second operand of every lane.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) + sext16(RS2_H(0));
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_b.h
new file mode 100644
index 000000000..df47f1cb5
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + insn.p_simm6(), narrowed to int8_t and packed into RD.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) + insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_h.h
new file mode 100644
index 000000000..907621c09
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_add_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + insn.p_simm6(), narrowed to int16_t and packed into RD.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) + insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_b.h
new file mode 100644
index 000000000..d3711b762
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: bitwise AND of RS1_B(k) and RS2_B(k), packed into RD.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = RS1_B(k) & RS2_B(k);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_h.h
new file mode 100644
index 000000000..8bae35685
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: bitwise AND of RS1_H(k) and RS2_H(k), packed into RD.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = RS1_H(k) & RS2_H(k);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_b.h
new file mode 100644
index 000000000..b1e6c865e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: bitwise AND of RS1_B(k) with RS2_B(0), which is reused for every lane.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = RS1_B(k) & RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_h.h
new file mode 100644
index 000000000..2389d11e1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: bitwise AND of RS1_H(k) with RS2_H(0), which is reused for every lane.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = RS1_H(k) & RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_b.h
new file mode 100644
index 000000000..7e4e9e0ac
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: bitwise AND of RS1_B(k) with insn.p_simm6(), narrowed to uint8_t.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = RS1_B(k) & insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_h.h
new file mode 100644
index 000000000..fbd57d116
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_and_sci_h.h
@@ -0,0 +1,9 @@
+// pv_and_sci_h: per 16-bit lane, bitwise AND of RS1_H(i) with insn.p_simm6(), packed into RD.
+uint16_t temp;  // halfword-wide, so the upper byte of each lane result is preserved
+uint32_t simd_rd = 0;
+for(int i = xlen/16 - 1; i >= 0; i--){  // iterate over 16-bit lanes (not 8-bit ones), highest index first
+  temp = RS1_H(i) & insn.p_simm6();
+  simd_rd <<= 16;
+  simd_rd += (uint32_t)temp & 0x0000FFFF;  // insert the full 16-bit lane result
+}
+WRITE_RD(sext_xlen(simd_rd));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_b.h
new file mode 100644
index 000000000..3d5d6d472
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + sext8(RS2_B(k)), re-narrowed through sext8, then shifted right by one.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(sext8(RS1_B(k)) + sext8(RS2_B(k))) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_h.h
new file mode 100644
index 000000000..725f2f2e0
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + sext16(RS2_H(k)), re-narrowed through sext16, then shifted right by one.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(sext16(RS1_H(k)) + sext16(RS2_H(k))) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_b.h
new file mode 100644
index 000000000..0b7d2f8d2
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + sext8(RS2_B(0)), re-narrowed through sext8, then shifted right by one.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(sext8(RS1_B(k)) + sext8(RS2_B(0))) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_h.h
new file mode 100644
index 000000000..8a6cb5e50
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + sext16(RS2_H(0)), re-narrowed through sext16, then shifted right by one.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(sext16(RS1_H(k)) + sext16(RS2_H(0))) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_b.h
new file mode 100644
index 000000000..ff67065e3
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: sext8(RS1_B(k)) + insn.p_simm6(), re-narrowed through sext8, then shifted right by one.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(sext8(RS1_B(k)) + insn.p_simm6()) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_h.h
new file mode 100644
index 000000000..f7deefd25
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avg_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: sext16(RS1_H(k)) + insn.p_simm6(), re-narrowed through sext16, then shifted right by one.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(sext16(RS1_H(k)) + insn.p_simm6()) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_b.h
new file mode 100644
index 000000000..435c4d22c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: zext8(RS1_B(k)) + zext8(RS2_B(k)), re-narrowed through zext8, then shifted right by one.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(zext8(RS1_B(k)) + zext8(RS2_B(k))) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_h.h
new file mode 100644
index 000000000..3fdbaf4dd
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: zext16(RS1_H(k)) + zext16(RS2_H(k)), re-narrowed through zext16, then shifted right by one.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(zext16(RS1_H(k)) + zext16(RS2_H(k))) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_b.h
new file mode 100644
index 000000000..47ca3888b
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: zext8(RS1_B(k)) + zext8(RS2_B(0)), re-narrowed through zext8, then shifted right by one.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(zext8(RS1_B(k)) + zext8(RS2_B(0))) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_h.h
new file mode 100644
index 000000000..0bf92f93b
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: zext16(RS1_H(k)) + zext16(RS2_H(0)), re-narrowed through zext16, then shifted right by one.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(zext16(RS1_H(k)) + zext16(RS2_H(0))) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_b.h
new file mode 100644
index 000000000..fbc0dff92
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: zext8(RS1_B(k)) + insn.p_zimm6(), re-narrowed through zext8, then shifted right by one.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(zext8(RS1_B(k)) + insn.p_zimm6()) >> 1;
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_h.h
new file mode 100644
index 000000000..dd8cd3544
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_avgu_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: zext16(RS1_H(k)) + insn.p_zimm6(), re-narrowed through zext16, then shifted right by one.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(zext16(RS1_H(k)) + insn.p_zimm6()) >> 1;
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_b.h
new file mode 100644
index 000000000..93b7233cc
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_b.h
@@ -0,0 +1,6 @@
+// Accumulates sext8(RS1_B(k)) * sext8(RS2_B(k)) over all 8-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sext8(RS1_B(k)) * sext8(RS2_B(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_h.h
new file mode 100644
index 000000000..9feed35ef
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_h.h
@@ -0,0 +1,6 @@
+// Accumulates sext16(RS1_H(k)) * sext16(RS2_H(k)) over all 16-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sext16(RS1_H(k)) * sext16(RS2_H(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_b.h
new file mode 100644
index 000000000..cef11d5e7
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_b.h
@@ -0,0 +1,6 @@
+// Accumulates sext8(RS1_B(k)) * sext8(RS2_B(0)) over all 8-bit lanes; RS2_B(0) is the common factor.
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sext8(RS1_B(k)) * sext8(RS2_B(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_h.h
new file mode 100644
index 000000000..ef558d39f
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sc_h.h
@@ -0,0 +1,6 @@
+// Accumulates sext16(RS1_H(k)) * sext16(RS2_H(0)) over all 16-bit lanes; RS2_H(0) is the common factor.
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sext16(RS1_H(k)) * sext16(RS2_H(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_b.h
new file mode 100644
index 000000000..3470fd55a
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_b.h
@@ -0,0 +1,6 @@
+// Accumulates sext8(RS1_B(k)) * insn.p_simm6() over all 8-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sext8(RS1_B(k)) * insn.p_simm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_h.h
new file mode 100644
index 000000000..97e30eb29
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotsp_sci_h.h
@@ -0,0 +1,6 @@
+// Accumulates sext16(RS1_H(k)) * insn.p_simm6() over all 16-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sext16(RS1_H(k)) * insn.p_simm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_b.h
new file mode 100644
index 000000000..fa77f3667
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_b.h
@@ -0,0 +1,6 @@
+// Accumulates zext8(RS1_B(k)) * zext8(RS2_B(k)) over all 8-bit lanes and writes the sum to RD.
+uint32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += zext8(RS1_B(k)) * zext8(RS2_B(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_h.h
new file mode 100644
index 000000000..4e170b238
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_h.h
@@ -0,0 +1,6 @@
+// Accumulates zext16(RS1_H(k)) * zext16(RS2_H(k)) over all 16-bit lanes and writes the sum to RD.
+uint32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += zext16(RS1_H(k)) * zext16(RS2_H(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_b.h
new file mode 100644
index 000000000..a581d0162
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_b.h
@@ -0,0 +1,6 @@
+// Accumulates zext8(RS1_B(k)) * zext8(RS2_B(0)) over all 8-bit lanes; RS2_B(0) is the common factor.
+uint32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += zext8(RS1_B(k)) * zext8(RS2_B(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_h.h
new file mode 100644
index 000000000..b78762a87
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sc_h.h
@@ -0,0 +1,6 @@
+// Accumulates zext16(RS1_H(k)) * zext16(RS2_H(0)) over all 16-bit lanes; RS2_H(0) is the common factor.
+uint32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += zext16(RS1_H(k)) * zext16(RS2_H(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_b.h
new file mode 100644
index 000000000..0dedb1caf
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_b.h
@@ -0,0 +1,6 @@
+// Accumulates zext8(RS1_B(k)) * insn.p_zimm6() over all 8-bit lanes and writes the sum to RD.
+uint32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += zext8(RS1_B(k)) * insn.p_zimm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_h.h
new file mode 100644
index 000000000..64a36d569
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotup_sci_h.h
@@ -0,0 +1,6 @@
+// Accumulates zext16(RS1_H(k)) * insn.p_zimm6() over all 16-bit lanes and writes the sum to RD.
+uint32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += zext16(RS1_H(k)) * insn.p_zimm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_b.h
new file mode 100644
index 000000000..1cdfc2f2c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_b.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext8(RS1_B(k))) * sext8(RS2_B(k)) over all 8-bit lanes (RS1 via zext8, RS2 via sext8).
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sreg_t(zext8(RS1_B(k))) * sext8(RS2_B(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_h.h
new file mode 100644
index 000000000..81968a14d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_h.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext16(RS1_H(k))) * sext16(RS2_H(k)) over all 16-bit lanes (RS1 via zext16, RS2 via sext16).
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sreg_t(zext16(RS1_H(k))) * sext16(RS2_H(k));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_b.h
new file mode 100644
index 000000000..d562a7d4d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_b.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext8(RS1_B(k))) * sext8(RS2_B(0)) over all 8-bit lanes; RS2_B(0) is the common factor.
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sreg_t(zext8(RS1_B(k))) * sext8(RS2_B(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_h.h
new file mode 100644
index 000000000..3815c3721
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sc_h.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext16(RS1_H(k))) * sext16(RS2_H(0)) over all 16-bit lanes; RS2_H(0) is the common factor.
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sreg_t(zext16(RS1_H(k))) * sext16(RS2_H(0));
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_b.h
new file mode 100644
index 000000000..92c229540
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_b.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext8(RS1_B(k))) * insn.p_simm6() over all 8-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/8 - 1; k >= 0; --k){
+  dot += sreg_t(zext8(RS1_B(k))) * insn.p_simm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_h.h
new file mode 100644
index 000000000..8f91a89a3
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_dotusp_sci_h.h
@@ -0,0 +1,6 @@
+// Accumulates sreg_t(zext16(RS1_H(k))) * insn.p_simm6() over all 16-bit lanes and writes the sum to RD.
+int32_t dot = 0;
+for(int k = xlen/16 - 1; k >= 0; --k){
+  dot += sreg_t(zext16(RS1_H(k))) * insn.p_simm6();
+}
+WRITE_RD(sext_xlen(dot));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_extract_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_extract_b.h
new file mode 100644
index 000000000..fce80bbb6
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_extract_b.h
@@ -0,0 +1 @@
+WRITE_RD(sext8(RS1_B(insn.p_zimm6() & 0x03)));  // lane index = low 2 bits of p_zimm6(); selected RS1_B value goes through sext8 into RD
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_extract_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_extract_h.h
new file mode 100644
index 000000000..ee35393d4
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_extract_h.h
@@ -0,0 +1 @@
+WRITE_RD(sext16(RS1_H(insn.p_zimm6() & 0x01)));  // lane index = low bit of p_zimm6(); selected RS1_H value goes through sext16 into RD
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_b.h
new file mode 100644
index 000000000..c24023387
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_b.h
@@ -0,0 +1 @@
+WRITE_RD(zext8(RS1_B(insn.p_zimm6() & 0x03)));  // lane index = low 2 bits of p_zimm6(); selected RS1_B value goes through zext8 into RD
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_h.h
new file mode 100644
index 000000000..90b679afd
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_extractu_h.h
@@ -0,0 +1 @@
+WRITE_RD(zext16(RS1_H(insn.p_zimm6() & 0x01)));  // lane index = low bit of p_zimm6(); selected RS1_H value goes through zext16 into RD
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_insert_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_insert_b.h
new file mode 100644
index 000000000..5575e7967
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_insert_b.h
@@ -0,0 +1,6 @@
+uint32_t ins_rd = RD;
+uint8_t i = insn.p_zimm6() & 0x03; /* select which rd byte lane (0-3) receives the 8-bit value */
+
+ins_rd = (ins_rd & ~(0xFF << ((xlen >> 2) * i))) | ((RS1_H(0) & 0xFF) << ((xlen >> 2) * i));
+
+WRITE_RD(sext_xlen(ins_rd));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_insert_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_insert_h.h
new file mode 100644
index 000000000..eccb0eda6
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_insert_h.h
@@ -0,0 +1,6 @@
+uint32_t ins_rd = RD;
+uint8_t i = insn.p_zimm6() & 0x01; /* select which rd half (0 or 1) receives the 16-bit value */
+
+ins_rd = (ins_rd & ~(0xFFFF << ((xlen >> 1) * i))) | ((RS1_H(0) & 0xFFFF) << ((xlen >> 1) * i));
+
+WRITE_RD(sext_xlen(ins_rd));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_b.h
new file mode 100644
index 000000000..4dc3e6be8
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is greater than RS2_B(k) when both are compared through sext8, else RS2_B(k).
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) > sext8(RS2_B(k)) ? RS1_B(k) : RS2_B(k);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_h.h
new file mode 100644
index 000000000..c65a32da6
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is greater than RS2_H(k) when both are compared through sext16, else RS2_H(k).
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) > sext16(RS2_H(k)) ? RS1_H(k) : RS2_H(k);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_b.h
new file mode 100644
index 000000000..896087f62
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is greater than RS2_B(0) when both are compared through sext8, else RS2_B(0).
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) > sext8(RS2_B(0)) ? RS1_B(k) : RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_h.h
new file mode 100644
index 000000000..fd55fb49b
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is greater than RS2_H(0) when both are compared through sext16, else RS2_H(0).
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) > sext16(RS2_H(0)) ? RS1_H(k) : RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_b.h
new file mode 100644
index 000000000..5e06669fa
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if sext8(RS1_B(k)) is greater than insn.p_simm6(), else the immediate.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) > insn.p_simm6() ? RS1_B(k) : insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_h.h
new file mode 100644
index 000000000..ce1df2ee1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_max_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if sext16(RS1_H(k)) is greater than insn.p_simm6(), else the immediate.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) > insn.p_simm6() ? RS1_H(k) : insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_b.h
new file mode 100644
index 000000000..5821c1726
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is greater than RS2_B(k) when both are compared through zext8, else RS2_B(k).
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(RS1_B(k)) > zext8(RS2_B(k)) ? RS1_B(k) : RS2_B(k);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_h.h
new file mode 100644
index 000000000..3e587c3c9
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is greater than RS2_H(k) when both are compared through zext16, else RS2_H(k).
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(RS1_H(k)) > zext16(RS2_H(k)) ? RS1_H(k) : RS2_H(k);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_b.h
new file mode 100644
index 000000000..c297b87ab
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is greater than RS2_B(0) when both are compared through zext8, else RS2_B(0).
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(RS1_B(k)) > zext8(RS2_B(0)) ? RS1_B(k) : RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_h.h
new file mode 100644
index 000000000..fbb5c7feb
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is greater than RS2_H(0) when both are compared through zext16, else RS2_H(0).
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(RS1_H(k)) > zext16(RS2_H(0)) ? RS1_H(k) : RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_b.h
new file mode 100644
index 000000000..ab5f6e5f9
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_b.h
@@ -0,0 +1,10 @@
+// Per 8-bit lane: RS1_B(k) if zext8(RS1_B(k)) is greater than insn.p_zimm6(), else the immediate.
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(RS1_B(k)) > insn.p_zimm6() ? RS1_B(k) : insn.p_zimm6();
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
+
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_h.h
new file mode 100644
index 000000000..9aaf9effc
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_maxu_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if zext16(RS1_H(k)) is greater than insn.p_zimm6(), else the immediate.
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(RS1_H(k)) > insn.p_zimm6() ? RS1_H(k) : insn.p_zimm6();
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_b.h
new file mode 100644
index 000000000..1b9104b55
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is <= RS2_B(k) when both are compared through sext8, else RS2_B(k).
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) <= sext8(RS2_B(k)) ? RS1_B(k) : RS2_B(k);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_h.h
new file mode 100644
index 000000000..bbc83caea
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is <= RS2_H(k) when both are compared through sext16, else RS2_H(k).
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) <= sext16(RS2_H(k)) ? RS1_H(k) : RS2_H(k);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_b.h
new file mode 100644
index 000000000..1d2aac507
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is <= RS2_B(0) when both are compared through sext8, else RS2_B(0).
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) <= sext8(RS2_B(0)) ? RS1_B(k) : RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_h.h
new file mode 100644
index 000000000..b2b8ab110
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is <= RS2_H(0) when both are compared through sext16, else RS2_H(0).
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) <= sext16(RS2_H(0)) ? RS1_H(k) : RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_b.h
new file mode 100644
index 000000000..031b51f36
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if sext8(RS1_B(k)) is <= insn.p_simm6(), else the immediate.
+uint32_t packed = 0;
+int8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = sext8(RS1_B(k)) <= insn.p_simm6() ? RS1_B(k) : insn.p_simm6();
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_h.h
new file mode 100644
index 000000000..d007e0662
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_min_sci_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if sext16(RS1_H(k)) is <= insn.p_simm6(), else the immediate.
+uint32_t packed = 0;
+int16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = sext16(RS1_H(k)) <= insn.p_simm6() ? RS1_H(k) : insn.p_simm6();
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_b.h
new file mode 100644
index 000000000..bbb92ca55
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is <= RS2_B(k) when both are compared through zext8, else RS2_B(k).
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(RS1_B(k)) <= zext8(RS2_B(k)) ? RS1_B(k) : RS2_B(k);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_h.h
new file mode 100644
index 000000000..fa7b0a4e2
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is <= RS2_H(k) when both are compared through zext16, else RS2_H(k).
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(RS1_H(k)) <= zext16(RS2_H(k)) ? RS1_H(k) : RS2_H(k);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_b.h
new file mode 100644
index 000000000..566bcce6d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_b.h
@@ -0,0 +1,9 @@
+// Per 8-bit lane: RS1_B(k) if it is <= RS2_B(0) when both are compared through zext8, else RS2_B(0).
+uint32_t packed = 0;
+uint8_t lane;
+
+for(int k = xlen/8 - 1; k >= 0; --k){
+  lane = zext8(RS1_B(k)) <= zext8(RS2_B(0)) ? RS1_B(k) : RS2_B(0);
+  packed = (packed << 8) | ((uint32_t)lane & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_h.h
new file mode 100644
index 000000000..7471d9678
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sc_h.h
@@ -0,0 +1,9 @@
+// Per 16-bit lane: RS1_H(k) if it is <= RS2_H(0) when both are compared through zext16, else RS2_H(0).
+uint32_t packed = 0;
+uint16_t lane;
+
+for(int k = xlen/16 - 1; k >= 0; --k){
+  lane = zext16(RS1_H(k)) <= zext16(RS2_H(0)) ? RS1_H(k) : RS2_H(0);
+  packed = (packed << 16) | ((uint32_t)lane & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_b.h
new file mode 100644
index 000000000..75c43787c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_b.h
@@ -0,0 +1,9 @@
+// pv_minu_sci_b: per byte lane, take the smaller of the zext8'd rs1 byte and insn.p_zimm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) <= insn.p_zimm6() ? RS1_B(lane) : insn.p_zimm6();
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_h.h
new file mode 100644
index 000000000..c665e92f4
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_minu_sci_h.h
@@ -0,0 +1,9 @@
+// pv_minu_sci_h: per halfword lane, take the smaller of the zext16'd rs1 half and insn.p_zimm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) <= insn.p_zimm6() ? RS1_H(lane) : insn.p_zimm6();
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_b.h
new file mode 100644
index 000000000..d27a6e5d1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_b.h
@@ -0,0 +1,9 @@
+// pv_or_b: bitwise OR of matching byte lanes of rs1 and rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) | RS2_B(lane);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_h.h
new file mode 100644
index 000000000..65b112893
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_h.h
@@ -0,0 +1,9 @@
+// pv_or_h: bitwise OR of matching halfword lanes of rs1 and rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) | RS2_H(lane);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_b.h
new file mode 100644
index 000000000..cac508744
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_b.h
@@ -0,0 +1,9 @@
+// pv_or_sc_b: bitwise OR of every rs1 byte lane with byte 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) | RS2_B(0);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_h.h
new file mode 100644
index 000000000..e6f567cf3
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sc_h.h
@@ -0,0 +1,9 @@
+// pv_or_sc_h: bitwise OR of every rs1 halfword lane with halfword 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) | RS2_H(0);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_b.h
new file mode 100644
index 000000000..0cb7b5cb6
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_b.h
@@ -0,0 +1,9 @@
+// pv_or_sci_b: bitwise OR of every rs1 byte lane with insn.p_simm6() (result kept to 8 bits).
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) | insn.p_simm6();
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_h.h
new file mode 100644
index 000000000..e95922e1e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_or_sci_h.h
@@ -0,0 +1,9 @@
+// pv_or_sci_h: bitwise OR of every rs1 halfword lane with insn.p_simm6() (result kept to 16 bits).
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) | insn.p_simm6();
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_b.h
new file mode 100644
index 000000000..812e3d436
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_b.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_b: RD plus the sum over byte lanes of sext8(rs1 byte) * sext8(rs2 byte).
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sext8(RS1_B(lane)) * sext8(RS2_B(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_h.h
new file mode 100644
index 000000000..9ccfae939
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_h.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_h: RD plus the sum over halfword lanes of sext16(rs1 half) * sext16(rs2 half).
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sext16(RS1_H(lane)) * sext16(RS2_H(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_b.h
new file mode 100644
index 000000000..e665a669f
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_b.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_sc_b: RD plus the sum over byte lanes of sext8(rs1 byte) * sext8(byte 0 of rs2).
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sext8(RS1_B(lane)) * sext8(RS2_B(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_h.h
new file mode 100644
index 000000000..fa1ca93fe
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sc_h.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_sc_h: RD plus the sum over halfword lanes of sext16(rs1 half) * sext16(half 0 of rs2).
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sext16(RS1_H(lane)) * sext16(RS2_H(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_b.h
new file mode 100644
index 000000000..31aab1fe5
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_b.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_sci_b: RD plus the sum over byte lanes of sext8(rs1 byte) * insn.p_simm6().
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sext8(RS1_B(lane)) * insn.p_simm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_h.h
new file mode 100644
index 000000000..151d16a2e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotsp_sci_h.h
@@ -0,0 +1,6 @@
+// pv_sdotsp_sci_h: RD plus the sum over halfword lanes of sext16(rs1 half) * insn.p_simm6().
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sext16(RS1_H(lane)) * insn.p_simm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_b.h
new file mode 100644
index 000000000..82e47b4f8
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_b.h
@@ -0,0 +1,6 @@
+// pv_sdotup_b: RD plus the sum over byte lanes of zext8(rs1 byte) * zext8(rs2 byte).
+uint32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += zext8(RS1_B(lane)) * zext8(RS2_B(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_h.h
new file mode 100644
index 000000000..de77009a0
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_h.h
@@ -0,0 +1,6 @@
+// pv_sdotup_h: RD plus the sum over halfword lanes of zext16(rs1 half) * zext16(rs2 half).
+uint32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += zext16(RS1_H(lane)) * zext16(RS2_H(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_b.h
new file mode 100644
index 000000000..717fffc11
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_b.h
@@ -0,0 +1,6 @@
+// pv_sdotup_sc_b: RD plus the sum over byte lanes of zext8(rs1 byte) * zext8(byte 0 of rs2).
+uint32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += zext8(RS1_B(lane)) * zext8(RS2_B(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_h.h
new file mode 100644
index 000000000..ecf048566
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sc_h.h
@@ -0,0 +1,6 @@
+// pv_sdotup_sc_h: RD plus the sum over halfword lanes of zext16(rs1 half) * zext16(half 0 of rs2).
+uint32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += zext16(RS1_H(lane)) * zext16(RS2_H(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_b.h
new file mode 100644
index 000000000..bd4d850e6
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_b.h
@@ -0,0 +1,6 @@
+// pv_sdotup_sci_b: RD plus the sum over byte lanes of zext8(rs1 byte) * insn.p_zimm6().
+uint32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += zext8(RS1_B(lane)) * insn.p_zimm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_h.h
new file mode 100644
index 000000000..145e73717
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotup_sci_h.h
@@ -0,0 +1,6 @@
+// pv_sdotup_sci_h: RD plus the sum over halfword lanes of zext16(rs1 half) * insn.p_zimm6().
+uint32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += zext16(RS1_H(lane)) * insn.p_zimm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_b.h
new file mode 100644
index 000000000..05d268ed2
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_b.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_b: RD plus the sum over byte lanes of sreg_t(zext8(rs1 byte)) * sext8(rs2 byte).
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sreg_t(zext8(RS1_B(lane))) * sext8(RS2_B(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_h.h
new file mode 100644
index 000000000..fdc550db1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_h.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_h: RD plus the sum over halfword lanes of sreg_t(zext16(rs1 half)) * sext16(rs2 half).
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sreg_t(zext16(RS1_H(lane))) * sext16(RS2_H(lane));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_b.h
new file mode 100644
index 000000000..2840cd148
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_b.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_sc_b: RD plus the sum over byte lanes of sreg_t(zext8(rs1 byte)) * sext8(byte 0 of rs2).
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sreg_t(zext8(RS1_B(lane))) * sext8(RS2_B(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_h.h
new file mode 100644
index 000000000..ca4c25ac1
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sc_h.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_sc_h: RD plus the sum over halfword lanes of sreg_t(zext16(rs1 half)) * sext16(half 0 of rs2).
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sreg_t(zext16(RS1_H(lane))) * sext16(RS2_H(0));
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_b.h
new file mode 100644
index 000000000..d6823f83a
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_b.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_sci_b: RD plus the sum over byte lanes of sreg_t(zext8(rs1 byte)) * insn.p_simm6().
+int32_t total = RD;
+
+for(int lane = xlen/8; lane-- > 0; )
+  total += sreg_t(zext8(RS1_B(lane))) * insn.p_simm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_h.h
new file mode 100644
index 000000000..42c4fbe88
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sdotusp_sci_h.h
@@ -0,0 +1,6 @@
+// pv_sdotusp_sci_h: RD plus the sum over halfword lanes of sreg_t(zext16(rs1 half)) * insn.p_simm6().
+int32_t total = RD;
+
+for(int lane = xlen/16; lane-- > 0; )
+  total += sreg_t(zext16(RS1_H(lane))) * insn.p_simm6();
+WRITE_RD(sext_xlen(total));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_b.h
new file mode 100644
index 000000000..8dd4e9994
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_b.h
@@ -0,0 +1,14 @@
+// pv_shuffle2_b: build each result byte by picking a byte from rs1 or from the
+// current rd, as directed by the matching byte of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  // bits [1:0] of RS2_B(lane): which byte of the chosen source to take
+  const uint8_t pick = RS2_B(lane) & 0x03;
+  // bit [2] of RS2_B(lane): 1 takes the byte from rs1, 0 takes it from rd
+  const uint8_t from_rs1 = (RS2_B(lane) >> 2) & 0x01;
+  const uint8_t chosen = from_rs1 ? RS1_B(pick) : RD_B(pick);
+  packed = (packed << 8) | ((uint32_t)chosen & 0x000000FF);
+}
+
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_h.h
new file mode 100644
index 000000000..362a4bdc7
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_shuffle2_h.h
@@ -0,0 +1,14 @@
+// pv_shuffle2_h: build each result halfword by picking a halfword from rs1 or
+// from the current rd, as directed by the matching halfword of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  // bit [0] of RS2_H(lane): which halfword of the chosen source to take
+  const uint8_t pick = RS2_H(lane) & 0x01;
+  // bit [1] of RS2_H(lane): 1 takes the halfword from rs1, 0 takes it from rd
+  const uint8_t from_rs1 = (RS2_H(lane) >> 1) & 0x01;
+  const uint16_t chosen = from_rs1 ? RS1_H(pick) : RD_H(pick);
+  packed = (packed << 16) | ((uint32_t)chosen & 0x0000FFFF);
+}
+
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_b.h
new file mode 100644
index 000000000..ca8bcd688
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_b.h
@@ -0,0 +1,9 @@
+// pv_sll_b: shift each rs1 byte lane left by the low 3 bits of the matching rs2 byte.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) << (zext8(RS2_B(lane)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_h.h
new file mode 100644
index 000000000..cb9200cac
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_h.h
@@ -0,0 +1,9 @@
+// pv_sll_h: shift each rs1 halfword lane left by the low 4 bits of the matching rs2 halfword.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) << (zext16(RS2_H(lane)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_b.h
new file mode 100644
index 000000000..d32051998
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_b.h
@@ -0,0 +1,9 @@
+// pv_sll_sc_b: shift each rs1 byte lane left by the low 3 bits of byte 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) << (zext8(RS2_B(0)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_h.h
new file mode 100644
index 000000000..e84cf0214
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sc_h.h
@@ -0,0 +1,9 @@
+// pv_sll_sc_h: shift each rs1 halfword lane left by the low 4 bits of halfword 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) << (zext16(RS2_H(0)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_b.h
new file mode 100644
index 000000000..8e637bea8
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_b.h
@@ -0,0 +1,9 @@
+// pv_sll_sci_b: shift each rs1 byte lane left by the low 3 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) << (insn.p_simm6() & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_h.h
new file mode 100644
index 000000000..ec94a2e28
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sll_sci_h.h
@@ -0,0 +1,9 @@
+// pv_sll_sci_h: shift each rs1 halfword lane left by the low 4 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) << (insn.p_simm6() & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_b.h
new file mode 100644
index 000000000..9525a0afc
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_b.h
@@ -0,0 +1,9 @@
+// pv_sra_b: shift each sign-extended rs1 byte lane right by the low 3 bits of the matching rs2 byte.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) >> (zext8(RS2_B(lane)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_h.h
new file mode 100644
index 000000000..b3e8a0b94
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_h.h
@@ -0,0 +1,9 @@
+// pv_sra_h: shift each sign-extended rs1 halfword lane right by the low 4 bits of the matching rs2 halfword.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) >> (zext16(RS2_H(lane)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_b.h
new file mode 100644
index 000000000..9442d9280
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_b.h
@@ -0,0 +1,9 @@
+// pv_sra_sc_b: shift each sign-extended rs1 byte lane right by the low 3 bits of byte 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) >> (zext8(RS2_B(0)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_h.h
new file mode 100644
index 000000000..1e012f750
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sc_h.h
@@ -0,0 +1,9 @@
+// pv_sra_sc_h: shift each sign-extended rs1 halfword lane right by the low 4 bits of halfword 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) >> (zext16(RS2_H(0)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_b.h
new file mode 100644
index 000000000..3dafb3cb5
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_b.h
@@ -0,0 +1,9 @@
+// pv_sra_sci_b: shift each sign-extended rs1 byte lane right by the low 3 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) >> (insn.p_simm6() & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_h.h
new file mode 100644
index 000000000..4f56d0e5e
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sra_sci_h.h
@@ -0,0 +1,9 @@
+// pv_sra_sci_h: shift each sign-extended rs1 halfword lane right by the low 4 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) >> (insn.p_simm6() & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_b.h
new file mode 100644
index 000000000..37be2e23a
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_b.h
@@ -0,0 +1,9 @@
+// pv_srl_b: shift each zero-extended rs1 byte lane right by the low 3 bits of the matching rs2 byte.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) >> (zext8(RS2_B(lane)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_h.h
new file mode 100644
index 000000000..1b35116d3
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_h.h
@@ -0,0 +1,9 @@
+// pv_srl_h: shift each zero-extended rs1 halfword lane right by the low 4 bits of the matching rs2 halfword.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) >> (zext16(RS2_H(lane)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_b.h
new file mode 100644
index 000000000..4b04ab6f7
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_b.h
@@ -0,0 +1,9 @@
+// pv_srl_sc_b: shift each zero-extended rs1 byte lane right by the low 3 bits of byte 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) >> (zext8(RS2_B(0)) & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_h.h
new file mode 100644
index 000000000..f49f784db
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sc_h.h
@@ -0,0 +1,9 @@
+// pv_srl_sc_h: shift each zero-extended rs1 halfword lane right by the low 4 bits of halfword 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) >> (zext16(RS2_H(0)) & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_b.h
new file mode 100644
index 000000000..b0b38f2a9
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_b.h
@@ -0,0 +1,9 @@
+// pv_srl_sci_b: shift each zero-extended rs1 byte lane right by the low 3 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = zext8(RS1_B(lane)) >> (insn.p_simm6() & 0x07);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_h.h
new file mode 100644
index 000000000..5aba29cc9
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_srl_sci_h.h
@@ -0,0 +1,9 @@
+// pv_srl_sci_h: shift each zero-extended rs1 halfword lane right by the low 4 bits of insn.p_simm6().
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = zext16(RS1_H(lane)) >> (insn.p_simm6() & 0x0F);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_b.h
new file mode 100644
index 000000000..2ce1fe224
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_b.h
@@ -0,0 +1,9 @@
+// pv_sub_b: per byte lane, sext8(rs1 byte) minus sext8(rs2 byte), truncated to 8 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) - sext8(RS2_B(lane));
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_h.h
new file mode 100644
index 000000000..4ec513726
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_h.h
@@ -0,0 +1,9 @@
+// pv_sub_h: per halfword lane, sext16(rs1 half) minus sext16(rs2 half), truncated to 16 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) - sext16(RS2_H(lane));
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_b.h
new file mode 100644
index 000000000..3375e64c2
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_b.h
@@ -0,0 +1,9 @@
+// pv_sub_sc_b: per byte lane, sext8(rs1 byte) minus sext8(byte 0 of rs2), truncated to 8 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) - sext8(RS2_B(0));
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_h.h
new file mode 100644
index 000000000..4bb12839c
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sc_h.h
@@ -0,0 +1,9 @@
+// pv_sub_sc_h: per halfword lane, sext16(rs1 half) minus sext16(half 0 of rs2), truncated to 16 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) - sext16(RS2_H(0));
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_b.h
new file mode 100644
index 000000000..20cc94123
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_b.h
@@ -0,0 +1,9 @@
+// pv_sub_sci_b: per byte lane, sext8(rs1 byte) minus insn.p_simm6(), truncated to 8 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const int8_t res = sext8(RS1_B(lane)) - insn.p_simm6();
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_h.h
new file mode 100644
index 000000000..50b11a665
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_sub_sci_h.h
@@ -0,0 +1,9 @@
+// pv_sub_sci_h: per halfword lane, sext16(rs1 half) minus insn.p_simm6(), truncated to 16 bits.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const int16_t res = sext16(RS1_H(lane)) - insn.p_simm6();
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_b.h
new file mode 100644
index 000000000..2fc203b4d
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_b.h
@@ -0,0 +1,9 @@
+// pv_xor_b: bitwise XOR of matching byte lanes of rs1 and rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) ^ RS2_B(lane);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_h.h
new file mode 100644
index 000000000..56cf0b7c9
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_h.h
@@ -0,0 +1,9 @@
+// pv_xor_h: bitwise XOR of matching halfword lanes of rs1 and rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) ^ RS2_H(lane);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_b.h
new file mode 100644
index 000000000..ed3d5075a
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_b.h
@@ -0,0 +1,9 @@
+// pv_xor_sc_b: bitwise XOR of every rs1 byte lane with byte 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) ^ RS2_B(0);
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_h.h
new file mode 100644
index 000000000..9d632f367
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sc_h.h
@@ -0,0 +1,9 @@
+// pv_xor_sc_h: bitwise XOR of every rs1 halfword lane with halfword 0 of rs2.
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) ^ RS2_H(0);
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_b.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_b.h
new file mode 100644
index 000000000..7ecbf94fc
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_b.h
@@ -0,0 +1,9 @@
+// pv_xor_sci_b: bitwise XOR of every rs1 byte lane with insn.p_simm6() (result kept to 8 bits).
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/8; lane-- > 0; ){
+  const uint8_t res = RS1_B(lane) ^ insn.p_simm6();
+  // move earlier lanes up by one byte and drop this lane into the low byte
+  packed = (packed << 8) | ((uint32_t)res & 0x000000FF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_h.h b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_h.h
new file mode 100644
index 000000000..0a02ced60
--- /dev/null
+++ b/toolchain/riscv-isa-sim/riscv/insns/pv_xor_sci_h.h
@@ -0,0 +1,9 @@
+// pv_xor_sci_h: bitwise XOR of every rs1 halfword lane with insn.p_simm6() (result kept to 16 bits).
+uint32_t packed = 0; // lanes are pushed in from the highest index down to lane 0
+
+for(int lane = xlen/16; lane-- > 0; ){
+  const uint16_t res = RS1_H(lane) ^ insn.p_simm6();
+  // move earlier lanes up by one halfword and drop this lane into the low half
+  packed = (packed << 16) | ((uint32_t)res & 0x0000FFFF);
+}
+WRITE_RD(sext_xlen(packed));
diff --git a/toolchain/riscv-isa-sim/riscv/riscv.mk.in b/toolchain/riscv-isa-sim/riscv/riscv.mk.in
index af69ae8f0..20c11ab2a 100644
--- a/toolchain/riscv-isa-sim/riscv/riscv.mk.in
+++ b/toolchain/riscv-isa-sim/riscv/riscv.mk.in
@@ -333,14 +333,23 @@ riscv_insn_ext_q = \
 	fsqrt_q \
 	fsub_q \
 
+# Disabled riscv_insn_ext_v_alu_int instructions for opcode overlap:
+#vasubu_vx
+#vslide1up_vx
+#vaaddu_vx
+#vadc_vvm
+#vadc_vxm
+#vsbc_vvm
+#vsbc_vxm
+#vmulhu_vx
+#vdivu_vx
+#vmulhsu_vx
+
 riscv_insn_ext_v_alu_int = \
 	vaadd_vv \
 	vaaddu_vv \
 	vaadd_vx \
-	vaaddu_vx \
 	vadc_vim \
-	vadc_vvm \
-	vadc_vxm \
 	vadd_vi \
 	vadd_vv \
 	vadd_vx \
@@ -350,12 +359,10 @@ riscv_insn_ext_v_alu_int = \
 	vasub_vv \
 	vasubu_vv \
 	vasub_vx \
-	vasubu_vx \
 	vcompress_vm \
 	vdiv_vv \
 	vdiv_vx \
 	vdivu_vv \
-	vdivu_vx \
 	vdot_vv \
 	vdotu_vv \
 	vid_v \
@@ -416,9 +423,7 @@ riscv_insn_ext_v_alu_int = \
 	vmulh_vv \
 	vmulh_vx \
 	vmulhsu_vv \
-	vmulhsu_vx \
 	vmulhu_vv \
-	vmulhu_vx \
 	vmv_s_x \
 	vmv_v_i \
 	vmv_v_v \
@@ -473,13 +478,10 @@ riscv_insn_ext_v_alu_int = \
 	vsaddu_vi \
 	vsaddu_vv \
 	vsaddu_vx \
-	vsbc_vvm \
-	vsbc_vxm \
 	vsext_vf2 \
 	vsext_vf4 \
 	vsext_vf8 \
 	vslide1down_vx \
-	vslide1up_vx \
 	vslidedown_vi \
 	vslidedown_vx \
 	vslideup_vi \
@@ -545,6 +547,9 @@ riscv_insn_ext_v_alu_int = \
 	vzext_vf4 \
 	vzext_vf8 \
 
+# Disabled riscv_insn_ext_v_alu_fp instructions for opcode overlap:
+#vfcvt_x_f_v
+
 riscv_insn_ext_v_alu_fp = \
 	vfadd_vf \
 	vfadd_vv \
@@ -553,7 +558,6 @@ riscv_insn_ext_v_alu_fp = \
 	vfcvt_f_xu_v \
 	vfcvt_rtz_x_f_v \
 	vfcvt_rtz_xu_f_v \
-	vfcvt_x_f_v \
 	vfcvt_xu_f_v \
 	vfdiv_vf \
 	vfdiv_vv \
@@ -741,9 +745,11 @@ riscv_insn_ext_v_ldst = \
 	vs4r_v \
 	vs8r_v \
 
+# Disabled riscv_insn_ext_v_ctrl instructions for opcode overlap:
+#vsetvl
+
 riscv_insn_ext_v_ctrl = \
 	vsetvli \
-	vsetvl \
 
 riscv_insn_ext_v = \
 	$(riscv_insn_ext_v_alu_fp) \
@@ -753,6 +759,30 @@ riscv_insn_ext_v = \
 	$(riscv_insn_ext_v_ldst) \
 
 riscv_insn_ext_xpulpimg = \
+	p_lb_irpost \
+	p_lbu_irpost \
+	p_lh_irpost \
+	p_lhu_irpost \
+	p_lw_irpost \
+	p_lb_rrpost \
+	p_lbu_rrpost \
+	p_lh_rrpost \
+	p_lhu_rrpost \
+	p_lw_rrpost \
+	p_lb_rr \
+	p_lbu_rr \
+	p_lh_rr \
+	p_lhu_rr \
+	p_lw_rr \
+	p_sb_irpost \
+	p_sh_irpost \
+	p_sw_irpost \
+	p_sb_rrpost \
+	p_sh_rrpost \
+	p_sw_rrpost \
+	p_sb_rr \
+	p_sh_rr \
+	p_sw_rr \
 	p_abs \
 	p_slet \
 	p_sletu \
@@ -770,6 +800,138 @@ riscv_insn_ext_xpulpimg = \
 	p_clipur \
 	p_beqimm \
 	p_bneimm \
+	p_mac \
+	p_msu \
+	pv_add_h \
+	pv_add_sc_h \
+	pv_add_sci_h \
+	pv_add_b \
+	pv_add_sc_b \
+	pv_add_sci_b \
+	pv_sub_h \
+	pv_sub_sc_h \
+	pv_sub_sci_h \
+	pv_sub_b \
+	pv_sub_sc_b \
+	pv_sub_sci_b \
+	pv_avg_h \
+	pv_avg_sc_h \
+	pv_avg_sci_h \
+	pv_avg_b \
+	pv_avg_sc_b \
+	pv_avg_sci_b \
+	pv_avgu_h \
+	pv_avgu_sc_h \
+	pv_avgu_sci_h \
+	pv_avgu_b \
+	pv_avgu_sc_b \
+	pv_avgu_sci_b \
+	pv_min_h \
+	pv_min_sc_h \
+	pv_min_sci_h \
+	pv_min_b \
+	pv_min_sc_b \
+	pv_min_sci_b \
+	pv_minu_h \
+	pv_minu_sc_h \
+	pv_minu_sci_h \
+	pv_minu_b \
+	pv_minu_sc_b \
+	pv_minu_sci_b \
+	pv_max_h \
+	pv_max_sc_h \
+	pv_max_sci_h \
+	pv_max_b \
+	pv_max_sc_b \
+	pv_max_sci_b \
+	pv_maxu_h \
+	pv_maxu_sc_h \
+	pv_maxu_sci_h \
+	pv_maxu_b \
+	pv_maxu_sc_b \
+	pv_maxu_sci_b \
+	pv_srl_h \
+	pv_srl_sc_h \
+	pv_srl_sci_h \
+	pv_srl_b \
+	pv_srl_sc_b \
+	pv_srl_sci_b \
+	pv_sra_h \
+	pv_sra_sc_h \
+	pv_sra_sci_h \
+	pv_sra_b \
+	pv_sra_sc_b \
+	pv_sra_sci_b \
+	pv_sll_h \
+	pv_sll_sc_h \
+	pv_sll_sci_h \
+	pv_sll_b \
+	pv_sll_sc_b \
+	pv_sll_sci_b \
+	pv_or_h \
+	pv_or_sc_h \
+	pv_or_sci_h \
+	pv_or_b \
+	pv_or_sc_b \
+	pv_or_sci_b \
+	pv_xor_h \
+	pv_xor_sc_h \
+	pv_xor_sci_h \
+	pv_xor_b \
+	pv_xor_sc_b \
+	pv_xor_sci_b \
+	pv_and_h \
+	pv_and_sc_h \
+	pv_and_sci_h \
+	pv_and_b \
+	pv_and_sc_b \
+	pv_and_sci_b \
+	pv_abs_h \
+	pv_abs_b \
+	pv_extract_h \
+	pv_extract_b \
+	pv_extractu_h \
+	pv_extractu_b \
+	pv_insert_h \
+	pv_insert_b \
+	pv_dotup_h \
+	pv_dotup_sc_h \
+	pv_dotup_sci_h \
+	pv_dotup_b \
+	pv_dotup_sc_b \
+	pv_dotup_sci_b \
+	pv_dotusp_h \
+	pv_dotusp_sc_h \
+	pv_dotusp_sci_h \
+	pv_dotusp_b \
+	pv_dotusp_sc_b \
+	pv_dotusp_sci_b \
+	pv_dotsp_h \
+	pv_dotsp_sc_h \
+	pv_dotsp_sci_h \
+	pv_dotsp_b \
+	pv_dotsp_sc_b \
+	pv_dotsp_sci_b \
+	pv_sdotup_h \
+	pv_sdotup_sc_h \
+	pv_sdotup_sci_h \
+	pv_sdotup_b \
+	pv_sdotup_sc_b \
+	pv_sdotup_sci_b \
+	pv_sdotusp_h \
+	pv_sdotusp_sc_h \
+	pv_sdotusp_sci_h \
+	pv_sdotusp_b \
+	pv_sdotusp_sc_b \
+	pv_sdotusp_sci_b \
+	pv_sdotsp_h \
+	pv_sdotsp_sc_h \
+	pv_sdotsp_sci_h \
+	pv_sdotsp_b \
+	pv_sdotsp_sc_b \
+	pv_sdotsp_sci_b \
+	pv_shuffle2_h \
+	pv_shuffle2_b \
 
 riscv_insn_ext_h = \
 	hfence_gvma \
diff --git a/toolchain/riscv-opcodes/Makefile b/toolchain/riscv-opcodes/Makefile
index 5f39502b0..12d02b4aa 100644
--- a/toolchain/riscv-opcodes/Makefile
+++ b/toolchain/riscv-opcodes/Makefile
@@ -7,7 +7,7 @@ MY_OPCODES := opcodes-frep_CUSTOM opcodes-xpulpimg_CUSTOM opcodes-rv32d-zfh_DRAF
 
 ALL_OPCODES := opcodes-pseudo $(ALL_REAL_OPCODES) $(MY_OPCODES) opcodes-rvv-pseudo
 # Opcodes to be discarded
-DISCARDED_OPCODES :=
+DISCARDED_OPCODES := opcodes-frep_CUSTOM
 
 OPCODES = $(filter-out $(sort $(DISCARDED_OPCODES)), $(sort $(ALL_OPCODES)))
 
diff --git a/toolchain/riscv-opcodes/README.md b/toolchain/riscv-opcodes/README.md
index f3ac222ac..4b8110225 100644
--- a/toolchain/riscv-opcodes/README.md
+++ b/toolchain/riscv-opcodes/README.md
@@ -32,4 +32,8 @@ starting from their high-level, human-readable description.
   for the parsing script execution, basing on the target architecture, by
   listing them in the variable `DISCARDED_OPCODES`;
 - opcodes files from the official 128-bit extension have not been introduced
-  due to the other changes which they imply to other opcodes specifications.
+  due to the other changes which they imply to other opcodes specifications;
+- some of the instructions originally declared in the vector extension
+  (`opcodes-rvv` file) have been set as pseudo-instructions because their
+  opcode space overlaps with the opcode space of the SIMD instructions from
+  Xpulpv2, defined in `opcodes-xpulpimg_CUSTOM`.
diff --git a/toolchain/riscv-opcodes/encoding_out.h b/toolchain/riscv-opcodes/encoding_out.h
index 98660a4cb..f78afbd15 100644
--- a/toolchain/riscv-opcodes/encoding_out.h
+++ b/toolchain/riscv-opcodes/encoding_out.h
@@ -326,8 +326,6 @@
 #define MASK_CUSTOM3_RD_RS1  0x707f
 #define MATCH_CUSTOM3_RD_RS1_RS2 0x707b
 #define MASK_CUSTOM3_RD_RS1_RS2  0x707f
-#define MATCH_FREP 0xb
-#define MASK_FREP  0x7f
 #define MATCH_SLLI_RV32 0x1013
 #define MASK_SLLI_RV32  0xfe00707f
 #define MATCH_SRLI_RV32 0x5013
@@ -928,8 +926,6 @@
 #define MASK_C_FSWSP  0xe003
 #define MATCH_VSETVLI 0x7057
 #define MASK_VSETVLI  0x8000707f
-#define MATCH_VSETVL 0x80007057
-#define MASK_VSETVL  0xfe00707f
 #define MATCH_VLE8_V 0x7
 #define MASK_VLE8_V  0x1df0707f
 #define MATCH_VLE16_V 0x5007
@@ -1230,8 +1226,6 @@
 #define MASK_VFNMSAC_VV  0xfc00707f
 #define MATCH_VFCVT_XU_F_V 0x48001057
 #define MASK_VFCVT_XU_F_V  0xfc0ff07f
-#define MATCH_VFCVT_X_F_V 0x48009057
-#define MASK_VFCVT_X_F_V  0xfc0ff07f
 #define MATCH_VFCVT_F_XU_V 0x48011057
 #define MASK_VFCVT_F_XU_V  0xfc0ff07f
 #define MATCH_VFCVT_F_X_V 0x48019057
@@ -1328,12 +1322,8 @@
 #define MASK_VSLIDEUP_VX  0xfc00707f
 #define MATCH_VSLIDEDOWN_VX 0x3c004057
 #define MASK_VSLIDEDOWN_VX  0xfc00707f
-#define MATCH_VADC_VXM 0x40004057
-#define MASK_VADC_VXM  0xfe00707f
 #define MATCH_VMADC_VXM 0x44004057
 #define MASK_VMADC_VXM  0xfc00707f
-#define MATCH_VSBC_VXM 0x48004057
-#define MASK_VSBC_VXM  0xfe00707f
 #define MATCH_VMSBC_VXM 0x4c004057
 #define MASK_VMSBC_VXM  0xfc00707f
 #define MATCH_VMERGE_VXM 0x5c004057
@@ -1414,12 +1404,8 @@
 #define MASK_VRGATHER_VV  0xfc00707f
 #define MATCH_VRGATHEREI16_VV 0x38000057
 #define MASK_VRGATHEREI16_VV  0xfc00707f
-#define MATCH_VADC_VVM 0x40000057
-#define MASK_VADC_VVM  0xfe00707f
 #define MATCH_VMADC_VVM 0x44000057
 #define MASK_VMADC_VVM  0xfc00707f
-#define MATCH_VSBC_VVM 0x48000057
-#define MASK_VSBC_VVM  0xfe00707f
 #define MATCH_VMSBC_VVM 0x4c000057
 #define MASK_VMSBC_VVM  0xfc00707f
 #define MATCH_VMERGE_VVM 0x5c000057
@@ -1668,34 +1654,22 @@
 #define MASK_VWMACC_VV  0xfc00707f
 #define MATCH_VWMACCSU_VV 0xfc002057
 #define MASK_VWMACCSU_VV  0xfc00707f
-#define MATCH_VAADDU_VX 0x20006057
-#define MASK_VAADDU_VX  0xfc00707f
 #define MATCH_VAADD_VX 0x24006057
 #define MASK_VAADD_VX  0xfc00707f
-#define MATCH_VASUBU_VX 0x28006057
-#define MASK_VASUBU_VX  0xfc00707f
 #define MATCH_VASUB_VX 0x2c006057
 #define MASK_VASUB_VX  0xfc00707f
 #define MATCH_VMV_S_X 0x42006057
 #define MASK_VMV_S_X  0xfff0707f
-#define MATCH_VSLIDE1UP_VX 0x38006057
-#define MASK_VSLIDE1UP_VX  0xfc00707f
 #define MATCH_VSLIDE1DOWN_VX 0x3c006057
 #define MASK_VSLIDE1DOWN_VX  0xfc00707f
-#define MATCH_VDIVU_VX 0x80006057
-#define MASK_VDIVU_VX  0xfc00707f
 #define MATCH_VDIV_VX 0x84006057
 #define MASK_VDIV_VX  0xfc00707f
 #define MATCH_VREMU_VX 0x88006057
 #define MASK_VREMU_VX  0xfc00707f
 #define MATCH_VREM_VX 0x8c006057
 #define MASK_VREM_VX  0xfc00707f
-#define MATCH_VMULHU_VX 0x90006057
-#define MASK_VMULHU_VX  0xfc00707f
 #define MATCH_VMUL_VX 0x94006057
 #define MASK_VMUL_VX  0xfc00707f
-#define MATCH_VMULHSU_VX 0x98006057
-#define MASK_VMULHSU_VX  0xfc00707f
 #define MATCH_VMULH_VX 0x9c006057
 #define MASK_VMULH_VX  0xfc00707f
 #define MATCH_VMADD_VX 0xa4006057
@@ -1846,6 +1820,54 @@
 #define MASK_CSRRSI  0x707f
 #define MATCH_CSRRCI 0x7073
 #define MASK_CSRRCI  0x707f
+#define MATCH_P_LB_IRPOST 0xb
+#define MASK_P_LB_IRPOST  0x707f
+#define MATCH_P_LBU_IRPOST 0x400b
+#define MASK_P_LBU_IRPOST  0x707f
+#define MATCH_P_LH_IRPOST 0x100b
+#define MASK_P_LH_IRPOST  0x707f
+#define MATCH_P_LHU_IRPOST 0x500b
+#define MASK_P_LHU_IRPOST  0x707f
+#define MATCH_P_LW_IRPOST 0x200b
+#define MASK_P_LW_IRPOST  0x707f
+#define MATCH_P_LB_RRPOST 0x700b
+#define MASK_P_LB_RRPOST  0xfe00707f
+#define MATCH_P_LBU_RRPOST 0x4000700b
+#define MASK_P_LBU_RRPOST  0xfe00707f
+#define MATCH_P_LH_RRPOST 0x1000700b
+#define MASK_P_LH_RRPOST  0xfe00707f
+#define MATCH_P_LHU_RRPOST 0x5000700b
+#define MASK_P_LHU_RRPOST  0xfe00707f
+#define MATCH_P_LW_RRPOST 0x2000700b
+#define MASK_P_LW_RRPOST  0xfe00707f
+#define MATCH_P_LB_RR 0x7003
+#define MASK_P_LB_RR  0xfe00707f
+#define MATCH_P_LBU_RR 0x40007003
+#define MASK_P_LBU_RR  0xfe00707f
+#define MATCH_P_LH_RR 0x10007003
+#define MASK_P_LH_RR  0xfe00707f
+#define MATCH_P_LHU_RR 0x50007003
+#define MASK_P_LHU_RR  0xfe00707f
+#define MATCH_P_LW_RR 0x20007003
+#define MASK_P_LW_RR  0xfe00707f
+#define MATCH_P_SB_IRPOST 0x2b
+#define MASK_P_SB_IRPOST  0x707f
+#define MATCH_P_SH_IRPOST 0x102b
+#define MASK_P_SH_IRPOST  0x707f
+#define MATCH_P_SW_IRPOST 0x202b
+#define MASK_P_SW_IRPOST  0x707f
+#define MATCH_P_SB_RRPOST 0x402b
+#define MASK_P_SB_RRPOST  0xfe00707f
+#define MATCH_P_SH_RRPOST 0x502b
+#define MASK_P_SH_RRPOST  0xfe00707f
+#define MATCH_P_SW_RRPOST 0x602b
+#define MASK_P_SW_RRPOST  0xfe00707f
+#define MATCH_P_SB_RR 0x4023
+#define MASK_P_SB_RR  0xfe00707f
+#define MATCH_P_SH_RR 0x5023
+#define MASK_P_SH_RR  0xfe00707f
+#define MATCH_P_SW_RR 0x6023
+#define MASK_P_SW_RR  0xfe00707f
 #define MATCH_P_ABS 0x4000033
 #define MASK_P_ABS  0xfff0707f
 #define MATCH_P_SLET 0x4002033
@@ -1880,6 +1902,270 @@
 #define MASK_P_BEQIMM  0x707f
 #define MATCH_P_BNEIMM 0x3063
 #define MASK_P_BNEIMM  0x707f
+#define MATCH_P_MAC 0x42000033
+#define MASK_P_MAC  0xfe00707f
+#define MATCH_P_MSU 0x42001033
+#define MASK_P_MSU  0xfe00707f
+#define MATCH_PV_ADD_H 0x57
+#define MASK_PV_ADD_H  0xfe00707f
+#define MATCH_PV_ADD_SC_H 0x4057
+#define MASK_PV_ADD_SC_H  0xfe00707f
+#define MATCH_PV_ADD_SCI_H 0x6057
+#define MASK_PV_ADD_SCI_H  0xfc00707f
+#define MATCH_PV_ADD_B 0x1057
+#define MASK_PV_ADD_B  0xfe00707f
+#define MATCH_PV_ADD_SC_B 0x5057
+#define MASK_PV_ADD_SC_B  0xfe00707f
+#define MATCH_PV_ADD_SCI_B 0x7057
+#define MASK_PV_ADD_SCI_B  0xfc00707f
+#define MATCH_PV_SUB_H 0x8000057
+#define MASK_PV_SUB_H  0xfe00707f
+#define MATCH_PV_SUB_SC_H 0x8004057
+#define MASK_PV_SUB_SC_H  0xfe00707f
+#define MATCH_PV_SUB_SCI_H 0x8006057
+#define MASK_PV_SUB_SCI_H  0xfc00707f
+#define MATCH_PV_SUB_B 0x8001057
+#define MASK_PV_SUB_B  0xfe00707f
+#define MATCH_PV_SUB_SC_B 0x8005057
+#define MASK_PV_SUB_SC_B  0xfe00707f
+#define MATCH_PV_SUB_SCI_B 0x8007057
+#define MASK_PV_SUB_SCI_B  0xfc00707f
+#define MATCH_PV_AVG_H 0x10000057
+#define MASK_PV_AVG_H  0xfe00707f
+#define MATCH_PV_AVG_SC_H 0x10004057
+#define MASK_PV_AVG_SC_H  0xfe00707f
+#define MATCH_PV_AVG_SCI_H 0x10006057
+#define MASK_PV_AVG_SCI_H  0xfc00707f
+#define MATCH_PV_AVG_B 0x10001057
+#define MASK_PV_AVG_B  0xfe00707f
+#define MATCH_PV_AVG_SC_B 0x10005057
+#define MASK_PV_AVG_SC_B  0xfe00707f
+#define MATCH_PV_AVG_SCI_B 0x10007057
+#define MASK_PV_AVG_SCI_B  0xfc00707f
+#define MATCH_PV_AVGU_H 0x18000057
+#define MASK_PV_AVGU_H  0xfe00707f
+#define MATCH_PV_AVGU_SC_H 0x18004057
+#define MASK_PV_AVGU_SC_H  0xfe00707f
+#define MATCH_PV_AVGU_SCI_H 0x18006057
+#define MASK_PV_AVGU_SCI_H  0xfc00707f
+#define MATCH_PV_AVGU_B 0x18001057
+#define MASK_PV_AVGU_B  0xfe00707f
+#define MATCH_PV_AVGU_SC_B 0x18005057
+#define MASK_PV_AVGU_SC_B  0xfe00707f
+#define MATCH_PV_AVGU_SCI_B 0x18007057
+#define MASK_PV_AVGU_SCI_B  0xfc00707f
+#define MATCH_PV_MIN_H 0x20000057
+#define MASK_PV_MIN_H  0xfe00707f
+#define MATCH_PV_MIN_SC_H 0x20004057
+#define MASK_PV_MIN_SC_H  0xfe00707f
+#define MATCH_PV_MIN_SCI_H 0x20006057
+#define MASK_PV_MIN_SCI_H  0xfc00707f
+#define MATCH_PV_MIN_B 0x20001057
+#define MASK_PV_MIN_B  0xfe00707f
+#define MATCH_PV_MIN_SC_B 0x20005057
+#define MASK_PV_MIN_SC_B  0xfe00707f
+#define MATCH_PV_MIN_SCI_B 0x20007057
+#define MASK_PV_MIN_SCI_B  0xfc00707f
+#define MATCH_PV_MINU_H 0x28000057
+#define MASK_PV_MINU_H  0xfe00707f
+#define MATCH_PV_MINU_SC_H 0x28004057
+#define MASK_PV_MINU_SC_H  0xfe00707f
+#define MATCH_PV_MINU_SCI_H 0x28006057
+#define MASK_PV_MINU_SCI_H  0xfc00707f
+#define MATCH_PV_MINU_B 0x28001057
+#define MASK_PV_MINU_B  0xfe00707f
+#define MATCH_PV_MINU_SC_B 0x28005057
+#define MASK_PV_MINU_SC_B  0xfe00707f
+#define MATCH_PV_MINU_SCI_B 0x28007057
+#define MASK_PV_MINU_SCI_B  0xfc00707f
+#define MATCH_PV_MAX_H 0x30000057
+#define MASK_PV_MAX_H  0xfe00707f
+#define MATCH_PV_MAX_SC_H 0x30004057
+#define MASK_PV_MAX_SC_H  0xfe00707f
+#define MATCH_PV_MAX_SCI_H 0x30006057
+#define MASK_PV_MAX_SCI_H  0xfc00707f
+#define MATCH_PV_MAX_B 0x30001057
+#define MASK_PV_MAX_B  0xfe00707f
+#define MATCH_PV_MAX_SC_B 0x30005057
+#define MASK_PV_MAX_SC_B  0xfe00707f
+#define MATCH_PV_MAX_SCI_B 0x30007057
+#define MASK_PV_MAX_SCI_B  0xfc00707f
+#define MATCH_PV_MAXU_H 0x38000057
+#define MASK_PV_MAXU_H  0xfe00707f
+#define MATCH_PV_MAXU_SC_H 0x38004057
+#define MASK_PV_MAXU_SC_H  0xfe00707f
+#define MATCH_PV_MAXU_SCI_H 0x38006057
+#define MASK_PV_MAXU_SCI_H  0xfc00707f
+#define MATCH_PV_MAXU_B 0x38001057
+#define MASK_PV_MAXU_B  0xfe00707f
+#define MATCH_PV_MAXU_SC_B 0x38005057
+#define MASK_PV_MAXU_SC_B  0xfe00707f
+#define MATCH_PV_MAXU_SCI_B 0x38007057
+#define MASK_PV_MAXU_SCI_B  0xfc00707f
+#define MATCH_PV_SRL_H 0x40000057
+#define MASK_PV_SRL_H  0xfe00707f
+#define MATCH_PV_SRL_SC_H 0x40004057
+#define MASK_PV_SRL_SC_H  0xfe00707f
+#define MATCH_PV_SRL_SCI_H 0x40006057
+#define MASK_PV_SRL_SCI_H  0xfc00707f
+#define MATCH_PV_SRL_B 0x40001057
+#define MASK_PV_SRL_B  0xfe00707f
+#define MATCH_PV_SRL_SC_B 0x40005057
+#define MASK_PV_SRL_SC_B  0xfe00707f
+#define MATCH_PV_SRL_SCI_B 0x40007057
+#define MASK_PV_SRL_SCI_B  0xfc00707f
+#define MATCH_PV_SRA_H 0x48000057
+#define MASK_PV_SRA_H  0xfe00707f
+#define MATCH_PV_SRA_SC_H 0x48004057
+#define MASK_PV_SRA_SC_H  0xfe00707f
+#define MATCH_PV_SRA_SCI_H 0x48006057
+#define MASK_PV_SRA_SCI_H  0xfc00707f
+#define MATCH_PV_SRA_B 0x48001057
+#define MASK_PV_SRA_B  0xfe00707f
+#define MATCH_PV_SRA_SC_B 0x48005057
+#define MASK_PV_SRA_SC_B  0xfe00707f
+#define MATCH_PV_SRA_SCI_B 0x48007057
+#define MASK_PV_SRA_SCI_B  0xfc00707f
+#define MATCH_PV_SLL_H 0x50000057
+#define MASK_PV_SLL_H  0xfe00707f
+#define MATCH_PV_SLL_SC_H 0x50004057
+#define MASK_PV_SLL_SC_H  0xfe00707f
+#define MATCH_PV_SLL_SCI_H 0x50006057
+#define MASK_PV_SLL_SCI_H  0xfc00707f
+#define MATCH_PV_SLL_B 0x50001057
+#define MASK_PV_SLL_B  0xfe00707f
+#define MATCH_PV_SLL_SC_B 0x50005057
+#define MASK_PV_SLL_SC_B  0xfe00707f
+#define MATCH_PV_SLL_SCI_B 0x50007057
+#define MASK_PV_SLL_SCI_B  0xfc00707f
+#define MATCH_PV_OR_H 0x58000057
+#define MASK_PV_OR_H  0xfe00707f
+#define MATCH_PV_OR_SC_H 0x58004057
+#define MASK_PV_OR_SC_H  0xfe00707f
+#define MATCH_PV_OR_SCI_H 0x58006057
+#define MASK_PV_OR_SCI_H  0xfc00707f
+#define MATCH_PV_OR_B 0x58001057
+#define MASK_PV_OR_B  0xfe00707f
+#define MATCH_PV_OR_SC_B 0x58005057
+#define MASK_PV_OR_SC_B  0xfe00707f
+#define MATCH_PV_OR_SCI_B 0x58007057
+#define MASK_PV_OR_SCI_B  0xfc00707f
+#define MATCH_PV_XOR_H 0x60000057
+#define MASK_PV_XOR_H  0xfe00707f
+#define MATCH_PV_XOR_SC_H 0x60004057
+#define MASK_PV_XOR_SC_H  0xfe00707f
+#define MATCH_PV_XOR_SCI_H 0x60006057
+#define MASK_PV_XOR_SCI_H  0xfc00707f
+#define MATCH_PV_XOR_B 0x60001057
+#define MASK_PV_XOR_B  0xfe00707f
+#define MATCH_PV_XOR_SC_B 0x60005057
+#define MASK_PV_XOR_SC_B  0xfe00707f
+#define MATCH_PV_XOR_SCI_B 0x60007057
+#define MASK_PV_XOR_SCI_B  0xfc00707f
+#define MATCH_PV_AND_H 0x68000057
+#define MASK_PV_AND_H  0xfe00707f
+#define MATCH_PV_AND_SC_H 0x68004057
+#define MASK_PV_AND_SC_H  0xfe00707f
+#define MATCH_PV_AND_SCI_H 0x68006057
+#define MASK_PV_AND_SCI_H  0xfc00707f
+#define MATCH_PV_AND_B 0x68001057
+#define MASK_PV_AND_B  0xfe00707f
+#define MATCH_PV_AND_SC_B 0x68005057
+#define MASK_PV_AND_SC_B  0xfe00707f
+#define MATCH_PV_AND_SCI_B 0x68007057
+#define MASK_PV_AND_SCI_B  0xfc00707f
+#define MATCH_PV_ABS_H 0x70000057
+#define MASK_PV_ABS_H  0xfff0707f
+#define MATCH_PV_ABS_B 0x70001057
+#define MASK_PV_ABS_B  0xfff0707f
+#define MATCH_PV_EXTRACT_H 0x78006057
+#define MASK_PV_EXTRACT_H  0xfc00707f
+#define MATCH_PV_EXTRACT_B 0x78007057
+#define MASK_PV_EXTRACT_B  0xfc00707f
+#define MATCH_PV_EXTRACTU_H 0x90006057
+#define MASK_PV_EXTRACTU_H  0xfc00707f
+#define MATCH_PV_EXTRACTU_B 0x90007057
+#define MASK_PV_EXTRACTU_B  0xfc00707f
+#define MATCH_PV_INSERT_H 0xb0006057
+#define MASK_PV_INSERT_H  0xfc00707f
+#define MATCH_PV_INSERT_B 0xb0007057
+#define MASK_PV_INSERT_B  0xfc00707f
+#define MATCH_PV_DOTUP_H 0x80000057
+#define MASK_PV_DOTUP_H  0xfe00707f
+#define MATCH_PV_DOTUP_SC_H 0x80004057
+#define MASK_PV_DOTUP_SC_H  0xfe00707f
+#define MATCH_PV_DOTUP_SCI_H 0x80006057
+#define MASK_PV_DOTUP_SCI_H  0xfc00707f
+#define MATCH_PV_DOTUP_B 0x80001057
+#define MASK_PV_DOTUP_B  0xfe00707f
+#define MATCH_PV_DOTUP_SC_B 0x80005057
+#define MASK_PV_DOTUP_SC_B  0xfe00707f
+#define MATCH_PV_DOTUP_SCI_B 0x80007057
+#define MASK_PV_DOTUP_SCI_B  0xfc00707f
+#define MATCH_PV_DOTUSP_H 0x88000057
+#define MASK_PV_DOTUSP_H  0xfe00707f
+#define MATCH_PV_DOTUSP_SC_H 0x88004057
+#define MASK_PV_DOTUSP_SC_H  0xfe00707f
+#define MATCH_PV_DOTUSP_SCI_H 0x88006057
+#define MASK_PV_DOTUSP_SCI_H  0xfc00707f
+#define MATCH_PV_DOTUSP_B 0x88001057
+#define MASK_PV_DOTUSP_B  0xfe00707f
+#define MATCH_PV_DOTUSP_SC_B 0x88005057
+#define MASK_PV_DOTUSP_SC_B  0xfe00707f
+#define MATCH_PV_DOTUSP_SCI_B 0x88007057
+#define MASK_PV_DOTUSP_SCI_B  0xfc00707f
+#define MATCH_PV_DOTSP_H 0x98000057
+#define MASK_PV_DOTSP_H  0xfe00707f
+#define MATCH_PV_DOTSP_SC_H 0x98004057
+#define MASK_PV_DOTSP_SC_H  0xfe00707f
+#define MATCH_PV_DOTSP_SCI_H 0x98006057
+#define MASK_PV_DOTSP_SCI_H  0xfc00707f
+#define MATCH_PV_DOTSP_B 0x98001057
+#define MASK_PV_DOTSP_B  0xfe00707f
+#define MATCH_PV_DOTSP_SC_B 0x98005057
+#define MASK_PV_DOTSP_SC_B  0xfe00707f
+#define MATCH_PV_DOTSP_SCI_B 0x98007057
+#define MASK_PV_DOTSP_SCI_B  0xfc00707f
+#define MATCH_PV_SDOTUP_H 0xa0000057
+#define MASK_PV_SDOTUP_H  0xfe00707f
+#define MATCH_PV_SDOTUP_SC_H 0xa0004057
+#define MASK_PV_SDOTUP_SC_H  0xfe00707f
+#define MATCH_PV_SDOTUP_SCI_H 0xa0006057
+#define MASK_PV_SDOTUP_SCI_H  0xfc00707f
+#define MATCH_PV_SDOTUP_B 0xa0001057
+#define MASK_PV_SDOTUP_B  0xfe00707f
+#define MATCH_PV_SDOTUP_SC_B 0xa0005057
+#define MASK_PV_SDOTUP_SC_B  0xfe00707f
+#define MATCH_PV_SDOTUP_SCI_B 0xa0007057
+#define MASK_PV_SDOTUP_SCI_B  0xfc00707f
+#define MATCH_PV_SDOTUSP_H 0xa8000057
+#define MASK_PV_SDOTUSP_H  0xfe00707f
+#define MATCH_PV_SDOTUSP_SC_H 0xa8004057
+#define MASK_PV_SDOTUSP_SC_H  0xfe00707f
+#define MATCH_PV_SDOTUSP_SCI_H 0xa8006057
+#define MASK_PV_SDOTUSP_SCI_H  0xfc00707f
+#define MATCH_PV_SDOTUSP_B 0xa8001057
+#define MASK_PV_SDOTUSP_B  0xfe00707f
+#define MATCH_PV_SDOTUSP_SC_B 0xa8005057
+#define MASK_PV_SDOTUSP_SC_B  0xfe00707f
+#define MATCH_PV_SDOTUSP_SCI_B 0xa8007057
+#define MASK_PV_SDOTUSP_SCI_B  0xfc00707f
+#define MATCH_PV_SDOTSP_H 0xb8000057
+#define MASK_PV_SDOTSP_H  0xfe00707f
+#define MATCH_PV_SDOTSP_SC_H 0xb8004057
+#define MASK_PV_SDOTSP_SC_H  0xfe00707f
+#define MATCH_PV_SDOTSP_SCI_H 0xb8006057
+#define MASK_PV_SDOTSP_SCI_H  0xfc00707f
+#define MATCH_PV_SDOTSP_B 0xb8001057
+#define MASK_PV_SDOTSP_B  0xfe00707f
+#define MATCH_PV_SDOTSP_SC_B 0xb8005057
+#define MASK_PV_SDOTSP_SC_B  0xfe00707f
+#define MATCH_PV_SDOTSP_SCI_B 0xb8007057
+#define MASK_PV_SDOTSP_SCI_B  0xfc00707f
+#define MATCH_PV_SHUFFLE2_H 0xc8000057
+#define MASK_PV_SHUFFLE2_H  0xfe00707f
+#define MATCH_PV_SHUFFLE2_B 0xc8001057
+#define MASK_PV_SHUFFLE2_B  0xfe00707f
 #define MATCH_FLAH 0x1007
 #define MASK_FLAH  0x707f
 #define MATCH_FSAH 0x1027
@@ -2848,7 +3134,6 @@ DECLARE_INSN(custom3_rs1_rs2, MATCH_CUSTOM3_RS1_RS2, MASK_CUSTOM3_RS1_RS2)
 DECLARE_INSN(custom3_rd, MATCH_CUSTOM3_RD, MASK_CUSTOM3_RD)
 DECLARE_INSN(custom3_rd_rs1, MATCH_CUSTOM3_RD_RS1, MASK_CUSTOM3_RD_RS1)
 DECLARE_INSN(custom3_rd_rs1_rs2, MATCH_CUSTOM3_RD_RS1_RS2, MASK_CUSTOM3_RD_RS1_RS2)
-DECLARE_INSN(frep, MATCH_FREP, MASK_FREP)
 DECLARE_INSN(slli_rv32, MATCH_SLLI_RV32, MASK_SLLI_RV32)
 DECLARE_INSN(srli_rv32, MATCH_SRLI_RV32, MASK_SRLI_RV32)
 DECLARE_INSN(srai_rv32, MATCH_SRAI_RV32, MASK_SRAI_RV32)
@@ -3149,7 +3434,6 @@ DECLARE_INSN(c_fsdsp, MATCH_C_FSDSP, MASK_C_FSDSP)
 DECLARE_INSN(c_swsp, MATCH_C_SWSP, MASK_C_SWSP)
 DECLARE_INSN(c_fswsp, MATCH_C_FSWSP, MASK_C_FSWSP)
 DECLARE_INSN(vsetvli, MATCH_VSETVLI, MASK_VSETVLI)
-DECLARE_INSN(vsetvl, MATCH_VSETVL, MASK_VSETVL)
 DECLARE_INSN(vle8_v, MATCH_VLE8_V, MASK_VLE8_V)
 DECLARE_INSN(vle16_v, MATCH_VLE16_V, MASK_VLE16_V)
 DECLARE_INSN(vle32_v, MATCH_VLE32_V, MASK_VLE32_V)
@@ -3300,7 +3584,6 @@ DECLARE_INSN(vfnmacc_vv, MATCH_VFNMACC_VV, MASK_VFNMACC_VV)
 DECLARE_INSN(vfmsac_vv, MATCH_VFMSAC_VV, MASK_VFMSAC_VV)
 DECLARE_INSN(vfnmsac_vv, MATCH_VFNMSAC_VV, MASK_VFNMSAC_VV)
 DECLARE_INSN(vfcvt_xu_f_v, MATCH_VFCVT_XU_F_V, MASK_VFCVT_XU_F_V)
-DECLARE_INSN(vfcvt_x_f_v, MATCH_VFCVT_X_F_V, MASK_VFCVT_X_F_V)
 DECLARE_INSN(vfcvt_f_xu_v, MATCH_VFCVT_F_XU_V, MASK_VFCVT_F_XU_V)
 DECLARE_INSN(vfcvt_f_x_v, MATCH_VFCVT_F_X_V, MASK_VFCVT_F_X_V)
 DECLARE_INSN(vfcvt_rtz_xu_f_v, MATCH_VFCVT_RTZ_XU_F_V, MASK_VFCVT_RTZ_XU_F_V)
@@ -3349,9 +3632,7 @@ DECLARE_INSN(vxor_vx, MATCH_VXOR_VX, MASK_VXOR_VX)
 DECLARE_INSN(vrgather_vx, MATCH_VRGATHER_VX, MASK_VRGATHER_VX)
 DECLARE_INSN(vslideup_vx, MATCH_VSLIDEUP_VX, MASK_VSLIDEUP_VX)
 DECLARE_INSN(vslidedown_vx, MATCH_VSLIDEDOWN_VX, MASK_VSLIDEDOWN_VX)
-DECLARE_INSN(vadc_vxm, MATCH_VADC_VXM, MASK_VADC_VXM)
 DECLARE_INSN(vmadc_vxm, MATCH_VMADC_VXM, MASK_VMADC_VXM)
-DECLARE_INSN(vsbc_vxm, MATCH_VSBC_VXM, MASK_VSBC_VXM)
 DECLARE_INSN(vmsbc_vxm, MATCH_VMSBC_VXM, MASK_VMSBC_VXM)
 DECLARE_INSN(vmerge_vxm, MATCH_VMERGE_VXM, MASK_VMERGE_VXM)
 DECLARE_INSN(vmv_v_x, MATCH_VMV_V_X, MASK_VMV_V_X)
@@ -3392,9 +3673,7 @@ DECLARE_INSN(vor_vv, MATCH_VOR_VV, MASK_VOR_VV)
 DECLARE_INSN(vxor_vv, MATCH_VXOR_VV, MASK_VXOR_VV)
 DECLARE_INSN(vrgather_vv, MATCH_VRGATHER_VV, MASK_VRGATHER_VV)
 DECLARE_INSN(vrgatherei16_vv, MATCH_VRGATHEREI16_VV, MASK_VRGATHEREI16_VV)
-DECLARE_INSN(vadc_vvm, MATCH_VADC_VVM, MASK_VADC_VVM)
 DECLARE_INSN(vmadc_vvm, MATCH_VMADC_VVM, MASK_VMADC_VVM)
-DECLARE_INSN(vsbc_vvm, MATCH_VSBC_VVM, MASK_VSBC_VVM)
 DECLARE_INSN(vmsbc_vvm, MATCH_VMSBC_VVM, MASK_VMSBC_VVM)
 DECLARE_INSN(vmerge_vvm, MATCH_VMERGE_VVM, MASK_VMERGE_VVM)
 DECLARE_INSN(vmv_v_v, MATCH_VMV_V_V, MASK_VMV_V_V)
@@ -3519,20 +3798,14 @@ DECLARE_INSN(vwmul_vv, MATCH_VWMUL_VV, MASK_VWMUL_VV)
 DECLARE_INSN(vwmaccu_vv, MATCH_VWMACCU_VV, MASK_VWMACCU_VV)
 DECLARE_INSN(vwmacc_vv, MATCH_VWMACC_VV, MASK_VWMACC_VV)
 DECLARE_INSN(vwmaccsu_vv, MATCH_VWMACCSU_VV, MASK_VWMACCSU_VV)
-DECLARE_INSN(vaaddu_vx, MATCH_VAADDU_VX, MASK_VAADDU_VX)
 DECLARE_INSN(vaadd_vx, MATCH_VAADD_VX, MASK_VAADD_VX)
-DECLARE_INSN(vasubu_vx, MATCH_VASUBU_VX, MASK_VASUBU_VX)
 DECLARE_INSN(vasub_vx, MATCH_VASUB_VX, MASK_VASUB_VX)
 DECLARE_INSN(vmv_s_x, MATCH_VMV_S_X, MASK_VMV_S_X)
-DECLARE_INSN(vslide1up_vx, MATCH_VSLIDE1UP_VX, MASK_VSLIDE1UP_VX)
 DECLARE_INSN(vslide1down_vx, MATCH_VSLIDE1DOWN_VX, MASK_VSLIDE1DOWN_VX)
-DECLARE_INSN(vdivu_vx, MATCH_VDIVU_VX, MASK_VDIVU_VX)
 DECLARE_INSN(vdiv_vx, MATCH_VDIV_VX, MASK_VDIV_VX)
 DECLARE_INSN(vremu_vx, MATCH_VREMU_VX, MASK_VREMU_VX)
 DECLARE_INSN(vrem_vx, MATCH_VREM_VX, MASK_VREM_VX)
-DECLARE_INSN(vmulhu_vx, MATCH_VMULHU_VX, MASK_VMULHU_VX)
 DECLARE_INSN(vmul_vx, MATCH_VMUL_VX, MASK_VMUL_VX)
-DECLARE_INSN(vmulhsu_vx, MATCH_VMULHSU_VX, MASK_VMULHSU_VX)
 DECLARE_INSN(vmulh_vx, MATCH_VMULH_VX, MASK_VMULH_VX)
 DECLARE_INSN(vmadd_vx, MATCH_VMADD_VX, MASK_VMADD_VX)
 DECLARE_INSN(vnmsub_vx, MATCH_VNMSUB_VX, MASK_VNMSUB_VX)
@@ -3608,6 +3881,30 @@ DECLARE_INSN(csrrc, MATCH_CSRRC, MASK_CSRRC)
 DECLARE_INSN(csrrwi, MATCH_CSRRWI, MASK_CSRRWI)
 DECLARE_INSN(csrrsi, MATCH_CSRRSI, MASK_CSRRSI)
 DECLARE_INSN(csrrci, MATCH_CSRRCI, MASK_CSRRCI)
+DECLARE_INSN(p_lb_irpost, MATCH_P_LB_IRPOST, MASK_P_LB_IRPOST)
+DECLARE_INSN(p_lbu_irpost, MATCH_P_LBU_IRPOST, MASK_P_LBU_IRPOST)
+DECLARE_INSN(p_lh_irpost, MATCH_P_LH_IRPOST, MASK_P_LH_IRPOST)
+DECLARE_INSN(p_lhu_irpost, MATCH_P_LHU_IRPOST, MASK_P_LHU_IRPOST)
+DECLARE_INSN(p_lw_irpost, MATCH_P_LW_IRPOST, MASK_P_LW_IRPOST)
+DECLARE_INSN(p_lb_rrpost, MATCH_P_LB_RRPOST, MASK_P_LB_RRPOST)
+DECLARE_INSN(p_lbu_rrpost, MATCH_P_LBU_RRPOST, MASK_P_LBU_RRPOST)
+DECLARE_INSN(p_lh_rrpost, MATCH_P_LH_RRPOST, MASK_P_LH_RRPOST)
+DECLARE_INSN(p_lhu_rrpost, MATCH_P_LHU_RRPOST, MASK_P_LHU_RRPOST)
+DECLARE_INSN(p_lw_rrpost, MATCH_P_LW_RRPOST, MASK_P_LW_RRPOST)
+DECLARE_INSN(p_lb_rr, MATCH_P_LB_RR, MASK_P_LB_RR)
+DECLARE_INSN(p_lbu_rr, MATCH_P_LBU_RR, MASK_P_LBU_RR)
+DECLARE_INSN(p_lh_rr, MATCH_P_LH_RR, MASK_P_LH_RR)
+DECLARE_INSN(p_lhu_rr, MATCH_P_LHU_RR, MASK_P_LHU_RR)
+DECLARE_INSN(p_lw_rr, MATCH_P_LW_RR, MASK_P_LW_RR)
+DECLARE_INSN(p_sb_irpost, MATCH_P_SB_IRPOST, MASK_P_SB_IRPOST)
+DECLARE_INSN(p_sh_irpost, MATCH_P_SH_IRPOST, MASK_P_SH_IRPOST)
+DECLARE_INSN(p_sw_irpost, MATCH_P_SW_IRPOST, MASK_P_SW_IRPOST)
+DECLARE_INSN(p_sb_rrpost, MATCH_P_SB_RRPOST, MASK_P_SB_RRPOST)
+DECLARE_INSN(p_sh_rrpost, MATCH_P_SH_RRPOST, MASK_P_SH_RRPOST)
+DECLARE_INSN(p_sw_rrpost, MATCH_P_SW_RRPOST, MASK_P_SW_RRPOST)
+DECLARE_INSN(p_sb_rr, MATCH_P_SB_RR, MASK_P_SB_RR)
+DECLARE_INSN(p_sh_rr, MATCH_P_SH_RR, MASK_P_SH_RR)
+DECLARE_INSN(p_sw_rr, MATCH_P_SW_RR, MASK_P_SW_RR)
 DECLARE_INSN(p_abs, MATCH_P_ABS, MASK_P_ABS)
 DECLARE_INSN(p_slet, MATCH_P_SLET, MASK_P_SLET)
 DECLARE_INSN(p_sletu, MATCH_P_SLETU, MASK_P_SLETU)
@@ -3625,6 +3922,138 @@ DECLARE_INSN(p_clipr, MATCH_P_CLIPR, MASK_P_CLIPR)
 DECLARE_INSN(p_clipur, MATCH_P_CLIPUR, MASK_P_CLIPUR)
 DECLARE_INSN(p_beqimm, MATCH_P_BEQIMM, MASK_P_BEQIMM)
 DECLARE_INSN(p_bneimm, MATCH_P_BNEIMM, MASK_P_BNEIMM)
+DECLARE_INSN(p_mac, MATCH_P_MAC, MASK_P_MAC)
+DECLARE_INSN(p_msu, MATCH_P_MSU, MASK_P_MSU)
+DECLARE_INSN(pv_add_h, MATCH_PV_ADD_H, MASK_PV_ADD_H)
+DECLARE_INSN(pv_add_sc_h, MATCH_PV_ADD_SC_H, MASK_PV_ADD_SC_H)
+DECLARE_INSN(pv_add_sci_h, MATCH_PV_ADD_SCI_H, MASK_PV_ADD_SCI_H)
+DECLARE_INSN(pv_add_b, MATCH_PV_ADD_B, MASK_PV_ADD_B)
+DECLARE_INSN(pv_add_sc_b, MATCH_PV_ADD_SC_B, MASK_PV_ADD_SC_B)
+DECLARE_INSN(pv_add_sci_b, MATCH_PV_ADD_SCI_B, MASK_PV_ADD_SCI_B)
+DECLARE_INSN(pv_sub_h, MATCH_PV_SUB_H, MASK_PV_SUB_H)
+DECLARE_INSN(pv_sub_sc_h, MATCH_PV_SUB_SC_H, MASK_PV_SUB_SC_H)
+DECLARE_INSN(pv_sub_sci_h, MATCH_PV_SUB_SCI_H, MASK_PV_SUB_SCI_H)
+DECLARE_INSN(pv_sub_b, MATCH_PV_SUB_B, MASK_PV_SUB_B)
+DECLARE_INSN(pv_sub_sc_b, MATCH_PV_SUB_SC_B, MASK_PV_SUB_SC_B)
+DECLARE_INSN(pv_sub_sci_b, MATCH_PV_SUB_SCI_B, MASK_PV_SUB_SCI_B)
+DECLARE_INSN(pv_avg_h, MATCH_PV_AVG_H, MASK_PV_AVG_H)
+DECLARE_INSN(pv_avg_sc_h, MATCH_PV_AVG_SC_H, MASK_PV_AVG_SC_H)
+DECLARE_INSN(pv_avg_sci_h, MATCH_PV_AVG_SCI_H, MASK_PV_AVG_SCI_H)
+DECLARE_INSN(pv_avg_b, MATCH_PV_AVG_B, MASK_PV_AVG_B)
+DECLARE_INSN(pv_avg_sc_b, MATCH_PV_AVG_SC_B, MASK_PV_AVG_SC_B)
+DECLARE_INSN(pv_avg_sci_b, MATCH_PV_AVG_SCI_B, MASK_PV_AVG_SCI_B)
+DECLARE_INSN(pv_avgu_h, MATCH_PV_AVGU_H, MASK_PV_AVGU_H)
+DECLARE_INSN(pv_avgu_sc_h, MATCH_PV_AVGU_SC_H, MASK_PV_AVGU_SC_H)
+DECLARE_INSN(pv_avgu_sci_h, MATCH_PV_AVGU_SCI_H, MASK_PV_AVGU_SCI_H)
+DECLARE_INSN(pv_avgu_b, MATCH_PV_AVGU_B, MASK_PV_AVGU_B)
+DECLARE_INSN(pv_avgu_sc_b, MATCH_PV_AVGU_SC_B, MASK_PV_AVGU_SC_B)
+DECLARE_INSN(pv_avgu_sci_b, MATCH_PV_AVGU_SCI_B, MASK_PV_AVGU_SCI_B)
+DECLARE_INSN(pv_min_h, MATCH_PV_MIN_H, MASK_PV_MIN_H)
+DECLARE_INSN(pv_min_sc_h, MATCH_PV_MIN_SC_H, MASK_PV_MIN_SC_H)
+DECLARE_INSN(pv_min_sci_h, MATCH_PV_MIN_SCI_H, MASK_PV_MIN_SCI_H)
+DECLARE_INSN(pv_min_b, MATCH_PV_MIN_B, MASK_PV_MIN_B)
+DECLARE_INSN(pv_min_sc_b, MATCH_PV_MIN_SC_B, MASK_PV_MIN_SC_B)
+DECLARE_INSN(pv_min_sci_b, MATCH_PV_MIN_SCI_B, MASK_PV_MIN_SCI_B)
+DECLARE_INSN(pv_minu_h, MATCH_PV_MINU_H, MASK_PV_MINU_H)
+DECLARE_INSN(pv_minu_sc_h, MATCH_PV_MINU_SC_H, MASK_PV_MINU_SC_H)
+DECLARE_INSN(pv_minu_sci_h, MATCH_PV_MINU_SCI_H, MASK_PV_MINU_SCI_H)
+DECLARE_INSN(pv_minu_b, MATCH_PV_MINU_B, MASK_PV_MINU_B)
+DECLARE_INSN(pv_minu_sc_b, MATCH_PV_MINU_SC_B, MASK_PV_MINU_SC_B)
+DECLARE_INSN(pv_minu_sci_b, MATCH_PV_MINU_SCI_B, MASK_PV_MINU_SCI_B)
+DECLARE_INSN(pv_max_h, MATCH_PV_MAX_H, MASK_PV_MAX_H)
+DECLARE_INSN(pv_max_sc_h, MATCH_PV_MAX_SC_H, MASK_PV_MAX_SC_H)
+DECLARE_INSN(pv_max_sci_h, MATCH_PV_MAX_SCI_H, MASK_PV_MAX_SCI_H)
+DECLARE_INSN(pv_max_b, MATCH_PV_MAX_B, MASK_PV_MAX_B)
+DECLARE_INSN(pv_max_sc_b, MATCH_PV_MAX_SC_B, MASK_PV_MAX_SC_B)
+DECLARE_INSN(pv_max_sci_b, MATCH_PV_MAX_SCI_B, MASK_PV_MAX_SCI_B)
+DECLARE_INSN(pv_maxu_h, MATCH_PV_MAXU_H, MASK_PV_MAXU_H)
+DECLARE_INSN(pv_maxu_sc_h, MATCH_PV_MAXU_SC_H, MASK_PV_MAXU_SC_H)
+DECLARE_INSN(pv_maxu_sci_h, MATCH_PV_MAXU_SCI_H, MASK_PV_MAXU_SCI_H)
+DECLARE_INSN(pv_maxu_b, MATCH_PV_MAXU_B, MASK_PV_MAXU_B)
+DECLARE_INSN(pv_maxu_sc_b, MATCH_PV_MAXU_SC_B, MASK_PV_MAXU_SC_B)
+DECLARE_INSN(pv_maxu_sci_b, MATCH_PV_MAXU_SCI_B, MASK_PV_MAXU_SCI_B)
+DECLARE_INSN(pv_srl_h, MATCH_PV_SRL_H, MASK_PV_SRL_H)
+DECLARE_INSN(pv_srl_sc_h, MATCH_PV_SRL_SC_H, MASK_PV_SRL_SC_H)
+DECLARE_INSN(pv_srl_sci_h, MATCH_PV_SRL_SCI_H, MASK_PV_SRL_SCI_H)
+DECLARE_INSN(pv_srl_b, MATCH_PV_SRL_B, MASK_PV_SRL_B)
+DECLARE_INSN(pv_srl_sc_b, MATCH_PV_SRL_SC_B, MASK_PV_SRL_SC_B)
+DECLARE_INSN(pv_srl_sci_b, MATCH_PV_SRL_SCI_B, MASK_PV_SRL_SCI_B)
+DECLARE_INSN(pv_sra_h, MATCH_PV_SRA_H, MASK_PV_SRA_H)
+DECLARE_INSN(pv_sra_sc_h, MATCH_PV_SRA_SC_H, MASK_PV_SRA_SC_H)
+DECLARE_INSN(pv_sra_sci_h, MATCH_PV_SRA_SCI_H, MASK_PV_SRA_SCI_H)
+DECLARE_INSN(pv_sra_b, MATCH_PV_SRA_B, MASK_PV_SRA_B)
+DECLARE_INSN(pv_sra_sc_b, MATCH_PV_SRA_SC_B, MASK_PV_SRA_SC_B)
+DECLARE_INSN(pv_sra_sci_b, MATCH_PV_SRA_SCI_B, MASK_PV_SRA_SCI_B)
+DECLARE_INSN(pv_sll_h, MATCH_PV_SLL_H, MASK_PV_SLL_H)
+DECLARE_INSN(pv_sll_sc_h, MATCH_PV_SLL_SC_H, MASK_PV_SLL_SC_H)
+DECLARE_INSN(pv_sll_sci_h, MATCH_PV_SLL_SCI_H, MASK_PV_SLL_SCI_H)
+DECLARE_INSN(pv_sll_b, MATCH_PV_SLL_B, MASK_PV_SLL_B)
+DECLARE_INSN(pv_sll_sc_b, MATCH_PV_SLL_SC_B, MASK_PV_SLL_SC_B)
+DECLARE_INSN(pv_sll_sci_b, MATCH_PV_SLL_SCI_B, MASK_PV_SLL_SCI_B)
+DECLARE_INSN(pv_or_h, MATCH_PV_OR_H, MASK_PV_OR_H)
+DECLARE_INSN(pv_or_sc_h, MATCH_PV_OR_SC_H, MASK_PV_OR_SC_H)
+DECLARE_INSN(pv_or_sci_h, MATCH_PV_OR_SCI_H, MASK_PV_OR_SCI_H)
+DECLARE_INSN(pv_or_b, MATCH_PV_OR_B, MASK_PV_OR_B)
+DECLARE_INSN(pv_or_sc_b, MATCH_PV_OR_SC_B, MASK_PV_OR_SC_B)
+DECLARE_INSN(pv_or_sci_b, MATCH_PV_OR_SCI_B, MASK_PV_OR_SCI_B)
+DECLARE_INSN(pv_xor_h, MATCH_PV_XOR_H, MASK_PV_XOR_H)
+DECLARE_INSN(pv_xor_sc_h, MATCH_PV_XOR_SC_H, MASK_PV_XOR_SC_H)
+DECLARE_INSN(pv_xor_sci_h, MATCH_PV_XOR_SCI_H, MASK_PV_XOR_SCI_H)
+DECLARE_INSN(pv_xor_b, MATCH_PV_XOR_B, MASK_PV_XOR_B)
+DECLARE_INSN(pv_xor_sc_b, MATCH_PV_XOR_SC_B, MASK_PV_XOR_SC_B)
+DECLARE_INSN(pv_xor_sci_b, MATCH_PV_XOR_SCI_B, MASK_PV_XOR_SCI_B)
+DECLARE_INSN(pv_and_h, MATCH_PV_AND_H, MASK_PV_AND_H)
+DECLARE_INSN(pv_and_sc_h, MATCH_PV_AND_SC_H, MASK_PV_AND_SC_H)
+DECLARE_INSN(pv_and_sci_h, MATCH_PV_AND_SCI_H, MASK_PV_AND_SCI_H)
+DECLARE_INSN(pv_and_b, MATCH_PV_AND_B, MASK_PV_AND_B)
+DECLARE_INSN(pv_and_sc_b, MATCH_PV_AND_SC_B, MASK_PV_AND_SC_B)
+DECLARE_INSN(pv_and_sci_b, MATCH_PV_AND_SCI_B, MASK_PV_AND_SCI_B)
+DECLARE_INSN(pv_abs_h, MATCH_PV_ABS_H, MASK_PV_ABS_H)
+DECLARE_INSN(pv_abs_b, MATCH_PV_ABS_B, MASK_PV_ABS_B)
+DECLARE_INSN(pv_extract_h, MATCH_PV_EXTRACT_H, MASK_PV_EXTRACT_H)
+DECLARE_INSN(pv_extract_b, MATCH_PV_EXTRACT_B, MASK_PV_EXTRACT_B)
+DECLARE_INSN(pv_extractu_h, MATCH_PV_EXTRACTU_H, MASK_PV_EXTRACTU_H)
+DECLARE_INSN(pv_extractu_b, MATCH_PV_EXTRACTU_B, MASK_PV_EXTRACTU_B)
+DECLARE_INSN(pv_insert_h, MATCH_PV_INSERT_H, MASK_PV_INSERT_H)
+DECLARE_INSN(pv_insert_b, MATCH_PV_INSERT_B, MASK_PV_INSERT_B)
+DECLARE_INSN(pv_dotup_h, MATCH_PV_DOTUP_H, MASK_PV_DOTUP_H)
+DECLARE_INSN(pv_dotup_sc_h, MATCH_PV_DOTUP_SC_H, MASK_PV_DOTUP_SC_H)
+DECLARE_INSN(pv_dotup_sci_h, MATCH_PV_DOTUP_SCI_H, MASK_PV_DOTUP_SCI_H)
+DECLARE_INSN(pv_dotup_b, MATCH_PV_DOTUP_B, MASK_PV_DOTUP_B)
+DECLARE_INSN(pv_dotup_sc_b, MATCH_PV_DOTUP_SC_B, MASK_PV_DOTUP_SC_B)
+DECLARE_INSN(pv_dotup_sci_b, MATCH_PV_DOTUP_SCI_B, MASK_PV_DOTUP_SCI_B)
+DECLARE_INSN(pv_dotusp_h, MATCH_PV_DOTUSP_H, MASK_PV_DOTUSP_H)
+DECLARE_INSN(pv_dotusp_sc_h, MATCH_PV_DOTUSP_SC_H, MASK_PV_DOTUSP_SC_H)
+DECLARE_INSN(pv_dotusp_sci_h, MATCH_PV_DOTUSP_SCI_H, MASK_PV_DOTUSP_SCI_H)
+DECLARE_INSN(pv_dotusp_b, MATCH_PV_DOTUSP_B, MASK_PV_DOTUSP_B)
+DECLARE_INSN(pv_dotusp_sc_b, MATCH_PV_DOTUSP_SC_B, MASK_PV_DOTUSP_SC_B)
+DECLARE_INSN(pv_dotusp_sci_b, MATCH_PV_DOTUSP_SCI_B, MASK_PV_DOTUSP_SCI_B)
+DECLARE_INSN(pv_dotsp_h, MATCH_PV_DOTSP_H, MASK_PV_DOTSP_H)
+DECLARE_INSN(pv_dotsp_sc_h, MATCH_PV_DOTSP_SC_H, MASK_PV_DOTSP_SC_H)
+DECLARE_INSN(pv_dotsp_sci_h, MATCH_PV_DOTSP_SCI_H, MASK_PV_DOTSP_SCI_H)
+DECLARE_INSN(pv_dotsp_b, MATCH_PV_DOTSP_B, MASK_PV_DOTSP_B)
+DECLARE_INSN(pv_dotsp_sc_b, MATCH_PV_DOTSP_SC_B, MASK_PV_DOTSP_SC_B)
+DECLARE_INSN(pv_dotsp_sci_b, MATCH_PV_DOTSP_SCI_B, MASK_PV_DOTSP_SCI_B)
+DECLARE_INSN(pv_sdotup_h, MATCH_PV_SDOTUP_H, MASK_PV_SDOTUP_H)
+DECLARE_INSN(pv_sdotup_sc_h, MATCH_PV_SDOTUP_SC_H, MASK_PV_SDOTUP_SC_H)
+DECLARE_INSN(pv_sdotup_sci_h, MATCH_PV_SDOTUP_SCI_H, MASK_PV_SDOTUP_SCI_H)
+DECLARE_INSN(pv_sdotup_b, MATCH_PV_SDOTUP_B, MASK_PV_SDOTUP_B)
+DECLARE_INSN(pv_sdotup_sc_b, MATCH_PV_SDOTUP_SC_B, MASK_PV_SDOTUP_SC_B)
+DECLARE_INSN(pv_sdotup_sci_b, MATCH_PV_SDOTUP_SCI_B, MASK_PV_SDOTUP_SCI_B)
+DECLARE_INSN(pv_sdotusp_h, MATCH_PV_SDOTUSP_H, MASK_PV_SDOTUSP_H)
+DECLARE_INSN(pv_sdotusp_sc_h, MATCH_PV_SDOTUSP_SC_H, MASK_PV_SDOTUSP_SC_H)
+DECLARE_INSN(pv_sdotusp_sci_h, MATCH_PV_SDOTUSP_SCI_H, MASK_PV_SDOTUSP_SCI_H)
+DECLARE_INSN(pv_sdotusp_b, MATCH_PV_SDOTUSP_B, MASK_PV_SDOTUSP_B)
+DECLARE_INSN(pv_sdotusp_sc_b, MATCH_PV_SDOTUSP_SC_B, MASK_PV_SDOTUSP_SC_B)
+DECLARE_INSN(pv_sdotusp_sci_b, MATCH_PV_SDOTUSP_SCI_B, MASK_PV_SDOTUSP_SCI_B)
+DECLARE_INSN(pv_sdotsp_h, MATCH_PV_SDOTSP_H, MASK_PV_SDOTSP_H)
+DECLARE_INSN(pv_sdotsp_sc_h, MATCH_PV_SDOTSP_SC_H, MASK_PV_SDOTSP_SC_H)
+DECLARE_INSN(pv_sdotsp_sci_h, MATCH_PV_SDOTSP_SCI_H, MASK_PV_SDOTSP_SCI_H)
+DECLARE_INSN(pv_sdotsp_b, MATCH_PV_SDOTSP_B, MASK_PV_SDOTSP_B)
+DECLARE_INSN(pv_sdotsp_sc_b, MATCH_PV_SDOTSP_SC_B, MASK_PV_SDOTSP_SC_B)
+DECLARE_INSN(pv_sdotsp_sci_b, MATCH_PV_SDOTSP_SCI_B, MASK_PV_SDOTSP_SCI_B)
+DECLARE_INSN(pv_shuffle2_h, MATCH_PV_SHUFFLE2_H, MASK_PV_SHUFFLE2_H)
+DECLARE_INSN(pv_shuffle2_b, MATCH_PV_SHUFFLE2_B, MASK_PV_SHUFFLE2_B)
 DECLARE_INSN(flah, MATCH_FLAH, MASK_FLAH)
 DECLARE_INSN(fsah, MATCH_FSAH, MASK_FSAH)
 DECLARE_INSN(fmadd_ah, MATCH_FMADD_AH, MASK_FMADD_AH)
diff --git a/toolchain/riscv-opcodes/inst.sverilog b/toolchain/riscv-opcodes/inst.sverilog
index 00ee613df..f50df39ca 100644
--- a/toolchain/riscv-opcodes/inst.sverilog
+++ b/toolchain/riscv-opcodes/inst.sverilog
@@ -24,7 +24,6 @@ package riscv_instr;
   localparam [31:0] CUSTOM3_RD         = 32'b?????????????????100?????1111011;
   localparam [31:0] CUSTOM3_RD_RS1     = 32'b?????????????????110?????1111011;
   localparam [31:0] CUSTOM3_RD_RS1_RS2 = 32'b?????????????????111?????1111011;
-  localparam [31:0] FREP               = 32'b?????????????????????????0001011;
   localparam [31:0] SLLI_RV32          = 32'b0000000??????????001?????0010011;
   localparam [31:0] SRLI_RV32          = 32'b0000000??????????101?????0010011;
   localparam [31:0] SRAI_RV32          = 32'b0100000??????????101?????0010011;
@@ -325,7 +324,6 @@ package riscv_instr;
   localparam [31:0] C_SWSP             = 32'b????????????????110???????????10;
   localparam [31:0] C_FSWSP            = 32'b????????????????111???????????10;
   localparam [31:0] VSETVLI            = 32'b0????????????????111?????1010111;
-  localparam [31:0] VSETVL             = 32'b1000000??????????111?????1010111;
   localparam [31:0] VLE8_V             = 32'b???000?00000?????000?????0000111;
   localparam [31:0] VLE16_V            = 32'b???000?00000?????101?????0000111;
   localparam [31:0] VLE32_V            = 32'b???000?00000?????110?????0000111;
@@ -476,7 +474,6 @@ package riscv_instr;
   localparam [31:0] VFMSAC_VV          = 32'b101110???????????001?????1010111;
   localparam [31:0] VFNMSAC_VV         = 32'b101111???????????001?????1010111;
   localparam [31:0] VFCVT_XU_F_V       = 32'b010010??????00000001?????1010111;
-  localparam [31:0] VFCVT_X_F_V        = 32'b010010??????00001001?????1010111;
   localparam [31:0] VFCVT_F_XU_V       = 32'b010010??????00010001?????1010111;
   localparam [31:0] VFCVT_F_X_V        = 32'b010010??????00011001?????1010111;
   localparam [31:0] VFCVT_RTZ_XU_F_V   = 32'b010010??????00110001?????1010111;
@@ -525,9 +522,7 @@ package riscv_instr;
   localparam [31:0] VRGATHER_VX        = 32'b001100???????????100?????1010111;
   localparam [31:0] VSLIDEUP_VX        = 32'b001110???????????100?????1010111;
   localparam [31:0] VSLIDEDOWN_VX      = 32'b001111???????????100?????1010111;
-  localparam [31:0] VADC_VXM           = 32'b0100000??????????100?????1010111;
   localparam [31:0] VMADC_VXM          = 32'b010001???????????100?????1010111;
-  localparam [31:0] VSBC_VXM           = 32'b0100100??????????100?????1010111;
   localparam [31:0] VMSBC_VXM          = 32'b010011???????????100?????1010111;
   localparam [31:0] VMERGE_VXM         = 32'b0101110??????????100?????1010111;
   localparam [31:0] VMV_V_X            = 32'b010111100000?????100?????1010111;
@@ -568,9 +563,7 @@ package riscv_instr;
   localparam [31:0] VXOR_VV            = 32'b001011???????????000?????1010111;
   localparam [31:0] VRGATHER_VV        = 32'b001100???????????000?????1010111;
   localparam [31:0] VRGATHEREI16_VV    = 32'b001110???????????000?????1010111;
-  localparam [31:0] VADC_VVM           = 32'b0100000??????????000?????1010111;
   localparam [31:0] VMADC_VVM          = 32'b010001???????????000?????1010111;
-  localparam [31:0] VSBC_VVM           = 32'b0100100??????????000?????1010111;
   localparam [31:0] VMSBC_VVM          = 32'b010011???????????000?????1010111;
   localparam [31:0] VMERGE_VVM         = 32'b0101110??????????000?????1010111;
   localparam [31:0] VMV_V_V            = 32'b010111100000?????000?????1010111;
@@ -695,20 +688,14 @@ package riscv_instr;
   localparam [31:0] VWMACCU_VV         = 32'b111100???????????010?????1010111;
   localparam [31:0] VWMACC_VV          = 32'b111101???????????010?????1010111;
   localparam [31:0] VWMACCSU_VV        = 32'b111111???????????010?????1010111;
-  localparam [31:0] VAADDU_VX          = 32'b001000???????????110?????1010111;
   localparam [31:0] VAADD_VX           = 32'b001001???????????110?????1010111;
-  localparam [31:0] VASUBU_VX          = 32'b001010???????????110?????1010111;
   localparam [31:0] VASUB_VX           = 32'b001011???????????110?????1010111;
   localparam [31:0] VMV_S_X            = 32'b010000100000?????110?????1010111;
-  localparam [31:0] VSLIDE1UP_VX       = 32'b001110???????????110?????1010111;
   localparam [31:0] VSLIDE1DOWN_VX     = 32'b001111???????????110?????1010111;
-  localparam [31:0] VDIVU_VX           = 32'b100000???????????110?????1010111;
   localparam [31:0] VDIV_VX            = 32'b100001???????????110?????1010111;
   localparam [31:0] VREMU_VX           = 32'b100010???????????110?????1010111;
   localparam [31:0] VREM_VX            = 32'b100011???????????110?????1010111;
-  localparam [31:0] VMULHU_VX          = 32'b100100???????????110?????1010111;
   localparam [31:0] VMUL_VX            = 32'b100101???????????110?????1010111;
-  localparam [31:0] VMULHSU_VX         = 32'b100110???????????110?????1010111;
   localparam [31:0] VMULH_VX           = 32'b100111???????????110?????1010111;
   localparam [31:0] VMADD_VX           = 32'b101001???????????110?????1010111;
   localparam [31:0] VNMSUB_VX          = 32'b101011???????????110?????1010111;
@@ -784,6 +771,30 @@ package riscv_instr;
   localparam [31:0] CSRRWI             = 32'b?????????????????101?????1110011;
   localparam [31:0] CSRRSI             = 32'b?????????????????110?????1110011;
   localparam [31:0] CSRRCI             = 32'b?????????????????111?????1110011;
+  localparam [31:0] P_LB_IRPOST        = 32'b?????????????????000?????0001011;
+  localparam [31:0] P_LBU_IRPOST       = 32'b?????????????????100?????0001011;
+  localparam [31:0] P_LH_IRPOST        = 32'b?????????????????001?????0001011;
+  localparam [31:0] P_LHU_IRPOST       = 32'b?????????????????101?????0001011;
+  localparam [31:0] P_LW_IRPOST        = 32'b?????????????????010?????0001011;
+  localparam [31:0] P_LB_RRPOST        = 32'b0000000??????????111?????0001011;
+  localparam [31:0] P_LBU_RRPOST       = 32'b0100000??????????111?????0001011;
+  localparam [31:0] P_LH_RRPOST        = 32'b0001000??????????111?????0001011;
+  localparam [31:0] P_LHU_RRPOST       = 32'b0101000??????????111?????0001011;
+  localparam [31:0] P_LW_RRPOST        = 32'b0010000??????????111?????0001011;
+  localparam [31:0] P_LB_RR            = 32'b0000000??????????111?????0000011;
+  localparam [31:0] P_LBU_RR           = 32'b0100000??????????111?????0000011;
+  localparam [31:0] P_LH_RR            = 32'b0001000??????????111?????0000011;
+  localparam [31:0] P_LHU_RR           = 32'b0101000??????????111?????0000011;
+  localparam [31:0] P_LW_RR            = 32'b0010000??????????111?????0000011;
+  localparam [31:0] P_SB_IRPOST        = 32'b?????????????????000?????0101011;
+  localparam [31:0] P_SH_IRPOST        = 32'b?????????????????001?????0101011;
+  localparam [31:0] P_SW_IRPOST        = 32'b?????????????????010?????0101011;
+  localparam [31:0] P_SB_RRPOST        = 32'b0000000??????????100?????0101011;
+  localparam [31:0] P_SH_RRPOST        = 32'b0000000??????????101?????0101011;
+  localparam [31:0] P_SW_RRPOST        = 32'b0000000??????????110?????0101011;
+  localparam [31:0] P_SB_RR            = 32'b0000000??????????100?????0100011;
+  localparam [31:0] P_SH_RR            = 32'b0000000??????????101?????0100011;
+  localparam [31:0] P_SW_RR            = 32'b0000000??????????110?????0100011;
   localparam [31:0] P_ABS              = 32'b000001000000?????000?????0110011;
   localparam [31:0] P_SLET             = 32'b0000010??????????010?????0110011;
   localparam [31:0] P_SLETU            = 32'b0000010??????????011?????0110011;
@@ -801,6 +812,138 @@ package riscv_instr;
   localparam [31:0] P_CLIPUR           = 32'b0001010??????????110?????0110011;
   localparam [31:0] P_BEQIMM           = 32'b?????????????????010?????1100011;
   localparam [31:0] P_BNEIMM           = 32'b?????????????????011?????1100011;
+  localparam [31:0] P_MAC              = 32'b0100001??????????000?????0110011;
+  localparam [31:0] P_MSU              = 32'b0100001??????????001?????0110011;
+  localparam [31:0] PV_ADD_H           = 32'b0000000??????????000?????1010111;
+  localparam [31:0] PV_ADD_SC_H        = 32'b0000000??????????100?????1010111;
+  localparam [31:0] PV_ADD_SCI_H       = 32'b000000???????????110?????1010111;
+  localparam [31:0] PV_ADD_B           = 32'b0000000??????????001?????1010111;
+  localparam [31:0] PV_ADD_SC_B        = 32'b0000000??????????101?????1010111;
+  localparam [31:0] PV_ADD_SCI_B       = 32'b000000???????????111?????1010111;
+  localparam [31:0] PV_SUB_H           = 32'b0000100??????????000?????1010111;
+  localparam [31:0] PV_SUB_SC_H        = 32'b0000100??????????100?????1010111;
+  localparam [31:0] PV_SUB_SCI_H       = 32'b000010???????????110?????1010111;
+  localparam [31:0] PV_SUB_B           = 32'b0000100??????????001?????1010111;
+  localparam [31:0] PV_SUB_SC_B        = 32'b0000100??????????101?????1010111;
+  localparam [31:0] PV_SUB_SCI_B       = 32'b000010???????????111?????1010111;
+  localparam [31:0] PV_AVG_H           = 32'b0001000??????????000?????1010111;
+  localparam [31:0] PV_AVG_SC_H        = 32'b0001000??????????100?????1010111;
+  localparam [31:0] PV_AVG_SCI_H       = 32'b000100???????????110?????1010111;
+  localparam [31:0] PV_AVG_B           = 32'b0001000??????????001?????1010111;
+  localparam [31:0] PV_AVG_SC_B        = 32'b0001000??????????101?????1010111;
+  localparam [31:0] PV_AVG_SCI_B       = 32'b000100???????????111?????1010111;
+  localparam [31:0] PV_AVGU_H          = 32'b0001100??????????000?????1010111;
+  localparam [31:0] PV_AVGU_SC_H       = 32'b0001100??????????100?????1010111;
+  localparam [31:0] PV_AVGU_SCI_H      = 32'b000110???????????110?????1010111;
+  localparam [31:0] PV_AVGU_B          = 32'b0001100??????????001?????1010111;
+  localparam [31:0] PV_AVGU_SC_B       = 32'b0001100??????????101?????1010111;
+  localparam [31:0] PV_AVGU_SCI_B      = 32'b000110???????????111?????1010111;
+  localparam [31:0] PV_MIN_H           = 32'b0010000??????????000?????1010111;
+  localparam [31:0] PV_MIN_SC_H        = 32'b0010000??????????100?????1010111;
+  localparam [31:0] PV_MIN_SCI_H       = 32'b001000???????????110?????1010111;
+  localparam [31:0] PV_MIN_B           = 32'b0010000??????????001?????1010111;
+  localparam [31:0] PV_MIN_SC_B        = 32'b0010000??????????101?????1010111;
+  localparam [31:0] PV_MIN_SCI_B       = 32'b001000???????????111?????1010111;
+  localparam [31:0] PV_MINU_H          = 32'b0010100??????????000?????1010111;
+  localparam [31:0] PV_MINU_SC_H       = 32'b0010100??????????100?????1010111;
+  localparam [31:0] PV_MINU_SCI_H      = 32'b001010???????????110?????1010111;
+  localparam [31:0] PV_MINU_B          = 32'b0010100??????????001?????1010111;
+  localparam [31:0] PV_MINU_SC_B       = 32'b0010100??????????101?????1010111;
+  localparam [31:0] PV_MINU_SCI_B      = 32'b001010???????????111?????1010111;
+  localparam [31:0] PV_MAX_H           = 32'b0011000??????????000?????1010111;
+  localparam [31:0] PV_MAX_SC_H        = 32'b0011000??????????100?????1010111;
+  localparam [31:0] PV_MAX_SCI_H       = 32'b001100???????????110?????1010111;
+  localparam [31:0] PV_MAX_B           = 32'b0011000??????????001?????1010111;
+  localparam [31:0] PV_MAX_SC_B        = 32'b0011000??????????101?????1010111;
+  localparam [31:0] PV_MAX_SCI_B       = 32'b001100???????????111?????1010111;
+  localparam [31:0] PV_MAXU_H          = 32'b0011100??????????000?????1010111;
+  localparam [31:0] PV_MAXU_SC_H       = 32'b0011100??????????100?????1010111;
+  localparam [31:0] PV_MAXU_SCI_H      = 32'b001110???????????110?????1010111;
+  localparam [31:0] PV_MAXU_B          = 32'b0011100??????????001?????1010111;
+  localparam [31:0] PV_MAXU_SC_B       = 32'b0011100??????????101?????1010111;
+  localparam [31:0] PV_MAXU_SCI_B      = 32'b001110???????????111?????1010111;
+  localparam [31:0] PV_SRL_H           = 32'b0100000??????????000?????1010111;
+  localparam [31:0] PV_SRL_SC_H        = 32'b0100000??????????100?????1010111;
+  localparam [31:0] PV_SRL_SCI_H       = 32'b010000???????????110?????1010111;
+  localparam [31:0] PV_SRL_B           = 32'b0100000??????????001?????1010111;
+  localparam [31:0] PV_SRL_SC_B        = 32'b0100000??????????101?????1010111;
+  localparam [31:0] PV_SRL_SCI_B       = 32'b010000???????????111?????1010111;
+  localparam [31:0] PV_SRA_H           = 32'b0100100??????????000?????1010111;
+  localparam [31:0] PV_SRA_SC_H        = 32'b0100100??????????100?????1010111;
+  localparam [31:0] PV_SRA_SCI_H       = 32'b010010???????????110?????1010111;
+  localparam [31:0] PV_SRA_B           = 32'b0100100??????????001?????1010111;
+  localparam [31:0] PV_SRA_SC_B        = 32'b0100100??????????101?????1010111;
+  localparam [31:0] PV_SRA_SCI_B       = 32'b010010???????????111?????1010111;
+  localparam [31:0] PV_SLL_H           = 32'b0101000??????????000?????1010111;
+  localparam [31:0] PV_SLL_SC_H        = 32'b0101000??????????100?????1010111;
+  localparam [31:0] PV_SLL_SCI_H       = 32'b010100???????????110?????1010111;
+  localparam [31:0] PV_SLL_B           = 32'b0101000??????????001?????1010111;
+  localparam [31:0] PV_SLL_SC_B        = 32'b0101000??????????101?????1010111;
+  localparam [31:0] PV_SLL_SCI_B       = 32'b010100???????????111?????1010111;
+  localparam [31:0] PV_OR_H            = 32'b0101100??????????000?????1010111;
+  localparam [31:0] PV_OR_SC_H         = 32'b0101100??????????100?????1010111;
+  localparam [31:0] PV_OR_SCI_H        = 32'b010110???????????110?????1010111;
+  localparam [31:0] PV_OR_B            = 32'b0101100??????????001?????1010111;
+  localparam [31:0] PV_OR_SC_B         = 32'b0101100??????????101?????1010111;
+  localparam [31:0] PV_OR_SCI_B        = 32'b010110???????????111?????1010111;
+  localparam [31:0] PV_XOR_H           = 32'b0110000??????????000?????1010111;
+  localparam [31:0] PV_XOR_SC_H        = 32'b0110000??????????100?????1010111;
+  localparam [31:0] PV_XOR_SCI_H       = 32'b011000???????????110?????1010111;
+  localparam [31:0] PV_XOR_B           = 32'b0110000??????????001?????1010111;
+  localparam [31:0] PV_XOR_SC_B        = 32'b0110000??????????101?????1010111;
+  localparam [31:0] PV_XOR_SCI_B       = 32'b011000???????????111?????1010111;
+  localparam [31:0] PV_AND_H           = 32'b0110100??????????000?????1010111;
+  localparam [31:0] PV_AND_SC_H        = 32'b0110100??????????100?????1010111;
+  localparam [31:0] PV_AND_SCI_H       = 32'b011010???????????110?????1010111;
+  localparam [31:0] PV_AND_B           = 32'b0110100??????????001?????1010111;
+  localparam [31:0] PV_AND_SC_B        = 32'b0110100??????????101?????1010111;
+  localparam [31:0] PV_AND_SCI_B       = 32'b011010???????????111?????1010111;
+  localparam [31:0] PV_ABS_H           = 32'b011100000000?????000?????1010111;
+  localparam [31:0] PV_ABS_B           = 32'b011100000000?????001?????1010111;
+  localparam [31:0] PV_EXTRACT_H       = 32'b011110???????????110?????1010111;
+  localparam [31:0] PV_EXTRACT_B       = 32'b011110???????????111?????1010111;
+  localparam [31:0] PV_EXTRACTU_H      = 32'b100100???????????110?????1010111;
+  localparam [31:0] PV_EXTRACTU_B      = 32'b100100???????????111?????1010111;
+  localparam [31:0] PV_INSERT_H        = 32'b101100???????????110?????1010111;
+  localparam [31:0] PV_INSERT_B        = 32'b101100???????????111?????1010111;
+  localparam [31:0] PV_DOTUP_H         = 32'b1000000??????????000?????1010111;
+  localparam [31:0] PV_DOTUP_SC_H      = 32'b1000000??????????100?????1010111;
+  localparam [31:0] PV_DOTUP_SCI_H     = 32'b100000???????????110?????1010111;
+  localparam [31:0] PV_DOTUP_B         = 32'b1000000??????????001?????1010111;
+  localparam [31:0] PV_DOTUP_SC_B      = 32'b1000000??????????101?????1010111;
+  localparam [31:0] PV_DOTUP_SCI_B     = 32'b100000???????????111?????1010111;
+  localparam [31:0] PV_DOTUSP_H        = 32'b1000100??????????000?????1010111;
+  localparam [31:0] PV_DOTUSP_SC_H     = 32'b1000100??????????100?????1010111;
+  localparam [31:0] PV_DOTUSP_SCI_H    = 32'b100010???????????110?????1010111;
+  localparam [31:0] PV_DOTUSP_B        = 32'b1000100??????????001?????1010111;
+  localparam [31:0] PV_DOTUSP_SC_B     = 32'b1000100??????????101?????1010111;
+  localparam [31:0] PV_DOTUSP_SCI_B    = 32'b100010???????????111?????1010111;
+  localparam [31:0] PV_DOTSP_H         = 32'b1001100??????????000?????1010111;
+  localparam [31:0] PV_DOTSP_SC_H      = 32'b1001100??????????100?????1010111;
+  localparam [31:0] PV_DOTSP_SCI_H     = 32'b100110???????????110?????1010111;
+  localparam [31:0] PV_DOTSP_B         = 32'b1001100??????????001?????1010111;
+  localparam [31:0] PV_DOTSP_SC_B      = 32'b1001100??????????101?????1010111;
+  localparam [31:0] PV_DOTSP_SCI_B     = 32'b100110???????????111?????1010111;
+  localparam [31:0] PV_SDOTUP_H        = 32'b1010000??????????000?????1010111;
+  localparam [31:0] PV_SDOTUP_SC_H     = 32'b1010000??????????100?????1010111;
+  localparam [31:0] PV_SDOTUP_SCI_H    = 32'b101000???????????110?????1010111;
+  localparam [31:0] PV_SDOTUP_B        = 32'b1010000??????????001?????1010111;
+  localparam [31:0] PV_SDOTUP_SC_B     = 32'b1010000??????????101?????1010111;
+  localparam [31:0] PV_SDOTUP_SCI_B    = 32'b101000???????????111?????1010111;
+  localparam [31:0] PV_SDOTUSP_H       = 32'b1010100??????????000?????1010111;
+  localparam [31:0] PV_SDOTUSP_SC_H    = 32'b1010100??????????100?????1010111;
+  localparam [31:0] PV_SDOTUSP_SCI_H   = 32'b101010???????????110?????1010111;
+  localparam [31:0] PV_SDOTUSP_B       = 32'b1010100??????????001?????1010111;
+  localparam [31:0] PV_SDOTUSP_SC_B    = 32'b1010100??????????101?????1010111;
+  localparam [31:0] PV_SDOTUSP_SCI_B   = 32'b101010???????????111?????1010111;
+  localparam [31:0] PV_SDOTSP_H        = 32'b1011100??????????000?????1010111;
+  localparam [31:0] PV_SDOTSP_SC_H     = 32'b1011100??????????100?????1010111;
+  localparam [31:0] PV_SDOTSP_SCI_H    = 32'b101110???????????110?????1010111;
+  localparam [31:0] PV_SDOTSP_B        = 32'b1011100??????????001?????1010111;
+  localparam [31:0] PV_SDOTSP_SC_B     = 32'b1011100??????????101?????1010111;
+  localparam [31:0] PV_SDOTSP_SCI_B    = 32'b101110???????????111?????1010111;
+  localparam [31:0] PV_SHUFFLE2_H      = 32'b1100100??????????000?????1010111;
+  localparam [31:0] PV_SHUFFLE2_B      = 32'b1100100??????????001?????1010111;
   localparam [31:0] FLAH               = 32'b?????????????????001?????0000111;
   localparam [31:0] FSAH               = 32'b?????????????????001?????0100111;
   localparam [31:0] FMADD_AH           = 32'b?????10??????????101?????1000011;
diff --git a/toolchain/riscv-opcodes/opcodes-rvv b/toolchain/riscv-opcodes/opcodes-rvv
index d34cdd81c..f2e6ba6bc 100644
--- a/toolchain/riscv-opcodes/opcodes-rvv
+++ b/toolchain/riscv-opcodes/opcodes-rvv
@@ -8,8 +8,8 @@
 
 # configuration setting
 # https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
-vsetvli      31=0 zimm11         rs1 14..12=0x7 rd 6..0=0x57
-vsetvl       31=1 30..25=0x0 rs2 rs1 14..12=0x7 rd 6..0=0x57
+@vsetvli      31=0 zimm11         rs1 14..12=0x7 rd 6..0=0x57
+#vsetvl       31=1 30..25=0x0 rs2 rs1 14..12=0x7 rd 6..0=0x57
 
 #
 # Vector Loads and Store
@@ -118,37 +118,37 @@ vs8r.v         31..29=7 28=0 27..26=0 25=1 24..20=0x08 rs1 14..12=0x0 vs3 6..0=0
 # Vector Floating-Point Instructions
 # https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#14-vector-floating-point-instructions
 # OPFVF
-vfadd.vf        31..26=0x00 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfsub.vf        31..26=0x02 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfmin.vf        31..26=0x04 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfmax.vf        31..26=0x06 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfsgnj.vf       31..26=0x08 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfsgnjn.vf      31..26=0x09 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfsgnjx.vf      31..26=0x0a vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfslide1up.vf   31..26=0x0e vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfadd.vf        31..26=0x00 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfsub.vf        31..26=0x02 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfmin.vf        31..26=0x04 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfmax.vf        31..26=0x06 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfsgnj.vf       31..26=0x08 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfsgnjn.vf      31..26=0x09 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfsgnjx.vf      31..26=0x0a vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfslide1up.vf   31..26=0x0e vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfslide1down.vf 31..26=0x0f vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfmv.s.f        31..26=0x10 25=1 24..20=0 rs1      14..12=0x5 vd 6..0=0x57
 
 vfmerge.vfm    31..26=0x17 25=0 vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfmv.v.f       31..26=0x17 25=1 24..20=0 rs1 14..12=0x5 vd 6..0=0x57
-vmfeq.vf       31..26=0x18 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vmfeq.vf       31..26=0x18 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vmfle.vf       31..26=0x19 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vmflt.vf       31..26=0x1b vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vmfne.vf       31..26=0x1c vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vmfgt.vf       31..26=0x1d vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vmfge.vf       31..26=0x1f vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 
-vfdiv.vf       31..26=0x20 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfdiv.vf       31..26=0x20 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfrdiv.vf      31..26=0x21 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfmul.vf       31..26=0x24 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfrsub.vf      31..26=0x27 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfmadd.vf      31..26=0x28 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfmadd.vf      31..26=0x28 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfnmadd.vf     31..26=0x29 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfmsub.vf      31..26=0x2a vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfmsub.vf      31..26=0x2a vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfnmsub.vf     31..26=0x2b vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfmacc.vf      31..26=0x2c vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfnmacc.vf     31..26=0x2d vm vs2 rs1 14..12=0x5 vd 6..0=0x57
-vfmsac.vf      31..26=0x2e vm vs2 rs1 14..12=0x5 vd 6..0=0x57
+@vfmsac.vf      31..26=0x2e vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfnmsac.vf     31..26=0x2f vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 
 vfwadd.vf      31..26=0x30 vm vs2 rs1 14..12=0x5 vd 6..0=0x57
@@ -162,58 +162,58 @@ vfwmsac.vf     31..26=0x3e vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 vfwnmsac.vf    31..26=0x3f vm vs2 rs1 14..12=0x5 vd 6..0=0x57
 
 # OPFVV
-vfadd.vv       31..26=0x00 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfadd.vv       31..26=0x00 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfredsum.vs    31..26=0x01 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfsub.vv       31..26=0x02 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfsub.vv       31..26=0x02 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfredosum.vs   31..26=0x03 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfmin.vv       31..26=0x04 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfmin.vv       31..26=0x04 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfredmin.vs    31..26=0x05 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfmax.vv       31..26=0x06 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfmax.vv       31..26=0x06 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfredmax.vs    31..26=0x07 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfsgnj.vv      31..26=0x08 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfsgnj.vv      31..26=0x08 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfsgnjn.vv     31..26=0x09 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfsgnjx.vv     31..26=0x0a vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfsgnjx.vv     31..26=0x0a vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfmv.f.s       31..26=0x10 25=1 vs2      19..15=0 14..12=0x1 rd 6..0=0x57
 
-vmfeq.vv       31..26=0x18 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vmfeq.vv       31..26=0x18 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vmfle.vv       31..26=0x19 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vmflt.vv       31..26=0x1b vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vmfne.vv       31..26=0x1c vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vmfne.vv       31..26=0x1c vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 
-vfdiv.vv       31..26=0x20 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfdiv.vv       31..26=0x20 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfmul.vv       31..26=0x24 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfmadd.vv      31..26=0x28 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfmadd.vv      31..26=0x28 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfnmadd.vv     31..26=0x29 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfmsub.vv      31..26=0x2a vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfmsub.vv      31..26=0x2a vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfnmsub.vv     31..26=0x2b vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfmacc.vv      31..26=0x2c vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfnmacc.vv     31..26=0x2d vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfmsac.vv      31..26=0x2e vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfmsac.vv      31..26=0x2e vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfnmsac.vv     31..26=0x2f vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 
-vfcvt.xu.f.v     31..26=0x12 vm vs2 19..15=0x00 14..12=0x1 vd 6..0=0x57
-vfcvt.x.f.v      31..26=0x12 vm vs2 19..15=0x01 14..12=0x1 vd 6..0=0x57
-vfcvt.f.xu.v     31..26=0x12 vm vs2 19..15=0x02 14..12=0x1 vd 6..0=0x57
-vfcvt.f.x.v      31..26=0x12 vm vs2 19..15=0x03 14..12=0x1 vd 6..0=0x57
-vfcvt.rtz.xu.f.v 31..26=0x12 vm vs2 19..15=0x06 14..12=0x1 vd 6..0=0x57
-vfcvt.rtz.x.f.v  31..26=0x12 vm vs2 19..15=0x07 14..12=0x1 vd 6..0=0x57
-
-vfwcvt.xu.f.v     31..26=0x12 vm vs2 19..15=0x08 14..12=0x1 vd 6..0=0x57
-vfwcvt.x.f.v      31..26=0x12 vm vs2 19..15=0x09 14..12=0x1 vd 6..0=0x57
-vfwcvt.f.xu.v     31..26=0x12 vm vs2 19..15=0x0A 14..12=0x1 vd 6..0=0x57
-vfwcvt.f.x.v      31..26=0x12 vm vs2 19..15=0x0B 14..12=0x1 vd 6..0=0x57
-vfwcvt.f.f.v      31..26=0x12 vm vs2 19..15=0x0C 14..12=0x1 vd 6..0=0x57
-vfwcvt.rtz.xu.f.v 31..26=0x12 vm vs2 19..15=0x0E 14..12=0x1 vd 6..0=0x57
-vfwcvt.rtz.x.f.v  31..26=0x12 vm vs2 19..15=0x0F 14..12=0x1 vd 6..0=0x57
-
-vfncvt.xu.f.w     31..26=0x12 vm vs2 19..15=0x10 14..12=0x1 vd 6..0=0x57
-vfncvt.x.f.w      31..26=0x12 vm vs2 19..15=0x11 14..12=0x1 vd 6..0=0x57
-vfncvt.f.xu.w     31..26=0x12 vm vs2 19..15=0x12 14..12=0x1 vd 6..0=0x57
-vfncvt.f.x.w      31..26=0x12 vm vs2 19..15=0x13 14..12=0x1 vd 6..0=0x57
-vfncvt.f.f.w      31..26=0x12 vm vs2 19..15=0x14 14..12=0x1 vd 6..0=0x57
-vfncvt.rod.f.f.w  31..26=0x12 vm vs2 19..15=0x15 14..12=0x1 vd 6..0=0x57
-vfncvt.rtz.xu.f.w 31..26=0x12 vm vs2 19..15=0x16 14..12=0x1 vd 6..0=0x57
-vfncvt.rtz.x.f.w  31..26=0x12 vm vs2 19..15=0x17 14..12=0x1 vd 6..0=0x57
+@vfcvt.xu.f.v     31..26=0x12 vm vs2 19..15=0x00 14..12=0x1 vd 6..0=0x57
+#vfcvt.x.f.v      31..26=0x12 vm vs2 19..15=0x01 14..12=0x1 vd 6..0=0x57
+@vfcvt.f.xu.v     31..26=0x12 vm vs2 19..15=0x02 14..12=0x1 vd 6..0=0x57
+@vfcvt.f.x.v      31..26=0x12 vm vs2 19..15=0x03 14..12=0x1 vd 6..0=0x57
+@vfcvt.rtz.xu.f.v 31..26=0x12 vm vs2 19..15=0x06 14..12=0x1 vd 6..0=0x57
+@vfcvt.rtz.x.f.v  31..26=0x12 vm vs2 19..15=0x07 14..12=0x1 vd 6..0=0x57
+
+@vfwcvt.xu.f.v     31..26=0x12 vm vs2 19..15=0x08 14..12=0x1 vd 6..0=0x57
+@vfwcvt.x.f.v      31..26=0x12 vm vs2 19..15=0x09 14..12=0x1 vd 6..0=0x57
+@vfwcvt.f.xu.v     31..26=0x12 vm vs2 19..15=0x0A 14..12=0x1 vd 6..0=0x57
+@vfwcvt.f.x.v      31..26=0x12 vm vs2 19..15=0x0B 14..12=0x1 vd 6..0=0x57
+@vfwcvt.f.f.v      31..26=0x12 vm vs2 19..15=0x0C 14..12=0x1 vd 6..0=0x57
+@vfwcvt.rtz.xu.f.v 31..26=0x12 vm vs2 19..15=0x0E 14..12=0x1 vd 6..0=0x57
+@vfwcvt.rtz.x.f.v  31..26=0x12 vm vs2 19..15=0x0F 14..12=0x1 vd 6..0=0x57
+
+@vfncvt.xu.f.w     31..26=0x12 vm vs2 19..15=0x10 14..12=0x1 vd 6..0=0x57
+@vfncvt.x.f.w      31..26=0x12 vm vs2 19..15=0x11 14..12=0x1 vd 6..0=0x57
+@vfncvt.f.xu.w     31..26=0x12 vm vs2 19..15=0x12 14..12=0x1 vd 6..0=0x57
+@vfncvt.f.x.w      31..26=0x12 vm vs2 19..15=0x13 14..12=0x1 vd 6..0=0x57
+@vfncvt.f.f.w      31..26=0x12 vm vs2 19..15=0x14 14..12=0x1 vd 6..0=0x57
+@vfncvt.rod.f.f.w  31..26=0x12 vm vs2 19..15=0x15 14..12=0x1 vd 6..0=0x57
+@vfncvt.rtz.xu.f.w 31..26=0x12 vm vs2 19..15=0x16 14..12=0x1 vd 6..0=0x57
+@vfncvt.rtz.x.f.w  31..26=0x12 vm vs2 19..15=0x17 14..12=0x1 vd 6..0=0x57
 
 vfsqrt.v       31..26=0x13 vm vs2 19..15=0x00 14..12=0x1 vd 6..0=0x57
 vfrsqrte7.v    31..26=0x13 vm vs2 19..15=0x04 14..12=0x1 vd 6..0=0x57
@@ -222,7 +222,7 @@ vfclass.v      31..26=0x13 vm vs2 19..15=0x10 14..12=0x1 vd 6..0=0x57
 
 vfwadd.vv      31..26=0x30 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfwredsum.vs   31..26=0x31 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
-vfwsub.vv      31..26=0x32 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
+@vfwsub.vv      31..26=0x32 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfwredosum.vs  31..26=0x33 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfwadd.wv      31..26=0x34 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfwsub.wv      31..26=0x36 vm vs2 vs1 14..12=0x1 vd 6..0=0x57
@@ -234,48 +234,48 @@ vfwmsac.vv     31..26=0x3e vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 vfwnmsac.vv    31..26=0x3f vm vs2 vs1 14..12=0x1 vd 6..0=0x57
 
 # OPIVX
-vadd.vx        31..26=0x00 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vsub.vx        31..26=0x02 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vadd.vx        31..26=0x00 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vsub.vx        31..26=0x02 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vrsub.vx       31..26=0x03 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vminu.vx       31..26=0x04 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vminu.vx       31..26=0x04 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmin.vx        31..26=0x05 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vmaxu.vx       31..26=0x06 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vmaxu.vx       31..26=0x06 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmax.vx        31..26=0x07 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vand.vx        31..26=0x09 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vor.vx         31..26=0x0a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vor.vx         31..26=0x0a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vxor.vx        31..26=0x0b vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vrgather.vx    31..26=0x0c vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vslideup.vx    31..26=0x0e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vrgather.vx    31..26=0x0c vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vslideup.vx    31..26=0x0e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vslidedown.vx  31..26=0x0f vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 
-vadc.vxm       31..26=0x10 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
+#vadc.vxm       31..26=0x10 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmadc.vxm      31..26=0x11 vm   vs2 rs1 14..12=0x4 vd 6..0=0x57
-vsbc.vxm       31..26=0x12 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
+#vsbc.vxm       31..26=0x12 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmsbc.vxm      31..26=0x13 vm   vs2 rs1 14..12=0x4 vd 6..0=0x57
-vmerge.vxm     31..26=0x17 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
-vmv.v.x        31..26=0x17 25=1 24..20=0 rs1 14..12=0x4 vd 6..0=0x57
-vmseq.vx       31..26=0x18 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vmsne.vx       31..26=0x19 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vmsltu.vx      31..26=0x1a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vmerge.vxm     31..26=0x17 25=0 vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vmv.v.x        31..26=0x17 25=1 24..20=0 rs1 14..12=0x4 vd 6..0=0x57
+@vmseq.vx       31..26=0x18 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vmsne.vx       31..26=0x19 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vmsltu.vx      31..26=0x1a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmslt.vx       31..26=0x1b vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmsleu.vx      31..26=0x1c vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmsle.vx       31..26=0x1d vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmsgtu.vx      31..26=0x1e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vmsgt.vx       31..26=0x1f vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 
-vsaddu.vx      31..26=0x20 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vsaddu.vx      31..26=0x20 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vsadd.vx       31..26=0x21 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vssubu.vx      31..26=0x22 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vssubu.vx      31..26=0x22 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vssub.vx       31..26=0x23 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vsll.vx        31..26=0x25 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vsmul.vx       31..26=0x27 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vsrl.vx        31..26=0x28 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vsrl.vx        31..26=0x28 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vsra.vx        31..26=0x29 vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vssrl.vx       31..26=0x2a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vssrl.vx       31..26=0x2a vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vssra.vx       31..26=0x2b vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vnsrl.wx       31..26=0x2c vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vnsra.wx       31..26=0x2d vm vs2 rs1 14..12=0x4 vd 6..0=0x57
-vnclipu.wx     31..26=0x2e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
+@vnclipu.wx     31..26=0x2e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vnclip.wx      31..26=0x2f vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 
 vqmaccu.vx     31..26=0x3c vm vs2 rs1 14..12=0x4 vd 6..0=0x57
@@ -284,44 +284,44 @@ vqmaccus.vx    31..26=0x3e vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 vqmaccsu.vx    31..26=0x3f vm vs2 rs1 14..12=0x4 vd 6..0=0x57
 
 # OPIVV
-vadd.vv         31..26=0x00 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vsub.vv         31..26=0x02 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vminu.vv        31..26=0x04 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vadd.vv         31..26=0x00 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vsub.vv         31..26=0x02 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vminu.vv        31..26=0x04 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmin.vv         31..26=0x05 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vmaxu.vv        31..26=0x06 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vmaxu.vv        31..26=0x06 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmax.vv         31..26=0x07 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vand.vv         31..26=0x09 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vor.vv          31..26=0x0a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vor.vv          31..26=0x0a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vxor.vv         31..26=0x0b vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vrgather.vv     31..26=0x0c vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vrgatherei16.vv 31..26=0x0e vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vrgather.vv     31..26=0x0c vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vrgatherei16.vv 31..26=0x0e vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 
-vadc.vvm       31..26=0x10 25=0 vs2 vs1 14..12=0x0 vd 6..0=0x57
+#vadc.vvm       31..26=0x10 25=0 vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmadc.vvm      31..26=0x11 vm   vs2 vs1 14..12=0x0 vd 6..0=0x57
-vsbc.vvm       31..26=0x12 25=0 vs2 vs1 14..12=0x0 vd 6..0=0x57
+#vsbc.vvm       31..26=0x12 25=0 vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmsbc.vvm      31..26=0x13 vm   vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmerge.vvm     31..26=0x17 25=0 vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmv.v.v        31..26=0x17 25=1 24..20=0 vs1 14..12=0x0 vd 6..0=0x57
-vmseq.vv       31..26=0x18 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vmseq.vv       31..26=0x18 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmsne.vv       31..26=0x19 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vmsltu.vv      31..26=0x1a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vmsltu.vv      31..26=0x1a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmslt.vv       31..26=0x1b vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vmsleu.vv      31..26=0x1c vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vmsleu.vv      31..26=0x1c vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vmsle.vv       31..26=0x1d vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 
-vsaddu.vv      31..26=0x20 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vsaddu.vv      31..26=0x20 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vsadd.vv       31..26=0x21 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vssubu.vv      31..26=0x22 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vssubu.vv      31..26=0x22 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vssub.vv       31..26=0x23 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vsll.vv        31..26=0x25 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vsmul.vv       31..26=0x27 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vsrl.vv        31..26=0x28 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vsrl.vv        31..26=0x28 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vsra.vv        31..26=0x29 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vssrl.vv       31..26=0x2a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vssrl.vv       31..26=0x2a vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vssra.vv       31..26=0x2b vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vnsrl.wv       31..26=0x2c vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vnsra.wv       31..26=0x2d vm vs2 vs1 14..12=0x0 vd 6..0=0x57
-vnclipu.wv     31..26=0x2e vm vs2 vs1 14..12=0x0 vd 6..0=0x57
+@vnclipu.wv     31..26=0x2e vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 vnclip.wv      31..26=0x2f vm vs2 vs1 14..12=0x0 vd 6..0=0x57
 
 vwredsumu.vs   31..26=0x30 vm vs2 vs1 14..12=0x0 vd 6..0=0x57
@@ -395,9 +395,9 @@ vsext.vf4      31..26=0x12 vm vs2 19..15=5 14..12=0x2 vd 6..0=0x57
 vzext.vf2      31..26=0x12 vm vs2 19..15=6 14..12=0x2 vd 6..0=0x57
 vsext.vf2      31..26=0x12 vm vs2 19..15=7 14..12=0x2 vd 6..0=0x57
 
-vcompress.vm   31..26=0x17 25=1 vs2 vs1 14..12=0x2 vd 6..0=0x57
+@vcompress.vm   31..26=0x17 25=1 vs2 vs1 14..12=0x2 vd 6..0=0x57
 vmandnot.mm    31..26=0x18 vm vs2 vs1 14..12=0x2 vd 6..0=0x57
-vmand.mm       31..26=0x19 vm vs2 vs1 14..12=0x2 vd 6..0=0x57
+@vmand.mm       31..26=0x19 vm vs2 vs1 14..12=0x2 vd 6..0=0x57
 vmor.mm        31..26=0x1a vm vs2 vs1 14..12=0x2 vd 6..0=0x57
 vmxor.mm       31..26=0x1b vm vs2 vs1 14..12=0x2 vd 6..0=0x57
 vmornot.mm     31..26=0x1c vm vs2 vs1 14..12=0x2 vd 6..0=0x57
@@ -442,22 +442,22 @@ vwmacc.vv      31..26=0x3d vm vs2 vs1 14..12=0x2 vd 6..0=0x57
 vwmaccsu.vv    31..26=0x3f vm vs2 vs1 14..12=0x2 vd 6..0=0x57
 
 # OPMVX
-vaaddu.vx      31..26=0x08 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+#vaaddu.vx      31..26=0x08 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vaadd.vx       31..26=0x09 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
-vasubu.vx      31..26=0x0a vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+#vasubu.vx      31..26=0x0a vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vasub.vx       31..26=0x0b vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 
-vmv.s.x        31..26=0x10 25=1 24..20=0 rs1 14..12=0x6 vd 6..0=0x57
-vslide1up.vx   31..26=0x0e vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+@vmv.s.x        31..26=0x10 25=1 24..20=0 rs1 14..12=0x6 vd 6..0=0x57
+#vslide1up.vx   31..26=0x0e vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vslide1down.vx 31..26=0x0f vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 
-vdivu.vx       31..26=0x20 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+#vdivu.vx       31..26=0x20 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vdiv.vx        31..26=0x21 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
-vremu.vx       31..26=0x22 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+@vremu.vx       31..26=0x22 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vrem.vx        31..26=0x23 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
-vmulhu.vx      31..26=0x24 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+#vmulhu.vx      31..26=0x24 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vmul.vx        31..26=0x25 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
-vmulhsu.vx     31..26=0x26 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
+#vmulhsu.vx     31..26=0x26 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vmulh.vx       31..26=0x27 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vmadd.vx       31..26=0x29 vm vs2 rs1 14..12=0x6 vd 6..0=0x57
 vnmsub.vx      31..26=0x2b vm vs2 rs1 14..12=0x6 vd 6..0=0x57
diff --git a/toolchain/riscv-opcodes/opcodes-xpulpimg_CUSTOM b/toolchain/riscv-opcodes/opcodes-xpulpimg_CUSTOM
index daf000556..1e4bc4956 100644
--- a/toolchain/riscv-opcodes/opcodes-xpulpimg_CUSTOM
+++ b/toolchain/riscv-opcodes/opcodes-xpulpimg_CUSTOM
@@ -9,6 +9,35 @@
 
 # Xpulpimg extension
 
+# Post-increment and reg-reg loads
+p.lb_irpost  rd rs1 imm12             14..12=0 6..2=0x02 1..0=3
+p.lbu_irpost rd rs1 imm12             14..12=4 6..2=0x02 1..0=3
+p.lh_irpost  rd rs1 imm12             14..12=1 6..2=0x02 1..0=3
+p.lhu_irpost rd rs1 imm12             14..12=5 6..2=0x02 1..0=3
+p.lw_irpost  rd rs1 imm12             14..12=2 6..2=0x02 1..0=3
+p.lb_rrpost  rd rs1 rs2   31..25=0x00 14..12=7 6..2=0x02 1..0=3
+p.lbu_rrpost rd rs1 rs2   31..25=0x20 14..12=7 6..2=0x02 1..0=3
+p.lh_rrpost  rd rs1 rs2   31..25=0x08 14..12=7 6..2=0x02 1..0=3
+p.lhu_rrpost rd rs1 rs2   31..25=0x28 14..12=7 6..2=0x02 1..0=3
+p.lw_rrpost  rd rs1 rs2   31..25=0x10 14..12=7 6..2=0x02 1..0=3
+p.lb_rr      rd rs1 rs2   31..25=0x00 14..12=7 6..2=0x00 1..0=3
+p.lbu_rr     rd rs1 rs2   31..25=0x20 14..12=7 6..2=0x00 1..0=3
+p.lh_rr      rd rs1 rs2   31..25=0x08 14..12=7 6..2=0x00 1..0=3
+p.lhu_rr     rd rs1 rs2   31..25=0x28 14..12=7 6..2=0x00 1..0=3
+p.lw_rr      rd rs1 rs2   31..25=0x10 14..12=7 6..2=0x00 1..0=3
+
+# Post-increment and reg-reg stores
+p.sb_irpost  rs1 rs2 imm12hi imm12lo              14..12=0 6..2=0x0A 1..0=3
+p.sh_irpost  rs1 rs2 imm12hi imm12lo              14..12=1 6..2=0x0A 1..0=3
+p.sw_irpost  rs1 rs2 imm12hi imm12lo              14..12=2 6..2=0x0A 1..0=3
+p.sb_rrpost  rs1 rs2 prs3             31..25=0x00 14..12=4 6..2=0x0A 1..0=3
+p.sh_rrpost  rs1 rs2 prs3             31..25=0x00 14..12=5 6..2=0x0A 1..0=3
+p.sw_rrpost  rs1 rs2 prs3             31..25=0x00 14..12=6 6..2=0x0A 1..0=3
+p.sb_rr      rs1 rs2 prs3             31..25=0x00 14..12=4 6..2=0x08 1..0=3
+p.sh_rr      rs1 rs2 prs3             31..25=0x00 14..12=5 6..2=0x08 1..0=3
+p.sw_rr      rs1 rs2 prs3             31..25=0x00 14..12=6 6..2=0x08 1..0=3
+
+# Generic ALU operations
 p.abs    rd rs1      31..25=2  24..20=0 14..12=0 6..2=0x0C 1..0=3
 p.slet   rd rs1 rs2  31..25=2           14..12=2 6..2=0x0C 1..0=3
 p.sletu  rd rs1 rs2  31..25=2           14..12=3 6..2=0x0C 1..0=3
@@ -25,5 +54,156 @@ p.clipu  rd rs1 imm5 31..25=10          14..12=2 6..2=0x0C 1..0=3
 p.clipr  rd rs1 rs2  31..25=10          14..12=5 6..2=0x0C 1..0=3
 p.clipur rd rs1 rs2  31..25=10          14..12=6 6..2=0x0C 1..0=3
 
+# Immediate branching
 p.beqimm rs1 imm5 bimm12hi bimm12lo 14..12=2 6..2=0x18 1..0=3
 p.bneimm rs1 imm5 bimm12hi bimm12lo 14..12=3 6..2=0x18 1..0=3
+
+# MAC operations
+p.mac rd rs1 rs2 31..25=33 14..12=0 6..2=0x0C 1..0=3
+p.msu rd rs1 rs2 31..25=33 14..12=1 6..2=0x0C 1..0=3
+
+# SIMD arithmetical operations
+pv.add.h     rd rs1 rs2  31..27=0  26=0 25=0       14..12=0 6..2=0x15 1..0=3
+pv.add.sc.h  rd rs1 rs2  31..27=0  26=0 25=0       14..12=4 6..2=0x15 1..0=3
+pv.add.sci.h rd rs1 imm6 31..27=0  26=0            14..12=6 6..2=0x15 1..0=3
+pv.add.b     rd rs1 rs2  31..27=0  26=0 25=0       14..12=1 6..2=0x15 1..0=3
+pv.add.sc.b  rd rs1 rs2  31..27=0  26=0 25=0       14..12=5 6..2=0x15 1..0=3
+pv.add.sci.b rd rs1 imm6 31..27=0  26=0            14..12=7 6..2=0x15 1..0=3
+pv.sub.h     rd rs1 rs2  31..27=1  26=0 25=0       14..12=0 6..2=0x15 1..0=3
+pv.sub.sc.h  rd rs1 rs2  31..27=1  26=0 25=0       14..12=4 6..2=0x15 1..0=3
+pv.sub.sci.h rd rs1 imm6 31..27=1  26=0            14..12=6 6..2=0x15 1..0=3
+pv.sub.b     rd rs1 rs2  31..27=1  26=0 25=0       14..12=1 6..2=0x15 1..0=3
+pv.sub.sc.b  rd rs1 rs2  31..27=1  26=0 25=0       14..12=5 6..2=0x15 1..0=3
+pv.sub.sci.b rd rs1 imm6 31..27=1  26=0            14..12=7 6..2=0x15 1..0=3
+
+pv.avg.h      rd rs1 rs2  31..27=2 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.avg.sc.h   rd rs1 rs2  31..27=2 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.avg.sci.h  rd rs1 imm6 31..27=2 26=0      14..12=6 6..2=0x15 1..0=3
+pv.avg.b      rd rs1 rs2  31..27=2 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.avg.sc.b   rd rs1 rs2  31..27=2 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.avg.sci.b  rd rs1 imm6 31..27=2 26=0      14..12=7 6..2=0x15 1..0=3
+pv.avgu.h     rd rs1 rs2  31..27=3 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.avgu.sc.h  rd rs1 rs2  31..27=3 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.avgu.sci.h rd rs1 imm6 31..27=3 26=0      14..12=6 6..2=0x15 1..0=3
+pv.avgu.b     rd rs1 rs2  31..27=3 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.avgu.sc.b  rd rs1 rs2  31..27=3 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.avgu.sci.b rd rs1 imm6 31..27=3 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.min.h      rd rs1 rs2  31..27=4 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.min.sc.h   rd rs1 rs2  31..27=4 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.min.sci.h  rd rs1 imm6 31..27=4 26=0      14..12=6 6..2=0x15 1..0=3
+pv.min.b      rd rs1 rs2  31..27=4 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.min.sc.b   rd rs1 rs2  31..27=4 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.min.sci.b  rd rs1 imm6 31..27=4 26=0      14..12=7 6..2=0x15 1..0=3
+pv.minu.h     rd rs1 rs2  31..27=5 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.minu.sc.h  rd rs1 rs2  31..27=5 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.minu.sci.h rd rs1 imm6 31..27=5 26=0      14..12=6 6..2=0x15 1..0=3
+pv.minu.b     rd rs1 rs2  31..27=5 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.minu.sc.b  rd rs1 rs2  31..27=5 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.minu.sci.b rd rs1 imm6 31..27=5 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.max.h      rd rs1 rs2  31..27=6 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.max.sc.h   rd rs1 rs2  31..27=6 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.max.sci.h  rd rs1 imm6 31..27=6 26=0      14..12=6 6..2=0x15 1..0=3
+pv.max.b      rd rs1 rs2  31..27=6 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.max.sc.b   rd rs1 rs2  31..27=6 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.max.sci.b  rd rs1 imm6 31..27=6 26=0      14..12=7 6..2=0x15 1..0=3
+pv.maxu.h     rd rs1 rs2  31..27=7 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.maxu.sc.h  rd rs1 rs2  31..27=7 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.maxu.sci.h rd rs1 imm6 31..27=7 26=0      14..12=6 6..2=0x15 1..0=3
+pv.maxu.b     rd rs1 rs2  31..27=7 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.maxu.sc.b  rd rs1 rs2  31..27=7 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.maxu.sci.b rd rs1 imm6 31..27=7 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.srl.h     rd rs1 rs2  31..27=8  26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.srl.sc.h  rd rs1 rs2  31..27=8  26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.srl.sci.h rd rs1 imm6 31..27=8  26=0      14..12=6 6..2=0x15 1..0=3
+pv.srl.b     rd rs1 rs2  31..27=8  26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.srl.sc.b  rd rs1 rs2  31..27=8  26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.srl.sci.b rd rs1 imm6 31..27=8  26=0      14..12=7 6..2=0x15 1..0=3
+pv.sra.h     rd rs1 rs2  31..27=9  26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.sra.sc.h  rd rs1 rs2  31..27=9  26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.sra.sci.h rd rs1 imm6 31..27=9  26=0      14..12=6 6..2=0x15 1..0=3
+pv.sra.b     rd rs1 rs2  31..27=9  26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.sra.sc.b  rd rs1 rs2  31..27=9  26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.sra.sci.b rd rs1 imm6 31..27=9  26=0      14..12=7 6..2=0x15 1..0=3
+pv.sll.h     rd rs1 rs2  31..27=10 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.sll.sc.h  rd rs1 rs2  31..27=10 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.sll.sci.h rd rs1 imm6 31..27=10 26=0      14..12=6 6..2=0x15 1..0=3
+pv.sll.b     rd rs1 rs2  31..27=10 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.sll.sc.b  rd rs1 rs2  31..27=10 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.sll.sci.b rd rs1 imm6 31..27=10 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.or.h      rd rs1 rs2  31..27=11 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.or.sc.h   rd rs1 rs2  31..27=11 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.or.sci.h  rd rs1 imm6 31..27=11 26=0      14..12=6 6..2=0x15 1..0=3
+pv.or.b      rd rs1 rs2  31..27=11 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.or.sc.b   rd rs1 rs2  31..27=11 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.or.sci.b  rd rs1 imm6 31..27=11 26=0      14..12=7 6..2=0x15 1..0=3
+pv.xor.h     rd rs1 rs2  31..27=12 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.xor.sc.h  rd rs1 rs2  31..27=12 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.xor.sci.h rd rs1 imm6 31..27=12 26=0      14..12=6 6..2=0x15 1..0=3
+pv.xor.b     rd rs1 rs2  31..27=12 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.xor.sc.b  rd rs1 rs2  31..27=12 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.xor.sci.b rd rs1 imm6 31..27=12 26=0      14..12=7 6..2=0x15 1..0=3
+pv.and.h     rd rs1 rs2  31..27=13 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.and.sc.h  rd rs1 rs2  31..27=13 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.and.sci.h rd rs1 imm6 31..27=13 26=0      14..12=6 6..2=0x15 1..0=3
+pv.and.b     rd rs1 rs2  31..27=13 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.and.sc.b  rd rs1 rs2  31..27=13 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.and.sci.b rd rs1 imm6 31..27=13 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.abs.h      rd rs1 31..27=14 26=0 25=0 24..20=0 14..12=0 6..2=0x15 1..0=3
+pv.abs.b      rd rs1 31..27=14 26=0 25=0 24..20=0 14..12=1 6..2=0x15 1..0=3
+
+pv.extract.h  rd rs1 imm6 31..27=15 26=0 14..12=6 6..2=0x15 1..0=3
+pv.extract.b  rd rs1 imm6 31..27=15 26=0 14..12=7 6..2=0x15 1..0=3
+pv.extractu.h rd rs1 imm6 31..27=18 26=0 14..12=6 6..2=0x15 1..0=3
+pv.extractu.b rd rs1 imm6 31..27=18 26=0 14..12=7 6..2=0x15 1..0=3
+pv.insert.h   rd rs1 imm6 31..27=22 26=0 14..12=6 6..2=0x15 1..0=3
+pv.insert.b   rd rs1 imm6 31..27=22 26=0 14..12=7 6..2=0x15 1..0=3
+
+pv.dotup.h     rd rs1 rs2  31..27=16 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.dotup.sc.h  rd rs1 rs2  31..27=16 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.dotup.sci.h rd rs1 imm6 31..27=16 26=0      14..12=6 6..2=0x15 1..0=3
+pv.dotup.b     rd rs1 rs2  31..27=16 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.dotup.sc.b  rd rs1 rs2  31..27=16 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.dotup.sci.b rd rs1 imm6 31..27=16 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.dotusp.h     rd rs1 rs2  31..27=17 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.dotusp.sc.h  rd rs1 rs2  31..27=17 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.dotusp.sci.h rd rs1 imm6 31..27=17 26=0      14..12=6 6..2=0x15 1..0=3
+pv.dotusp.b     rd rs1 rs2  31..27=17 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.dotusp.sc.b  rd rs1 rs2  31..27=17 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.dotusp.sci.b rd rs1 imm6 31..27=17 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.dotsp.h     rd rs1 rs2  31..27=19 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.dotsp.sc.h  rd rs1 rs2  31..27=19 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.dotsp.sci.h rd rs1 imm6 31..27=19 26=0      14..12=6 6..2=0x15 1..0=3
+pv.dotsp.b     rd rs1 rs2  31..27=19 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.dotsp.sc.b  rd rs1 rs2  31..27=19 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.dotsp.sci.b rd rs1 imm6 31..27=19 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.sdotup.h     rd rs1 rs2  31..27=20 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.sdotup.sc.h  rd rs1 rs2  31..27=20 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.sdotup.sci.h rd rs1 imm6 31..27=20 26=0      14..12=6 6..2=0x15 1..0=3
+pv.sdotup.b     rd rs1 rs2  31..27=20 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.sdotup.sc.b  rd rs1 rs2  31..27=20 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.sdotup.sci.b rd rs1 imm6 31..27=20 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.sdotusp.h     rd rs1 rs2  31..27=21 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.sdotusp.sc.h  rd rs1 rs2  31..27=21 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.sdotusp.sci.h rd rs1 imm6 31..27=21 26=0      14..12=6 6..2=0x15 1..0=3
+pv.sdotusp.b     rd rs1 rs2  31..27=21 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.sdotusp.sc.b  rd rs1 rs2  31..27=21 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.sdotusp.sci.b rd rs1 imm6 31..27=21 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.sdotsp.h     rd rs1 rs2  31..27=23 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.sdotsp.sc.h  rd rs1 rs2  31..27=23 26=0 25=0 14..12=4 6..2=0x15 1..0=3
+pv.sdotsp.sci.h rd rs1 imm6 31..27=23 26=0      14..12=6 6..2=0x15 1..0=3
+pv.sdotsp.b     rd rs1 rs2  31..27=23 26=0 25=0 14..12=1 6..2=0x15 1..0=3
+pv.sdotsp.sc.b  rd rs1 rs2  31..27=23 26=0 25=0 14..12=5 6..2=0x15 1..0=3
+pv.sdotsp.sci.b rd rs1 imm6 31..27=23 26=0      14..12=7 6..2=0x15 1..0=3
+
+pv.shuffle2.h rd rs1 rs2 31..27=25 26=0 25=0 14..12=0 6..2=0x15 1..0=3
+pv.shuffle2.b rd rs1 rs2 31..27=25 26=0 25=0 14..12=1 6..2=0x15 1..0=3
diff --git a/toolchain/riscv-opcodes/parse_opcodes b/toolchain/riscv-opcodes/parse_opcodes
index a33c7a43c..f7b0a837e 100755
--- a/toolchain/riscv-opcodes/parse_opcodes
+++ b/toolchain/riscv-opcodes/parse_opcodes
@@ -38,6 +38,8 @@ arglut['shamtw'] = (24,20)
 
 # for xpulpimg
 arglut['imm5'] = (24,20)
+arglut['prs3'] = (11,7)
+arglut['imm6'] = (25,20)
 
 # for vectors
 arglut['vd'] = (11,7)