diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9c9660fd..c78894eab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Use custom compiler for VCS specified with `CC` and `CCX` environment variable
 - Implement operand gating for SIMD and MAC Units in Snitch IPU's DSP Unit
 - Add Channel Estimation application and kernels
+- Add Gauss-Jordan matrix inversion kernel
 
 ### Fixed
 - Fix type issue in `snitch_addr_demux`
diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h
new file mode 100644
index 000000000..a37d5f38c
--- /dev/null
+++ b/software/apps/mat_inv/initialization.h
@@ -0,0 +1,95 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#define FIXED_POINT 16
+#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
+#define FIX_MUL(a, b) ((int32_t)((a * b) >> FIXED_POINT))
+#define MIN(a, b) (a < b ? a : b)
+
+dump(l, 1);
+dump(loopCnt, 2);
+dump(i, 3);
+
+void display(int32_t *A, int32_t n, int32_t m);
+
+#ifdef FOLDED
+void display_folded(int32_t *A, int32_t n, int32_t m);
+#endif
+
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
+
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+                int32_t n, int32_t m, int32_t o);
+
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id);
+
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                       uint32_t core_id);
+
+void display(int32_t *A, int32_t n, int32_t m) {
+  int32_t i;
+  for (i = 0; i < n * m; i++) {
+    printf("Output[%d] = %8d\n", i, A[i]);
+  }
+}
+
+#ifdef FOLDED
+void display_folded(int32_t *A, int32_t n, int32_t m) {
+  int32_t i, j, k, shift;
+  for (i = 0; i < n * m; i++) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    printf("Output[%d] = %8d\n", i, A[shift + j]);
+  }
+}
+#endif
+
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) {
+  int32_t i, j;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      t_matrix[j * n + i] = matrix[i * m + j];
+    }
+  }
+}
+
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+                int32_t n, int32_t m, int32_t o) {
+  int32_t i, j, k;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < o; j++) {
+      matrix_product[i * o + j] = 0;
+      for (k = 0; k < m; k++) {
+        matrix_product[i * o + j] +=
+            FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
+      }
+    }
+  }
+}
+
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
+  if (core_id == 0) {
+    for (uint32_t j = 0; j < num_rows; j++) {
+      for (uint32_t i = 0; i < num_columns; i++) {
+        matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  }
+}
+
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                       uint32_t core_id) {
+  if (core_id == 0) {
+    for (uint32_t i = 0; i < num_columns; i++) {
+      for (uint32_t j = 0; j < num_rows; j++) {
+        matrix[j * num_columns + i] = 0;
+      }
+    }
+  }
+}
diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
new file mode 100644
index 000000000..7ada71ebb
--- /dev/null
+++ b/software/apps/mat_inv/main.c
@@ -0,0 +1,105 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#define N 16
+#define M 16
+#define N_BANKS (1024)
+#define N_USED_BANKS (16)
+
+#define VERBOSE
+#define SINGLE
+// #define PARALLEL
+// #define FOLDED
+
+#include "initialization.h"
+#include "kernel/mempool_mat_inv_q32p.h"
+#include "kernel/mempool_mat_inv_q32s.h"
+
+#ifdef FOLDED
+int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)]
+    __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)]
+    __attribute__((aligned(N_BANKS), section(".l1")));
+uint32_t flag __attribute__((section(".l1")));
+#else
+int32_t matrix[N * M] __attribute__((aligned(N), section(".l1")));
+int32_t inv[M * M] __attribute__((aligned(N), section(".l1")));
+uint32_t flag __attribute__((section(".l1")));
+#endif
+
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+/* initialize the data */
+#if defined(SINGLE) || defined(PARALLEL)
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(inv, M, M, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(FOLDED)
+  uint32_t nPE = N_USED_BANKS >> 2U;
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
+
+#endif
+
+/* Execute the kernel */
+#if defined(SINGLE)
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32s(matrix, inv, M);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(PARALLEL)
+  if (core_id < MIN(NUM_CORES, N / 4)) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32p(matrix, inv, M, &flag);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(FOLDED)
+  mempool_start_benchmark();
+  fold_matrix(matrix, folded_matrix, N);
+  mempool_stop_benchmark();
+  if (core_id < nPE) {
+    mempool_start_benchmark();
+    mempool_GJinv_folded_q32p(folded_matrix, inv, M, &flag, nPE);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#endif
+
+/* Display the result of computation */
+#ifdef VERBOSE
+  if (core_id == 0)
+    display(inv, M, N);
+  mempool_barrier(num_cores);
+#endif
+
+  return 0;
+}
diff --git a/software/runtime/kernel/mempool_mat_inv_q32p.h b/software/runtime/kernel/mempool_mat_inv_q32p.h
new file mode 100644
index 000000000..a937ae33e
--- /dev/null
+++ b/software/runtime/kernel/mempool_mat_inv_q32p.h
@@ -0,0 +1,552 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* GAUSS JORDAN INVERSION */
+
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
+
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t i, j, k, shift;
+  for (i = core_id * 4; i < n * n; i += num_cores * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDst[shift + j] = pSrc[i];
+    pDst[shift + j + 1] = pSrc[i + 1];
+    pDst[shift + j + 2] = pSrc[i + 2];
+    pDst[shift + j + 3] = pSrc[i + 3];
+  }
+  mempool_log_barrier(2, core_id);
+}
+
+int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                       uint32_t *flag) {
+
+  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t i, j, loopCnt, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matirces must be square. */
+
+  /* CREATE THE IDENTITY MATRIX */
+
+  pDstT1 = pDst;
+  for (k = core_id * 4; k < m; k += 4 * num_cores) {
+    for (j = 0; j < m; j++) {
+      pDstT1[k * m + j] = (int32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j);
+    }
+  }
+  mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+  /* Loop over the number of columns of the input matrix. */
+  loopCnt = n;
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+
+  while (loopCnt > 0U) {
+
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    in = *pSrcT1;
+
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          pSrcT2 = pSrc + (n * k);
+          pDstT2 = pDst + (n * k);
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over colums to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
+            }
+            /* Loop over colums */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
+        }
+      }
+      /* Update the status if the matrix is singular */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+
+    ///* Loop over columns to the right of pivot */
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+      // j += num_cores * 4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    /* Loop over columns */
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    /* Loop over rows */
+    for (k = core_id * 4; k < m; k += num_cores * 4) {
+      i = 0U;
+      while (i < 4) {
+        if ((i + k) != l) {
+          pSrcT1 = pSrc + (i + k) * n;
+          pSrcT2 = pDst + (i + k) * n;
+          /* Element of the reference row */
+          in = *pSrcT1;
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+          /* Loop over columns to the right of pivot */
+          j = 0;
+          while (j < 4 * ((n - l) >> 2U)) {
+            in1 = pSrcT1[j];
+            in2 = pSrcT1[j + 1];
+            in3 = pSrcT1[j + 2];
+            in4 = pSrcT1[j + 3];
+            out1 = pPRT_in[j];
+            out2 = pPRT_in[j + 1];
+            out3 = pPRT_in[j + 2];
+            out4 = pPRT_in[j + 3];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+          /* Loop over columns */
+          j = 0;
+          while (j < 4 * (n >> 2U)) {
+            in1 = pSrcT2[j];
+            in2 = pSrcT2[j + 1];
+            in3 = pSrcT2[j + 2];
+            in4 = pSrcT2[j + 3];
+            out1 = pPRT_pDst[j];
+            out2 = pPRT_pDst[j + 1];
+            out3 = pPRT_pDst[j + 2];
+            out4 = pPRT_pDst[j + 3];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+        i++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    pSrc++;    /* Increment the input pointer */
+    loopCnt--; /* Decrement the loop counter */
+    l++;       /* Increment the index modifier */
+  }
+
+  //    if ((flag != 1U) && (x == 0)) {
+  //        for (i = 0; i < m * n; i++) {
+  //            if (pSrc[i] != 0)
+  //                break;
+  //        }
+  //        if (i == m * n)
+  //            return 1;
+  //    }
+  return 0;
+}
+
+/* The input matrix is folded in memory, to have ony local accesses */
+int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                              uint32_t *flag, uint32_t nPE) {
+
+  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  uint32_t shift = 0;
+  /* loop counters */
+  uint32_t i, j, k, l;
+  /* M is the number of rows. However, the matrices must be square. */
+  uint32_t m = n;
+
+  /* CREATE THE IDENTITY MATRIX */
+  pDstT1 = pDst;
+  for (i = core_id * 4; i < n * m; i += nPE * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDstT1[shift + j] = (int32_t)(k == j);
+    pDstT1[shift + j + 1] = (int32_t)(k == (j + 1));
+    pDstT1[shift + j + 2] = (int32_t)(k == (j + 2));
+    pDstT1[shift + j + 3] = (int32_t)(k == (j + 3));
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
+
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pSrcT1 = pSrc + shift;
+    pDstT1 = pDst + shift;
+    in = *pSrcT1;
+
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    // This is done by a single core
+    if (absolute_core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+          pSrcT2 = pSrc + shift;
+          pDstT2 = pDst + shift;
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over colums to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
+            }
+            /* Loop over colums */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
+        }
+      }
+      /* Update the status if the matrix is singular */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pPivotRowIn = pSrc + shift;
+    pPivotRowDst = pDst + shift;
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+
+    /* Loop over columns to the right of pivot */
+    core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+    }
+    if (core_id == 0) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+
+    /* Loop over columns */
+    core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
+      /* Only the columns to the right of the pivot are to be processed */
+      if (k != l) {
+        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+        pSrcT1 = pSrc + shift;
+        pSrcT2 = pDst + shift;
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
+        /* Loop over the columns */
+        core_id = absolute_core_id % (n >> 2U);
+        core_id = core_id - (l >> 2U);
+        j = core_id * 4;
+        while (j < 4 * ((n - l) >> 2U)) {
+          out1 = pPRT_in[j];
+          out2 = pPRT_in[j + 1];
+          out3 = pPRT_in[j + 2];
+          out4 = pPRT_in[j + 3];
+          in1 = pSrcT1[j];
+          in2 = pSrcT1[j + 1];
+          in3 = pSrcT1[j + 2];
+          in4 = pSrcT1[j + 3];
+          pSrcT1[j] = in1 - FIX_MUL(in, out1);
+          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
+        }
+        if (core_id == 0) {
+          j = 4 * ((n - l) >> 2U);
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+        core_id = absolute_core_id % (n >> 2U);
+        /* Loop over the columns */
+        j = core_id * 4;
+        while (j < 4 * (n >> 2U)) {
+          out1 = pPRT_pDst[j];
+          out2 = pPRT_pDst[j + 1];
+          out3 = pPRT_pDst[j + 2];
+          out4 = pPRT_pDst[j + 3];
+          in1 = pSrcT2[j];
+          in2 = pSrcT2[j + 1];
+          in3 = pSrcT2[j + 2];
+          in4 = pSrcT2[j + 3];
+          pSrcT2[j] = in1 - FIX_MUL(in, out1);
+          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
+        }
+        if (core_id == (n >> 2U) - 1) {
+          j = 4 * (n >> 2U);
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  return 0;
+}
diff --git a/software/runtime/kernel/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h
new file mode 100644
index 000000000..ce84de24e
--- /dev/null
+++ b/software/runtime/kernel/mempool_mat_inv_q32s.h
@@ -0,0 +1,275 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* GAUSS JORDAN INVERSION */
+
+int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n);
+
+/* GAUSS JORDAN ALGORITHM
+  - Form the augmented matrix by the identity matrix
+  - LOOP OVER ROWS ...
+  - Check if the element on the diagonal of the input matrix is zero
+    > The element is zero, check if there is a nonzero element in one of the
+  rows below on the same column > Exchange the row with the row containing a
+  nonzero element on the same column > If there is no such element then the
+  matrix is singular and the algorithm fails
+
+  - Divide the current row by the element on the diagonal
+  - Replace all the rows below with the sum of that row and a multiple of the
+  current row (row i), so that each new element in column i, below row i is
+  zero.
+*/
+
+int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
+
+  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
+
+  uint32_t m =
+      n; /* M is the number of rows. However, the matrices must be square. */
+  uint32_t i, j, k, l; /* loop counters */
+  uint32_t flag = 0U;  /* Flag to check if the matrix is singular */
+
+  pDstT1 = pDst; /* Working pointer for destination matrix */
+  /* CREATE THE IDENTITY MATRIX */
+  for (k = 0; k < m; k += 4) {
+    for (j = 0; j < n; j++) {
+      pDstT1[k * m + j] = (int32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j);
+    }
+  }
+
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
+
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    k = 1U;
+    in = *pSrcT1;
+
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+
+    if (in == 0) {
+      /* Loop over the rows present below */
+      for (i = (l + 1U); i < m; i++) {
+        pSrcT2 = pSrc + (n * i);
+        pDstT2 = pDstT1 + (n * k);
+
+        /* EXCHANGE */
+
+        if (*pSrcT2 != 0) {
+          /* Loop over colums to the right of the pivot */
+          j = 0;
+          while (j < (n - l) - (n - l) % 4) {
+            Xchg1 = *(pSrcT2);
+            Xchg2 = *(pSrcT2 + 1);
+            Xchg3 = *(pSrcT2 + 2);
+            Xchg4 = *(pSrcT2 + 3);
+            out1 = *(pSrcT1);
+            out2 = *(pSrcT1 + 1);
+            out3 = *(pSrcT1 + 2);
+            out4 = *(pSrcT1 + 3);
+            *pSrcT2++ = out1;
+            *pSrcT2++ = out2;
+            *pSrcT2++ = out3;
+            *pSrcT2++ = out4;
+            *pSrcT1++ = Xchg1;
+            *pSrcT1++ = Xchg2;
+            *pSrcT1++ = Xchg3;
+            *pSrcT1++ = Xchg4;
+            j += 4;
+          }
+          while (j < n - l) {
+            Xchg1 = *pSrcT2;
+            *pSrcT2++ = *pSrcT1;
+            *pSrcT1++ = Xchg1;
+            j++;
+          }
+          /* Loop over colums */
+          j = 0;
+          while (j < n - n % 4) {
+            Xchg1 = *(pDstT2);
+            Xchg2 = *(pDstT2 + 1);
+            Xchg3 = *(pDstT2 + 2);
+            Xchg4 = *(pDstT2 + 3);
+            out1 = *(pDstT1);
+            out2 = *(pDstT1 + 1);
+            out3 = *(pDstT1 + 2);
+            out4 = *(pDstT1 + 3);
+            *pDstT2++ = out1;
+            *pDstT2++ = out2;
+            *pDstT2++ = out3;
+            *pDstT2++ = out4;
+            *pDstT1++ = Xchg1;
+            *pDstT1++ = Xchg2;
+            *pDstT1++ = Xchg3;
+            *pDstT1++ = Xchg4;
+            j += 4;
+          }
+          while (j < n) {
+            Xchg1 = *pDstT2;
+            *pDstT2++ = *pDstT1;
+            *pDstT1++ = Xchg1;
+            j++;
+          }
+          flag = 1U;
+          break;
+        }
+        k++;
+      }
+    }
+    /* Return when the matrix is singular */
+    if ((flag == 0U) && (in == 0)) {
+      return 1;
+    }
+
+    /* DIVIDE BY THE PIVOT */
+
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+
+    /* Loop over columns to the right of the pilot element */
+    j = 0;
+    while (j < 4 * ((n - l) >> 2U)) {
+      in1 = *pSrcT1;
+      in2 = *(pSrcT1 + 1);
+      in3 = *(pSrcT1 + 2);
+      in4 = *(pSrcT1 + 3);
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      *pSrcT1++ = out1;
+      *pSrcT1++ = out2;
+      *pSrcT1++ = out3;
+      *pSrcT1++ = out4;
+      j += 4;
+    }
+    while (j < n - l) {
+      in1 = *pSrcT1;
+      *pSrcT1++ = FIX_DIV(in1, in);
+      j++;
+    }
+
+    /* Loop over columns of the destination matrix */
+    j = 0;
+    while (j < 4 * (n >> 2U)) {
+      in1 = *pSrcT2;
+      in2 = *(pSrcT2 + 1);
+      in3 = *(pSrcT2 + 2);
+      in4 = *(pSrcT2 + 3);
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      *pSrcT2++ = out1;
+      *pSrcT2++ = out2;
+      *pSrcT2++ = out3;
+      *pSrcT2++ = out4;
+      j += 4;
+    }
+    while (j < n) {
+      in1 = *pSrcT2;
+      *pSrcT2++ = FIX_DIV(in1, in);
+      j++;
+    }
+
+    /* REPLACE ROWS */
+
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    i = 0U; /* pivot index */
+    k = m;  /* row index */
+    while (k > 0U) {
+      /* Only the columns to the right of the pivot are to be processed */
+      if (i == l) {
+        pSrcT1 += n - l;
+        pSrcT2 += n;
+      } else {
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
+        j = 0;
+        while (j < 4 * ((n - l) >> 2U)) {
+          in1 = *pSrcT1;
+          in2 = *(pSrcT1 + 1);
+          in3 = *(pSrcT1 + 2);
+          in4 = *(pSrcT1 + 3);
+          out1 = *pPRT_in++;
+          out2 = *pPRT_in++;
+          out3 = *pPRT_in++;
+          out4 = *pPRT_in++;
+          *pSrcT1++ = in1 - FIX_MUL(in, out1);
+          *pSrcT1++ = in2 - FIX_MUL(in, out2);
+          *pSrcT1++ = in3 - FIX_MUL(in, out3);
+          *pSrcT1++ = in4 - FIX_MUL(in, out4);
+          j += 4;
+        }
+        while (j < n - l) {
+          in1 = *pSrcT1;
+          out1 = *pPRT_in++;
+          *pSrcT1++ = in1 - FIX_MUL(in, out1);
+          j++;
+        }
+
+        /* Loop over the columns to
+           replace the elements in the destination matrix */
+        j = 0;
+        while (j < 4 * (n >> 2U)) {
+          in1 = *pSrcT2;
+          in2 = *(pSrcT2 + 1);
+          in3 = *(pSrcT2 + 2);
+          in4 = *(pSrcT2 + 3);
+          out1 = *pPRT_pDst++;
+          out2 = *pPRT_pDst++;
+          out3 = *pPRT_pDst++;
+          out4 = *pPRT_pDst++;
+          *pSrcT2++ = in1 - FIX_MUL(in, out1);
+          *pSrcT2++ = in2 - FIX_MUL(in, out2);
+          *pSrcT2++ = in3 - FIX_MUL(in, out3);
+          *pSrcT2++ = in4 - FIX_MUL(in, out4);
+          j += 4;
+        }
+        while (j < n) {
+          in1 = *pSrcT2;
+          out1 = *pPRT_pDst;
+          *pSrcT2++ = in1 - FIX_MUL(in, out1);
+          j++;
+        }
+      }
+      /* Increment temporary input pointer */
+      pSrcT1 = pSrcT1 + l;
+      /* Decrement loop counter */
+      k--;
+      /* Increment pivot index */
+      i++;
+    }
+
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+
+  return 0;
+}
diff --git a/software/runtime/serial.c b/software/runtime/serial.c
index a53ec2e1f..44aa30fe2 100644
--- a/software/runtime/serial.c
+++ b/software/runtime/serial.c
@@ -4,7 +4,7 @@
 
 #include <stdint.h>
 
-extern char fake_uart;
+extern volatile char fake_uart;
 
 void _putchar(char character) {
   // send char to console