From 03a8625e36329e539ef807143d130f06f3195eca Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 13 Jul 2022 18:39:21 +0200
Subject: [PATCH 01/22] [software] Add Moore Penrose inversion kernel

---
 .../apps/MP_matrix_inverse/initialization.h   |  53 +++
 software/apps/MP_matrix_inverse/inverse.h     | 390 ++++++++++++++++++
 software/apps/MP_matrix_inverse/main.c        |  82 ++++
 3 files changed, 525 insertions(+)
 create mode 100644 software/apps/MP_matrix_inverse/initialization.h
 create mode 100644 software/apps/MP_matrix_inverse/inverse.h
 create mode 100644 software/apps/MP_matrix_inverse/main.c

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
new file mode 100644
index 000000000..e7e834de9
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -0,0 +1,53 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+// Set entry (i, j) to a * i + b * j + c; each core writes a subset of entries.
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores);
+// Set matrix entries to zero; only the core with core_id == 0 writes.
+void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                         uint32_t core_id, uint32_t num_cores);
+
+// Set matrix[i][j] = a * i + b * j + c, sharing the work among num_cores.
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const split = 8; // number of chunks along the shorter dimension
+  int const wide = num_columns > num_rows;
+  uint32_t const short_dim = wide ? num_rows : num_columns;
+  uint32_t const long_dim = wide ? num_columns : num_rows;
+  uint32_t const chunk = core_id % split;
+  uint32_t lo = (short_dim / split) * chunk;
+  // The last chunk also takes the remainder when split does not divide evenly
+  uint32_t hi = (chunk == split - 1) ? short_dim : lo + short_dim / split;
+  uint32_t first = core_id / split, step = num_cores / split;
+  if (step == 0) { // fewer cores than chunks: whole lines, strided by cores
+    lo = 0;
+    hi = short_dim;
+    first = core_id;
+    step = (num_cores > 0) ? num_cores : 1; // never loop with a zero step
+  }
+  for (uint32_t l = first; l < long_dim; l += step) {
+    for (uint32_t s = lo; s < hi; ++s) {
+      uint32_t const i = wide ? s : l, j = wide ? l : s;
+      matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+    }
+  }
+}
+
+// Zero a row-major num_rows x num_columns matrix; only core 0 does the work.
+void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                         uint32_t core_id, uint32_t num_cores) {
+  (void)num_cores; // unused
+  if (core_id != 0) {
+    return;
+  }
+  for (uint32_t i = 0; i < num_rows; i++) {
+    for (uint32_t j = 0; j < num_columns; j++) {
+      matrix[i * num_columns + j] = 0; // row stride is num_columns, not num_rows
+    }
+  }
+}
diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h
new file mode 100644
index 000000000..19dfc9b1e
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/inverse.h
@@ -0,0 +1,390 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+// Write the transpose of the n x n matrix into t_matrix.
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n);
+// matrix_product = matrix_1 * matrix_2 for n x n row-major matrices.
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n);
+// Copy A without row p and column q into temp (both use row stride N).
+void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n);
+// Recursive cofactor-expansion determinant of the leading n x n part of A.
+int32_t determinant(int32_t *A, int32_t n);
+// Transposed cofactor matrix of A (1 for n == 1).
+void adjoint(int32_t *A,int32_t *adj, int32_t n);
+// adj(A) / det(A) with integer division; returns 0 if det(A) == 0, else 1.
+int32_t inverse(int32_t *A, int32_t *inverse, int32_t n);
+// Row-reduction based inverse on int32_t data; returns 1 if singular, else 0.
+int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n);
+
+// Store the transpose of the n x n row-major `matrix` in `t_matrix`:
+// t_matrix[c][r] = matrix[r][c].
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) {
+    const int32_t *src = matrix; // read sequentially, one source row at a time
+    for (int32_t row = 0; row < n; ++row)
+        for (int32_t col = 0; col < n; ++col)
+            t_matrix[col * n + row] = *src++;
+}
+// matrix_product = matrix_1 * matrix_2 for n x n row-major matrices.
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) {
+    for (int32_t row = 0; row < n; ++row) {
+        for (int32_t col = 0; col < n; ++col) {
+            int32_t *cell = &matrix_product[row * n + col];
+            *cell = 0; // cleared first, then accumulated in place
+            for (int32_t idx = 0; idx < n; ++idx) {
+                *cell += matrix_1[row * n + idx] * matrix_2[idx * n + col];
+            }
+        }
+    }
+}
+
+// Copy the entries of A outside row p and column q into temp, n - 1 per row.
+// Both A and temp are indexed with row stride N.
+void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
+    int32_t out_row = 0, out_col = 0;
+    for (int32_t row = 0; row < n; row++) {
+      if (row == p)
+        continue; // deleted row
+      for (int32_t col = 0; col < n; col++) {
+        if (col == q)
+          continue; // deleted column
+        temp[out_row * N + out_col] = A[row * N + col];
+        out_col++;
+        // Start the next output row once n - 1 entries have been written
+        if (out_col == n - 1) {
+          out_col = 0;
+          out_row++;
+        }
+      }
+    }
+}
+ 
+// Determinant of the leading n x n part of A (row stride N), computed by
+// recursive cofactor expansion along the first row. Returns 0 when n < 1.
+int32_t determinant(int32_t *A, int32_t n) {
+    // Base case: a single element is its own determinant
+    if (n == 1) {
+        return A[0];
+    }
+
+    // Scratch storage for the minors, cleared before use
+    int32_t temp[N * N];
+    for (int32_t idx = 0; idx < N * N; idx++) {
+        temp[idx] = 0;
+    }
+
+    // Expand along the first row
+    int32_t result = 0;
+    for (int32_t col = 0; col < n; col++) {
+        // Minor obtained by deleting row 0 and column col
+        getCofactor(A, temp, 0, col, n);
+        // Terms alternate in sign, starting positive at column 0
+        int32_t const sign = (col % 2 == 0) ? 1 : -1;
+        result += sign * A[0 * N + col] * determinant(temp, n - 1);
+    }
+
+    return result;
+}
+ 
+// Compute the adjugate (transposed cofactor matrix) of the n x n matrix A.
+// A is read with row stride N (as in getCofactor/determinant); adj is written
+// with n entries per row, which is how inverse() allocates and reads it.
+void adjoint(int32_t *A,int32_t *adj, int32_t n) {
+    if (n == 1) {
+        adj[0] = 1;
+        return;
+    }
+
+    int32_t temp[N * N]; // scratch storage for the minors
+    // Bound the loops by n (not N) so adj is never overrun when n < N
+    for (int32_t i = 0; i < n; i++) {
+        for (int32_t j = 0; j < n; j++) {
+            // Entries of A outside row i and column j
+            getCofactor(A, temp, i, j, n);
+            // The cofactor sign is positive when i + j is even
+            int32_t const sign = ((i + j) % 2 == 0) ? 1 : -1;
+            // Transposed store: cofactor (i, j) goes to row j, column i
+            adj[j * n + i] = sign * determinant(temp, n - 1);
+        }
+    }
+}
+ 
+// Compute the inverse of A as adj(A) / det(A) and store it in `inverse`.
+// Returns 1 on success and 0 (after printing a message) when det(A) == 0.
+// The division is an integer division, so each entry is truncated.
+int32_t inverse(int32_t *A, int32_t *inverse, int32_t n) {
+    int32_t const det = determinant(A, n);
+    if (det == 0) {
+        printf("Singular matrix, can't find its inverse\n");
+        return 0;
+    }
+
+    // Adjugate of A, read below with n entries per row
+    int32_t adj[n * n];
+    adjoint(A, adj, n);
+
+    // Element-wise division over the flattened n * n entries
+    for (int32_t idx = 0; idx < n * n; idx++) {
+        inverse[idx] = adj[idx] / det;
+    }
+    return 1;
+}
+// Gauss-Jordan style inversion of the n x n row-major matrix pSrc into pDst,
+// using int32_t arithmetic. Returns 0 on success and 1 if pSrc is singular.
+int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n) {
+    /* pSrc is overwritten during elimination; the divisions are integer ones. */
+    int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
+    int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
+    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
+    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+    int32_t Xchg, in = 0, in1;                      /* Temporary input values  */
+    uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
+
+    uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */
+    int32_t *const pSrcBase = pSrc; /* start of the input; pSrc itself advances per column */
+    /* Working pointer for destination matrix */
+    pDstT1 = pDst;
+
+    /* Loop over the number of rows */
+    rowCnt = m;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U) {
+        /* Writing all zeroes in lower triangle of the destination matrix */
+        j = m - rowCnt;
+        while (j > 0U) {
+            *pDstT1++ = 0;
+            j--;
+        }
+
+        /* Writing all ones in the diagonal of the destination matrix */
+        *pDstT1++ = 1;
+
+        /* Writing all zeroes in upper triangle of the destination matrix */
+        j = rowCnt - 1U;
+        while (j > 0U) {
+            *pDstT1++ = 0;
+            j--;
+        }
+
+        /* Decrement loop counter */
+        rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+    loopCnt = n;
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U) {
+        /* Check if the pivot element is zero..
+         * If it is zero then interchange the row with non zero row below.
+         * If there is no non zero element to replace in the rows below,
+         * then the matrix is Singular. */
+
+        /* Working pointer for the input matrix that points
+         * to the pivot element of the particular row  */
+        pSrcT1 = pSrc + (l * n);
+
+        /* Working pointer for the destination matrix that points
+         * to the pivot element of the particular row  */
+        pDstT1 = pDst + (l * n);
+
+        /* Temporary variable to hold the pivot value */
+        in = *pSrcT1;
+
+        /* Destination pointer modifier */
+        k = 1U;
+        flag = 0U; /* no row exchange has happened for this column yet */
+        /* Check if the pivot element is zero */
+        if (*pSrcT1 == 0) {
+            /* Loop over the number rows present below */
+
+            for (i = (l + 1U); i < m; i++) {
+                /* Update the input and destination pointers */
+                pSrcT2 = pSrcT1 + (n * k);
+                pDstT2 = pDstT1 + (n * k);
+
+                /* Check if there is a non zero pivot element to
+                 * replace in the rows below */
+                if (*pSrcT2 != 0) {
+                    /* Loop over number of columns
+                     * to the right of the pivot element */
+                    j = n - l;
+
+                    while (j > 0U) {
+                        /* Exchange the row elements of the input matrix */
+                        Xchg = *pSrcT2;
+                        *pSrcT2++ = *pSrcT1;
+                        *pSrcT1++ = Xchg;
+
+                        /* Decrement the loop counter */
+                        j--;
+                    }
+
+                    /* Loop over number of columns of the destination matrix */
+                    j = n;
+
+                    while (j > 0U) {
+                        /* Exchange the row elements of the destination matrix */
+                        Xchg = *pDstT2;
+                        *pDstT2++ = *pDstT1;
+                        *pDstT1++ = Xchg;
+
+                        /* Decrement loop counter */
+                        j--;
+                    }
+
+                    /* Flag to indicate whether exchange is done or not */
+                    flag = 1U;
+
+                    /* Break after exchange is done */
+                    break;
+                }
+
+                /* Update the destination pointer modifier */
+                k++;
+
+                /* Decrement loop counter */
+            }
+        }
+
+        /* Update the status if the matrix is singular */
+        if ((flag != 1U) && (in == 0)) {
+            return 1;
+        }
+
+        /* Points to the pivot row of input and destination matrices */
+        pPivotRowIn = pSrc + (l * n);
+        pPivotRowDst = pDst + (l * n);
+
+        /* Temporary pointers to the pivot row pointers */
+        pSrcT1 = pPivotRowIn;
+        pSrcT2 = pPivotRowDst;
+
+        /* Pivot element of the row */
+        in = *pPivotRowIn;
+
+        /* Loop over number of columns
+         * to the right of the pivot element */
+        j = (n - l);
+
+        while (j > 0U) {
+            /* Divide each element of the row of the input matrix
+             * by the pivot element */
+            in1 = *pSrcT1;
+            *pSrcT1++ = in1 / in;
+
+            /* Decrement the loop counter */
+            j--;
+        }
+
+        /* Loop over number of columns of the destination matrix */
+        j = n;
+
+        while (j > 0U) {
+            /* Divide each element of the row of the destination matrix
+             * by the pivot element */
+            in1 = *pSrcT2;
+            *pSrcT2++ = in1 / in;
+
+            /* Decrement the loop counter */
+            j--;
+        }
+
+        /* Replace the rows with the sum of that row and a multiple of row i
+         * so that each new element in column i above row i is zero.*/
+
+        /* Temporary pointers for input and destination matrices */
+        pSrcT1 = pSrc;
+        pSrcT2 = pDst;
+
+        /* index used to check for pivot element */
+        i = 0U;
+
+        /* Loop over number of rows */
+        /*  to be replaced by the sum of that row and a multiple of row i */
+        k = m;
+
+        while (k > 0U) {
+            /* Check for the pivot element */
+            if (i == l) {
+                /* If the processing element is the pivot element,
+                   only the columns to the right are to be processed */
+                pSrcT1 += n - l;
+
+                pSrcT2 += n;
+            } else {
+                /* Element of the reference row */
+                in = *pSrcT1;
+
+                /* Working pointers for input and destination pivot rows */
+                pPRT_in = pPivotRowIn;
+                pPRT_pDst = pPivotRowDst;
+
+                /* Loop over the number of columns to the right of the pivot element,
+                   to replace the elements in the input matrix */
+                j = (n - l);
+
+                while (j > 0U) {
+                    /* Replace the element by the sum of that row
+                       and a multiple of the reference row  */
+                    in1 = *pSrcT1;
+                    *pSrcT1++ = in1 - (in * *pPRT_in++);
+
+                    /* Decrement the loop counter */
+                    j--;
+                }
+
+                /* Loop over the number of columns to
+                   replace the elements in the destination matrix */
+                j = n;
+
+                while (j > 0U) {
+                    /* Replace the element by the sum of that row
+                       and a multiple of the reference row  */
+                    in1 = *pSrcT2;
+                    *pSrcT2++ = in1 - (in * *pPRT_pDst++);
+
+                    /* Decrement loop counter */
+                    j--;
+                }
+            }
+
+            /* Increment temporary input pointer */
+            pSrcT1 = pSrcT1 + l;
+
+            /* Decrement loop counter */
+            k--;
+
+            /* Increment pivot index */
+            i++;
+        }
+
+        /* Increment the input pointer */
+        pSrc++;
+
+        /* Decrement the loop counter */
+        loopCnt--;
+
+        /* Increment the index modifier */
+        l++;
+    }
+
+    if ((flag != 1U) && (in == 0)) {
+        for (i = 0; i < m * n; i++) {
+            if (pSrcBase[i] != 0)
+                break;
+        }
+
+        if (i == m * n)
+            return 1;
+    }
+
+    return 0;
+}
+ 
diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c
new file mode 100644
index 000000000..587ee06b0
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/main.c
@@ -0,0 +1,82 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+//#include <stdint.h>
+//#include <string.h>
+
+#define N 5
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "inverse.h"
+#include "initialization.h"
+
+// C program to compute the Moore-Penrose pseudoinverse of a matrix
+ 
+// Print the n x n row-major matrix A: every entry is formatted with
+// "%4d " and each row is terminated by a newline.
+void display(int32_t *A, int32_t n)
+{
+    const int32_t *entry = A;
+    for (int row = 0; row < n; row++) {
+        for (int col = 0; col < n; col++) {
+            printf("%4d ", *entry++);
+        }
+        printf("\n");
+    }
+}
+
+// Driver program: core 0 computes and prints (A^T A)^-1 A^T for a test matrix
+int main()
+{
+
+    uint32_t core_id = mempool_get_core_id();
+    uint32_t num_cores = mempool_get_core_count();
+    // Initialize barrier and synchronize
+    mempool_barrier_init(core_id);
+
+    // Row-major N x N input (the values below assume N == 5). The previous
+    // initializer listed 49 values, which does not fit N * N = 25 elements;
+    // these are its leading 25 values.
+    int32_t matrix[N * N] = {  -2, 2, 7, 9, 4,
+                                0, 8, 1, 0, 0,
+                                3, 1, 0, 9,-3,
+                                1, 5, 0, 2, 1,
+                                7, 3,-1,-9, 4  };
+
+    int32_t t_matrix[N * N];
+    int32_t matrix_mult[N * N];
+    int32_t pseudoinverse[N * N];
+    int32_t inv[N * N]; // To store inverse 
+
+    // Only core 0 computes and prints
+    if(core_id == 0) {
+      display(matrix, N);
+
+      Transpose(matrix, t_matrix, N);
+      printf("\nThe Transpose is :\n");
+      display(t_matrix, N);
+      // matrix_mult = A^T * A
+      printf("The product of the matrix is: \n");
+      MatrixMult(t_matrix,matrix,matrix_mult, N);
+      display(matrix_mult, N);
+      printf("\nThe Inverse is :\n");
+      // Form the pseudoinverse only if the inverse exists: when inverse()
+      // returns 0 it has not written inv.
+      if (inverse(matrix_mult, inv, N)) {
+        display(inv, N);
+        MatrixMult(inv,t_matrix,pseudoinverse, N);
+        printf("\nThe Moore-Penrose inverse is :\n");
+        display(pseudoinverse, N);
+      }
+    }
+
+    mempool_barrier(num_cores);
+    return 0;
+}

From c56a0552808469af4c91c9b63bbe4fc24abd49e9 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 13 Jul 2022 18:39:55 +0200
Subject: [PATCH 02/22] [software] add singular value decomposition kernel

---
 software/apps/svd/SVD_Householder.txt | 781 ++++++++++++++++++++++++++
 software/apps/svd/main.c              |  93 +++
 software/apps/svd/nrutil.h            |  65 +++
 software/apps/svd/svd.c               | 237 ++++++++
 4 files changed, 1176 insertions(+)
 create mode 100644 software/apps/svd/SVD_Householder.txt
 create mode 100644 software/apps/svd/main.c
 create mode 100644 software/apps/svd/nrutil.h
 create mode 100644 software/apps/svd/svd.c

diff --git a/software/apps/svd/SVD_Householder.txt b/software/apps/svd/SVD_Householder.txt
new file mode 100644
index 000000000..1631212de
--- /dev/null
+++ b/software/apps/svd/SVD_Householder.txt
@@ -0,0 +1,781 @@
+////////////////////////////////////////////////////////////////////////////////
+// File: singular_value_decomposition.c                                       //
+// Contents:                                                                  //
+//    Singular_Value_Decomposition                                            //
+//    Singular_Value_Decomposition_Solve                                      //
+//    Singular_Value_Decomposition_Inverse                                    //
+////////////////////////////////////////////////////////////////////////////////
+
+#include <string.h>              // required for memcpy()
+#include <float.h>               // required for DBL_EPSILON
+#include <math.h>                // required for fabs(), sqrt();
+
+#define MAX_ITERATION_COUNT 30   // Maximum number of iterations
+
+// Internally defined routines: the three stages run by Singular_Value_Decomposition()
+static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
+    int ncols, double* U, double* V, double* diagonal, double* superdiagonal );
+static int  Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
+           double* U, double* V, double* diagonal, double* superdiagonal );
+static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
+                                double* singular_value, double* U, double* V);
+
+////////////////////////////////////////////////////////////////////////////////
+//  int Singular_Value_Decomposition(double* A, int nrows, int ncols,         //
+//        double* U, double* singular_values, double* V, double* dummy_array) //
+//                                                                            //
+//  Description:                                                              //
+//     This routine decomposes an m x n matrix A, with m >= n, into a product //
+//     of the three matrices U, D, and V', i.e. A = UDV', where U is an m x n //
+//     matrix whose columns are orthogonal, D is a n x n diagonal matrix, and //
+//     V is an n x n orthogonal matrix.  V' denotes the transpose of V.  If   //
+//     m < n, then the procedure may be used for the matrix A'.  The singular //
+//     values of A are the diagonal elements of the diagonal matrix D and     //
+//     correspond to the positive square roots of the eigenvalues of the      //
+//     matrix A'A.                                                            //
+//                                                                            //
+//     This procedure programmed here is based on the method of Golub and     //
+//     Reinsch as given on pages 134 - 151 of the "Handbook for Automatic     //
+//     Computation vol II - Linear Algebra" edited by Wilkinson and Reinsch   //
+//     and published by Springer-Verlag, 1971.                                //
+//                                                                            //
+//     The Golub and Reinsch's method for decomposing the matrix A into the   //
+//     product U, D, and V' is performed in three stages:                     //
+//       Stage 1:  Decompose A into the product of three matrices U1, B, V1'  //
+//         A = U1 B V1' where B is a bidiagonal matrix, and U1, and V1 are a  //
+//         product of Householder transformations.                            //
+//       Stage 2:  Use Givens transformations to reduce the bidiagonal matrix //
+//         B into the product of the three matrices U2, D, V2'.  The singular //
+//         value decomposition is then UDV'where U = U2 U1 and V' = V1' V2'.  //
+//       Stage 3:  Sort the matrix D in decreasing order of the singular      //
+//         values and interchange the columns of both U and V to reflect any  //
+//         change in the order of the singular values.                        //
+//                                                                            //
+//     After performing the singular value decomposition for A, call          //
+//     Singular_Value_Decomposition_Solve to solve the equation Ax = B or call//
+//     Singular_Value_Decomposition_Inverse to calculate the pseudo-inverse   //
+//     of A.                                                                  //
+//                                                                            //
+//  Arguments:                                                                //
+//     double* A                                                              //
+//        On input, the pointer to the first element of the matrix            //
+//        A[nrows][ncols].  The matrix A is unchanged.                        //
+//     int nrows                                                              //
+//        The number of rows of the matrix A.                                 //
+//     int ncols                                                              //
+//        The number of columns of the matrix A.                              //
+//     double* U                                                              //
+//        On input, a pointer to a matrix with the same number of rows and    //
+//        columns as the matrix A.  On output, the matrix with mutually       //
+//        orthogonal columns which is the left-most factor in the singular    //
+//        value decomposition of A.                                           //
+//     double* singular_values                                                //
+//        On input, a pointer to an array dimensioned to same as the number   //
+//        of columns of the matrix A, ncols.  On output, the singular values  //
+//        of the matrix A sorted in decreasing order.  This array corresponds //
+//        to the diagonal matrix in the singular value decomposition of A.    //
+//     double* V                                                              //
+//        On input, a pointer to a square matrix with the same number of rows //
+//        and columns as the columns of the matrix A, i.e. V[ncols][ncols].   //
+//        On output, the orthogonal matrix whose transpose is the right-most  //
+//        factor in the singular value decomposition of A.                    //
+//     double* dummy_array                                                    //
+//        On input, a pointer to an array dimensioned to same as the number   //
+//        of columns of the matrix A, ncols.  This array is used to store     //
+//        the super-diagonal elements resulting from the Householder reduction//
+//        of the matrix A to bidiagonal form, and as the input to the Givens  //
+//        procedure to reduce the bidiagonal form to diagonal form.           //
+//                                                                            //
+//  Return Values:                                                            //
+//     0  Success                                                             //
+//    -1  Failure - During the Givens reduction of the bidiagonal form to     //
+//                  diagonal form the procedure failed to terminate within    //
+//                  MAX_ITERATION_COUNT iterations.                           //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     double A[M][N];                                                        //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double singular_values[N];                                             //
+//     double* dummy_array;                                                   //
+//                                                                            //
+//     (your code to initialize the matrix A)                                 //
+//     dummy_array = (double*) malloc(N * sizeof(double));                    //
+//     if (dummy_array == NULL) {printf(" No memory available\n"); exit(0); } //
+//                                                                            //
+//     err = Singular_Value_Decomposition((double*) A, M, N, (double*) U,     //
+//                              singular_values, (double*) V, dummy_array);   //
+//                                                                            //
+//     free(dummy_array);                                                     //
+//     if (err < 0) printf(" Failed to converge\n");                          //
+//     else { printf(" The singular value decomposition of A is \n");         //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+int Singular_Value_Decomposition(double* A, int nrows, int ncols, double* U,
+                      double* singular_values, double* V, double* dummy_array)
+{
+   // Stage 1: reduce A to bidiagonal form (superdiagonal goes to dummy_array)
+   Householders_Reduction_to_Bidiagonal_Form( A, nrows, ncols, U, V,
+                                                singular_values, dummy_array);
+   // Stage 2: reduce to diagonal form; a negative result is reported as -1
+   int const status = Givens_Reduction_to_Diagonal_Form( nrows, ncols, U, V,
+                                                singular_values, dummy_array );
+   if (status < 0) return -1;
+   Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, U, V);
+   return 0;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,//
+//  int ncols, double* U, double* V, double* diagonal, double* superdiagonal )//
+//                                                                            //
+//  Description:                                                              //
+//     This routine decomposes an m x n matrix A, with m >= n, into a product //
+//     of the three matrices U, B, and V', i.e. A = UBV', where U is an m x n //
+//     matrix whose columns are orthogonal, B is a n x n bidiagonal matrix,   //
+//     and V is an n x n orthogonal matrix.  V' denotes the transpose of V.   //
+//     If m < n, then the procedure may be used for the matrix A'.  The       //
+//                                                                            //
+//     The matrix U is the product of Householder transformations which       //
+//     annihilate the subdiagonal components of A while the matrix V is       //
+//     the product of Householder transformations which annihilate the        //
+//     components of A to the right of the superdiagonal.                     //
+//                                                                            //
+//     The Householder transformation which leaves invariant the first k-1    //
+//     elements of the k-th column and annihilates the all the elements below //
+//     the diagonal element is P = I - (2/u'u)uu', u is an nrows-dimensional  //
+//     vector the first k-1 components of which are zero and the last         //
+//     components agree with the current transformed matrix below the diagonal//
+//     diagonal, the remaining k-th element is the diagonal element - s, where//
+//     s = (+/-)sqrt(sum of squares of the elements below the diagonal), the  //
+//     sign is chosen opposite that of the diagonal element.                  //
+//                                                                            //
+//  Arguments:                                                                //
+//     double* A                                                              //
+//        On input, the pointer to the first element of the matrix            //
+//        A[nrows][ncols].  The matrix A is unchanged.                        //
+//     int nrows                                                              //
+//        The number of rows of the matrix A.                                 //
+//     int ncols                                                              //
+//        The number of columns of the matrix A.                              //
+//     double* U                                                              //
+//        On input, a pointer to a matrix with the same number of rows and    //
+//        columns as the matrix A.  On output, the matrix with mutually       //
+//        orthogonal columns which is the left-most factor in the bidiagonal  //
+//        decomposition of A.                                                 //
+//     double* V                                                              //
+//        On input, a pointer to a square matrix with the same number of rows //
+//        and columns as the columns of the matrix A, i.e. V[ncols][ncols].   //
+//        On output, the orthogonal matrix whose transpose is the right-most  //
+//        factor in the bidiagonal decomposition of A.                        //
+//     double* diagonal                                                       //
+//        On input, a pointer to an array dimensioned to same as the number   //
+//        of columns of the matrix A, ncols.  On output, the diagonal of the  //
+//        bidiagonal matrix.                                                  //
+//     double* superdiagonal                                                  //
+//        On input, a pointer to an array dimensioned to same as the number   //
+//        of columns of the matrix A, ncols.  On output, the superdiagonal    //
+//        of the bidiagonal matrix.                                           //
+//                                                                            //
+//  Return Values:                                                            //
+//     The function is of type void and therefore does not return a value.    //
+//     The matrices U, V, and the diagonal and superdiagonal are calculated   //
+//     using the addresses passed in the argument list.                       //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     double A[M][N];                                                        //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double diagonal[N];                                                    //
+//     double superdiagonal[N];                                               //
+//                                                                            //
+//     (your code to initialize the matrix A - Note this routine is not       //
+//     (accessible from outside i.e. it is declared static)                   //
+//                                                                            //
+//     Householders_Reduction_to_Bidiagonal_Form((double*) A, nrows, ncols,   //
+//                   (double*) U, (double*) V, diagonal, superdiagonal )      //
+//                                                                            //
+//     free(dummy_array);                                                     //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
+    int ncols, double* U, double* V, double* diagonal, double* superdiagonal )
+{  // Factors A as U * B * V' with B bidiagonal (diagonal[], superdiagonal[]).
+   int i,j,k,ip1;                 // ip1 is kept equal to i + 1 in every loop
+   double s, s2, si, scale;
+   double dum;                    // not referenced anywhere in this routine
+   double *pu, *pui, *pv, *pvi;   // pui/pvi: row i of U/V; pu/pv: row cursors
+   double half_norm_squared;      // holds -(u'u)/2 for the current reflector u
+
+// Copy A to U: all further work happens in U, so A is only ever read.  Every
+// matrix is addressed row-major with a row stride of ncols.
+   memcpy(U,A, sizeof(double) * nrows * ncols);
+
+// s and scale carry the previous row reflector into superdiagonal[i] at the
+// top of the loop; both start at zero, so superdiagonal[0] becomes 0.
+   diagonal[0] = 0.0;
+   s = 0.0;
+   scale = 0.0;
+   for ( i = 0, pui = U, ip1 = 1; i < ncols; pui += ncols, i++, ip1++ ) {
+      superdiagonal[i] = scale * s;
+//
+//                  Perform Householder transform on columns.
+//
+//       scale = sum of absolute values of column i from row i downwards; the
+//       column is divided by it before its squared norm s2 is accumulated.
+//
+      for (j = i, pu = pui, scale = 0.0; j < nrows; j++, pu += ncols)
+         scale += fabs( *(pu + i) );
+
+      if (scale > 0.0) {
+         for (j = i, pu = pui, s2 = 0.0; j < nrows; j++, pu += ncols) {
+            *(pu + i) /= scale;
+            s2 += *(pu + i) * *(pu + i);
+         }
+//
+//       s is the norm of the scaled column, signed opposite to the diagonal
+//       element, so (diagonal element - s) below adds the two magnitudes.
+//
+         s = ( *(pui + i) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
+//       With u = scaled column whose leading entry x has s subtracted,
+//       u'u = 2 * (s2 - x * s), so the value stored here is -(u'u)/2.
+//
+         half_norm_squared = *(pui + i) * s - s2;
+//
+//       Transform remaining columns by the Householder transform.
+//
+         *(pui + i) -= s;
+
+         for (j = ip1; j < ncols; j++) {
+            for (k = i, si = 0.0, pu = pui; k < nrows; k++, pu += ncols)
+               si += *(pu + i) * *(pu + j);
+            si /= half_norm_squared;
+            for (k = i, pu = pui; k < nrows; k++, pu += ncols) {
+               *(pu + j) += si * *(pu + i);
+            }
+         }
+      }
+      for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) *= scale;
+      diagonal[i] = s * scale;
+//
+//                  Perform Householder transform on rows.
+//
+//       Same steps for row i, starting at column i + 1.  Skipped when there
+//       is no such row (i >= nrows) or no column to its right (last column).
+//
+      s = 0.0;
+      scale = 0.0;
+      if (i >= nrows || i == (ncols - 1) ) continue;
+      for (j = ip1; j < ncols; j++) scale += fabs ( *(pui + j) );
+      if ( scale > 0.0 ) {
+         for (j = ip1, s2 = 0.0; j < ncols; j++) {
+            *(pui + j) /= scale;
+            s2 += *(pui + j) * *(pui + j);
+         }
+         s = ( *(pui + ip1) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
+//
+//       half_norm_squared = -(u'u)/2 again, now for the row reflector.
+//
+         half_norm_squared = *(pui + ip1) * s - s2;
+//
+//       Transform the rows below; superdiagonal[ip1..] is borrowed as scratch
+//       for u / half_norm_squared (entry i itself was already stored above).
+         *(pui + ip1) -= s;
+         for (k = ip1; k < ncols; k++)
+            superdiagonal[k] = *(pui + k) / half_norm_squared;
+         if ( i < (nrows - 1) ) {
+            for (j = ip1, pu = pui + ncols; j < nrows; j++, pu += ncols) {
+               for (k = ip1, si = 0.0; k < ncols; k++)
+                  si += *(pui + k) * *(pu + k);
+               for (k = ip1; k < ncols; k++) {
+                  *(pu + k) += si * superdiagonal[k];
+               }
+            }
+         }
+         for (k = ip1; k < ncols; k++) *(pui + k) *= scale;
+      }
+   }
+
+// Update V: built from row i of U (columns > i), from the last row backwards.
+   pui = U + ncols * (ncols - 2);
+   pvi = V + ncols * (ncols - 1);
+   *(pvi + ncols - 1) = 1.0;       // V[ncols-1][ncols-1] = 1
+   s = superdiagonal[ncols - 1];
+   pvi -= ncols;
+   for (i = ncols - 2, ip1 = ncols - 1; i >= 0; i--, pui -= ncols,
+                                                      pvi -= ncols, ip1-- ) {
+      if ( s != 0.0 ) {            // s holds superdiagonal[ip1] at this point
+         pv = pvi + ncols;
+         for (j = ip1; j < ncols; j++, pv += ncols)
+            *(pv + i) = ( *(pui + j) / *(pui + ip1) ) / s;
+         for (j = ip1; j < ncols; j++) {
+            si = 0.0;
+            for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
+               si += *(pui + k) * *(pv + j);
+            for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
+               *(pv + j) += si * *(pv + i);
+         }
+      }
+      pv = pvi + ncols;
+      for ( j = ip1; j < ncols; j++, pv += ncols ) {   // clear V[i][j], V[j][i], j > i
+         *(pvi + j) = 0.0;
+         *(pv + i) = 0.0;
+      }
+      *(pvi + i) = 1.0;
+      s = superdiagonal[i];
+   }
+
+// Update U: column i (rows i and below) is rewritten in place, from the last
+// column backwards; entries of row i to the right of the diagonal are zeroed.
+   pui = U + ncols * (ncols - 1);
+   for (i = ncols - 1, ip1 = ncols; i >= 0; ip1 = i, i--, pui -= ncols ) {
+      s = diagonal[i];
+      for ( j = ip1; j < ncols; j++) *(pui + j) = 0.0;
+      if ( s != 0.0 ) {
+         for (j = ip1; j < ncols; j++) {
+            si = 0.0;
+            pu = pui + ncols;
+            for (k = ip1; k < nrows; k++, pu += ncols)
+               si += *(pu + i) * *(pu + j);
+            si = (si / *(pui + i) ) / s;
+            for (k = i, pu = pui; k < nrows; k++, pu += ncols)
+               *(pu + j) += si * *(pu + i);
+         }
+         for (j = i, pu = pui; j < nrows; j++, pu += ncols){
+            *(pu + i) /= s;
+         }
+      }
+      else   // zero diagonal: column i is cleared from row i down
+         for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) = 0.0;
+      *(pui + i) += 1.0;
+   }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,        //
+//         double* U, double* V, double* diagonal, double* superdiagonal )    //
+//                                                                            //
+//  Description:                                                              //
+//     This routine decomposes a bidiagonal matrix given by the arrays        //
+//     diagonal and superdiagonal into a product of three matrices U1, D and  //
+//     V1', the matrix U1 premultiplies U and is returned in U, the matrix    //
+//     V1 premultiplies V and is returned in V.  The matrix D is a diagonal   //
+//     matrix and replaces the array diagonal.                                //
+//                                                                            //
+//     The method used to annihilate the offdiagonal elements is a variant    //
+//     of the QR transformation.  The method consists of applying Givens      //
+//     rotations to the right and the left of the current matrix until        //
+//     the new off-diagonal elements are chased out of the matrix.            //
+//                                                                            //
+//     The process is an iterative process which due to roundoff errors may   //
+//     not converge within a predefined number of iterations.  (This should   //
+//     be unusual.)                                                           //
+//                                                                            //
+//  Arguments:                                                                //
+//     int nrows                                                              //
+//        The number of rows of the matrix U.                                 //
+//     int ncols                                                              //
+//        The number of columns of the matrix U.                              //
+//     double* U                                                              //
+//        On input, a pointer to a matrix already initialized to a matrix     //
+//        with mutually orthogonal columns.   On output, the matrix with      //
+//        mutually orthogonal columns.                                        //
+//     double* V                                                              //
+//        On input, a pointer to a square matrix with the same number of rows //
+//        and columns as the columns of the matrix U, i.e. V[ncols][ncols].   //
+//        The matrix V is assumed to be initialized to an orthogonal matrix.  //
+//        On output, V is an orthogonal matrix.                               //
+//     double* diagonal                                                       //
+//        On input, a pointer to an array of dimension ncols which initially  //
+//        contains the diagonal of the bidiagonal matrix.  On output,         //
+//        it contains the diagonal of the diagonal matrix.                    //
+//     double* superdiagonal                                                  //
+//        On input, a pointer to an array of dimension ncols where initially  //
+//        the first component is zero and the successive components form the  //
+//        superdiagonal of the bidiagonal matrix.                             //
+//                                                                            //
+//  Return Values:                                                            //
+//     0  Success                                                             //
+//    -1  Failure - The procedure failed to terminate within                  //
+//                  MAX_ITERATION_COUNT iterations.                           //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double diagonal[N];                                                    //
+//     double superdiagonal[N];                                               //
+//     int err;                                                               //
+//                                                                            //
+//     (your code to initialize the matrices U, V, diagonal, and )            //
+//     ( superdiagonal.  - Note this routine is not accessible from outside)  //
+//     ( i.e. it is declared static.)                                         //
+//                                                                            //
+//     err = Givens_Reduction_to_Diagonal_Form( M,N,(double*)U,(double*)V,    //
+//                                                 diagonal, superdiagonal ); //
+//     if ( err < 0 ) printf("Failed to converge\n");                         //
+//     else { ... }                                                           //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
+           double* U, double* V, double* diagonal, double* superdiagonal )
+{  // Diagonalises the bidiagonal matrix (diagonal[], superdiagonal[]) with Givens
+   // rotations applied to U and V; returns 0, or -1 past MAX_ITERATION_COUNT.
+   double epsilon;      // magnitudes <= epsilon are treated as zero
+   double c, s;         // multipliers of the current plane rotation
+   double f,g,h;
+   double x,y,z;
+   double *pu, *pv;     // row cursors into U and V (row stride ncols)
+   int i,j,k,m;
+   int rotation_test;
+   int iteration_count;
+   // x = largest |diagonal[i]| + |superdiagonal[i]|; epsilon is scaled by it.
+   for (i = 0, x = 0.0; i < ncols; i++) {
+      y = fabs(diagonal[i]) + fabs(superdiagonal[i]);
+      if ( x < y ) x = y;
+   }
+   epsilon = x * DBL_EPSILON;
+   for (k = ncols - 1; k >= 0; k--) {   // settle diagonal[k], last entry first
+      iteration_count = 0;
+      while(1) {
+         rotation_test = 1;   // cleared when superdiagonal[m] is negligible
+         for (m = k; m >= 0; m--) {
+            if (fabs(superdiagonal[m]) <= epsilon) {rotation_test = 0; break;}
+            if (fabs(diagonal[m-1]) <= epsilon) break;  // m == 0 here would read diagonal[-1]
+         }
+         if (rotation_test) {   // scan stopped on a negligible diagonal[m-1]
+            c = 0.0;
+            s = 1.0;
+            for (i = m; i <= k; i++) {
+               f = s * superdiagonal[i];
+               superdiagonal[i] *= c;
+               if (fabs(f) <= epsilon) break;
+               g = diagonal[i];
+               h = sqrt(f*f + g*g);
+               diagonal[i] = h;
+               c = g / h;
+               s = -f / h;
+               for (j = 0, pu = U; j < nrows; j++, pu += ncols) {   // columns m-1, i of U
+                  y = *(pu + m - 1);
+                  z = *(pu + i);
+                  *(pu + m - 1 ) = y * c + z * s;
+                  *(pu + i) = -y * s + z * c;
+               }
+            }
+         }
+         z = diagonal[k];
+         if (m == k ) {   // split point is at k itself: diagonal[k] is final
+            if ( z < 0.0 ) {   // store it non-negative and negate column k of V
+               diagonal[k] = -z;
+               for ( j = 0, pv = V; j < ncols; j++, pv += ncols)
+                  *(pv + k) = - *(pv + k);
+            }
+            break;
+         }
+         else {
+            if ( iteration_count >= MAX_ITERATION_COUNT ) return -1;   // give up on this k
+            iteration_count++;
+            x = diagonal[m];   // x, y, z, g, h: inputs to the starting value f below
+            y = diagonal[k-1];
+            g = superdiagonal[k-1];
+            h = superdiagonal[k];
+            f = ( (y - z) * ( y + z ) + (g - h) * (g + h) )/(2.0 * h * y);
+            g = sqrt( f * f + 1.0 );
+            if ( f < 0.0 ) g = -g;
+            f = ( (x - z) * (x + z) + h * (y / (f + g) - h) ) / x;
+// Next QR transformation (rotations for i = m + 1 .. k, applied to V then U).
+            c = 1.0;
+            s = 1.0;
+            for (i = m + 1; i <= k; i++) {
+               g = superdiagonal[i];
+               y = diagonal[i];
+               h = s * g;
+               g *= c;
+               z = sqrt( f * f + h * h );
+               superdiagonal[i-1] = z;
+               c = f / z;
+               s = h / z;
+               f =  x * c + g * s;
+               g = -x * s + g * c;
+               h = y * s;
+               y *= c;
+               for (j = 0, pv = V; j < ncols; j++, pv += ncols) {   // columns i-1, i of V
+                  x = *(pv + i - 1);
+                  z = *(pv + i);
+                  *(pv + i - 1) = x * c + z * s;
+                  *(pv + i) = -x * s + z * c;
+               }
+               z = sqrt( f * f + h * h );
+               diagonal[i - 1] = z;
+               if (z != 0.0) {   // otherwise the previous c and s are reused
+                  c = f / z;
+                  s = h / z;
+               }
+               f = c * g + s * y;
+               x = -s * g + c * y;
+               for (j = 0, pu = U; j < nrows; j++, pu += ncols) {   // columns i-1, i of U
+                  y = *(pu + i - 1);
+                  z = *(pu + i);
+                  *(pu + i - 1) = c * y + s * z;
+                  *(pu + i) = -s * y + c * z;
+               }
+            }
+            superdiagonal[m] = 0.0;
+            superdiagonal[k] = f;
+            diagonal[k] = x;
+         }
+      }
+   }
+   return 0;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,       //
+//                            double* singular_values, double* U, double* V)  //
+//                                                                            //
+//  Description:                                                              //
+//     This routine sorts the singular values from largest to smallest        //
+//     singular value and interchanges the columns of U and the columns of V  //
+//     whenever a swap is made.  I.e. if the i-th singular value is swapped   //
+//     with the j-th singular value, then the i-th and j-th columns of U are  //
+//     interchanged and the i-th and j-th columns of V are interchanged.      //
+//                                                                            //
+//  Arguments:                                                                //
+//     int nrows                                                              //
+//        The number of rows of the matrix U.                                 //
+//     int ncols                                                              //
+//        The number of columns of the matrix U.                              //
+//     double* singular_values                                                //
+//        On input, a pointer to the array of singular values.  On output, the//
+//        sorted array of singular values.                                    //
+//     double* U                                                              //
+//        On input, a pointer to a matrix already initialized to a matrix     //
+//        with mutually orthogonal columns.  On output, the matrix with       //
+//        mutually orthogonal possibly permuted columns.                      //
+//     double* V                                                              //
+//        On input, a pointer to a square matrix with the same number of rows //
+//        and columns as the columns of the matrix U, i.e. V[ncols][ncols].   //
+//        The matrix V is assumed to be initialized to an orthogonal matrix.  //
+//        On output, V is an orthogonal matrix with possibly permuted columns.//
+//                                                                            //
+//  Return Values:                                                            //
+//        The function is of type void.                                       //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double diagonal[N];                                                    //
+//                                                                            //
+//     (your code to initialize the matrices U, V, and diagonal. )            //
+//     ( - Note this routine is not accessible from outside)                  //
+//     ( i.e. it is declared static.)                                         //
+//                                                                            //
+//     Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values,      //
+//                                                 (double*) U, (double*) V); //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
+                                double* singular_values, double* U, double* V)
+{  // Selection sort, largest value first.  Each exchange of two singular values
+   // also exchanges the matching columns of U (nrows rows) and V (ncols rows);
+   // both matrices are row-major with ncols entries per row.
+   int slot, probe, row, best;
+   double held;
+   double *here, *there;
+
+   for (slot = 0; slot < ncols - 1; slot++) {
+      // Strict comparison: among equal maxima the earliest index is chosen.
+      for (best = slot, probe = slot + 1; probe < ncols; probe++) {
+         if (singular_values[best] < singular_values[probe]) best = probe;
+      }
+      // Already in place?  Then there is nothing to move.
+      if (best != slot) {
+         held = singular_values[slot];
+         singular_values[slot] = singular_values[best];
+         singular_values[best] = held;
+         // Swap columns slot and best of U, walking down one row per step.
+         for (row = 0, here = U + slot, there = U + best; row < nrows; row++) {
+            held = *there; *there = *here; *here = held;
+            here += ncols; there += ncols;
+         }
+         // Same column swap in V.
+         for (row = 0, here = V + slot, there = V + best; row < ncols; row++) {
+            held = *there; *there = *here; *here = held;
+            here += ncols; there += ncols;
+         }
+      }
+   }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//  void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,  //
+//              double tolerance, int nrows, int ncols, double *B, double* x) //
+//                                                                            //
+//  Description:                                                              //
+//     This routine solves the system of linear equations Ax=B where A =UDV', //
+//     is the singular value decomposition of A.  Given UDV'x=B, then         //
+//     x = V(1/D)U'B, where 1/D is the pseudo-inverse of D, i.e. if D[i] > 0  //
+//     then (1/D)[i] = 1/D[i] and if D[i] = 0, then (1/D)[i] = 0.  Since      //
+//     the singular values are subject to round-off error, a tolerance is     //
+//     given so that if D[i] < tolerance, D[i] is treated as if it is 0.      //
+//     The default tolerance is D[0] * DBL_EPSILON * ncols, if the user       //
+//     specified tolerance is less than the default tolerance, the default    //
+//     tolerance is used.                                                     //
+//                                                                            //
+//  Arguments:                                                                //
+//     double* U                                                              //
+//        A matrix with mutually orthonormal columns.                         //
+//     double* D                                                              //
+//        A diagonal matrix with decreasing non-negative diagonal elements.   //
+//        i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i.                  //
+//     double* V                                                              //
+//        An orthogonal matrix.                                               //
+//     double tolerance                                                       //
+//        A lower bound for non-zero singular values (provided tolerance >    //
+//        ncols * DBL_EPSILON * D[0]).                                        //
+//     int nrows                                                              //
+//        The number of rows of the matrix U and B.                           //
+//     int ncols                                                              //
+//        The number of columns of the matrix U.  Also the number of rows and //
+//        columns of the matrices D and V.                                    //
+//     double* B                                                              //
+//        A pointer to a vector dimensioned as nrows which is the  right-hand //
+//        side of the equation Ax = B where A = UDV'.                         //
+//     double* x                                                              //
+//        A pointer to a vector dimensioned as ncols, which is the least      //
+//        squares solution of the equation Ax = B where A = UDV'.             //
+//                                                                            //
+//  Return Values:                                                            //
+//        The function is of type void.                                       //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     #define NB                                                             //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double D[N];                                                           //
+//     double B[M];                                                           //
+//     double x[N];                                                           //
+//     double tolerance;                                                      //
+//                                                                            //
+//     (your code to initialize the matrices U,D,V,B)                         //
+//                                                                            //
+//     Singular_Value_Decomposition_Solve((double*) U, D, (double*) V,        //
+//                                              tolerance, M, N, B, x);       //
+//                                                                            //
+//     printf(" The solution of Ax=B is \n");                                 //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+
+void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,
+                double tolerance, int nrows, int ncols, double *B, double* x)
+{  // Least-squares solution of (U D V') x = B:  x = V (1/D) U'B, skipping every
+   // column j with D[j] <= max(tolerance, DBL_EPSILON * D[0] * ncols).  U is
+   // nrows x ncols and V is ncols x ncols (row-major); x receives ncols values.
+   int i,j,k;
+   double dum;
+
+   if (ncols <= 0) return;               // no unknowns; D[0] must not be read
+   dum = DBL_EPSILON * D[0] * (double) ncols;
+   if (tolerance < dum) tolerance = dum;
+
+   for (i = 0; i < ncols; i++) x[i] = 0.0;
+   for (j = 0; j < ncols; j++) {
+      if (!(D[j] > tolerance)) continue;
+      // (U'B)[j] is the same for every x[i]: form it once, then spread it.
+      for (k = 0, dum = 0.0; k < nrows; k++) dum += U[k * ncols + j] * B[k];
+      for (i = 0; i < ncols; i++) x[i] += dum * V[i * ncols + j] / D[j];
+   }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//  void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,//
+//                     double tolerance, int nrows, int ncols, double *Astar) //
+//                                                                            //
+//  Description:                                                              //
+//     This routine calculates the pseudo-inverse of the matrix A = UDV'.     //
+//     where U, D, V constitute the singular value decomposition of A.        //
+//     Let Astar be the pseudo-inverse then Astar = V(1/D)U', where 1/D is    //
+//     the pseudo-inverse of D, i.e. if D[i] > 0 then (1/D)[i] = 1/D[i] and   //
+//     if D[i] = 0, then (1/D)[i] = 0.  Because the singular values are       //
+//     subject to round-off error, a tolerance is given so that if            //
+//     D[i] < tolerance, D[i] is treated as if it were 0.                     //
+//     The default tolerance is D[0] * DBL_EPSILON * ncols, assuming that the //
+//     diagonal matrix of singular values is sorted from largest to smallest, //
+//     if the user specified tolerance is less than the default tolerance,    //
+//     then the default tolerance is used.                                    //
+//                                                                            //
+//  Arguments:                                                                //
+//     double* U                                                              //
+//        A matrix with mutually orthonormal columns.                         //
+//     double* D                                                              //
+//        A diagonal matrix with decreasing non-negative diagonal elements.   //
+//        i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i.                  //
+//     double* V                                                              //
+//        An orthogonal matrix.                                               //
+//     double tolerance                                                       //
+//        A lower bound for non-zero singular values (provided tolerance >    //
+//        ncols * DBL_EPSILON * D[0]).                                        //
+//     int nrows                                                              //
+//        The number of rows of the matrix U and of columns of Astar.         //
+//     int ncols                                                              //
+//        The number of columns of the matrix U.  Also the number of rows and //
+//        columns of the matrices D and V.                                    //
+//     double* Astar                                                          //
+//        On input, a pointer to the first element of an ncols x nrows matrix.//
+//        On output, the pseudo-inverse of UDV'.                              //
+//                                                                            //
+//  Return Values:                                                            //
+//        The function is of type void.                                       //
+//                                                                            //
+//  Example:                                                                  //
+//     #define M                                                              //
+//     #define N                                                              //
+//     double U[M][N];                                                        //
+//     double V[N][N];                                                        //
+//     double D[N];                                                           //
+//     double Astar[N][M];                                                    //
+//     double tolerance;                                                      //
+//                                                                            //
+//     (your code to initialize the matrices U,D,V)                           //
+//                                                                            //
+//     Singular_Value_Decomposition_Inverse((double*) U, D, (double*) V,      //
+//                                        tolerance, M, N, (double*) Astar);  //
+//                                                                            //
+//     printf(" The pseudo-inverse of A = UDV' is \n");                       //
+//           ...                                                              //
+////////////////////////////////////////////////////////////////////////////////
+//                                                                            //
+
+void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,
+                        double tolerance, int nrows, int ncols, double *Astar)
+{  // Astar (ncols x nrows, row-major) = V (1/D) U'.  Terms are kept only for
+   // D[c] > max(tolerance, DBL_EPSILON * D[0] * ncols); the others count as 0.
+   int r, q, c;
+   double *out = Astar;
+   const double floor_tol = DBL_EPSILON * D[0] * (double) ncols;
+   if (tolerance < floor_tol) tolerance = floor_tol;
+   for (r = 0; r < ncols; r++)
+      for (q = 0; q < nrows; q++, out++)
+         for (c = 0, *out = 0.0; c < ncols; c++)
+            if (D[c] > tolerance)
+               *out += V[r * ncols + c] * U[q * ncols + c] / D[c];
+}
diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c
new file mode 100644
index 000000000..18e35f510
--- /dev/null
+++ b/software/apps/svd/main.c
@@ -0,0 +1,93 @@
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "nrutil.h"
+#include "svd.c"
+
+
+// Matrix dimensions, passed as (M, N) to init_matrix() and svdcmp() in main():
+#define M 4
+#define N 32
+// NOTE(review): svdcmp() indexes v up to n * m + n, beyond M * N entries -- confirm size
+int32_t matrix_U[M * N] __attribute__((section(".l1_prio")));
+int32_t matrix_V[M * N] __attribute__((section(".l1_prio")));
+int32_t matrix_W[N] __attribute__((section(".l1_prio")));
+
+// Fill the row-major num_rows x num_columns matrix with
+//   matrix[i * num_columns + j] = a * i + b * j + c
+// in parallel: flattened element k is written by the core whose core_id
+// equals k % num_cores, so every element is written exactly once.
+//
+// The previous tiling split the shorter dimension into 8 blocks of (dim / 8)
+// entries and stepped the other one by (num_cores / 8). It wrote nothing when
+// the shorter dimension was below 8 (the 4 x 32 matrices of this app),
+// skipped the tail when it was not a multiple of 8, and never advanced the
+// outer loop with fewer than 8 cores.
+//
+// matrix     Destination buffer holding num_rows * num_columns entries.
+// a, b, c    Coefficients of the row index, of the column index, and offset.
+// core_id    Index of the calling core, expected in [0, num_cores).
+// num_cores  Number of participating cores; 0 is treated as 1 (stride > 0).
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const num_el = num_rows * num_columns;
+  uint32_t const stride = (num_cores == 0) ? 1 : num_cores;
+  for (uint32_t k = core_id; k < num_el; k += stride) {
+    uint32_t const i = k / num_columns; // row index
+    uint32_t const j = k % num_columns; // column index
+    matrix[k] = a * (int32_t)i + b * (int32_t)j + c;
+  }
+}
+
+// Each core fills its own chunk of `split` consecutive entries, starting at
+// core_id * split, with vector[i] = a * (i % split) + b. The chunk is clipped
+// to num_el: the old loop wrote past the end of the vector for every core
+// with core_id * split >= num_el, and its tail loop was dead code without an increment.
+void init_vector(int32_t *vector, uint32_t num_el,
+                 int32_t a, int32_t b, uint32_t core_id) {
+  uint32_t const split = 8; // Number of entries handled by each core
+  uint32_t const first = core_id * split;
+  if (first >= num_el)
+    return; // nothing left for this core
+  uint32_t const last = (num_el - first < split) ? num_el : first + split;
+  for (uint32_t i = first; i < last; i++)
+    vector[i] = a * (int32_t)(i % split) + b;
+}
+
+int volatile error __attribute__((section(".l1")));
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+  // Only core 0 clears the global error flag that main() returns
+  if (core_id == 0) {
+    error = 0;
+  }
+  // Coefficients (a, b, c) handed to init_matrix() for U and for V
+  int32_t const U_a = 1;
+  int32_t const U_b = 1;
+  int32_t const U_c = -32;
+  int32_t const V_a = 2;
+  int32_t const V_b = 1;
+  int32_t const V_c = 16;
+  // Every core calls the init routines; W reuses V_a and V_b as (a, b)
+  init_matrix(matrix_U, M, N, U_a, U_b, U_c, core_id, num_cores);
+  init_matrix(matrix_V, M, N, V_a, V_b, V_c, core_id, num_cores);
+  init_vector(matrix_W, N, V_a, V_b, core_id);
+  mempool_barrier(num_cores);
+  // The decomposition is executed by core 0 only
+  if (core_id == 0) {
+    // Run the matrix SVD with a = matrix_U, w = matrix_W, v = matrix_V
+    svdcmp(matrix_U, M, N, matrix_W, matrix_V);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+
+  return error;
+}
diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h
new file mode 100644
index 000000000..27b55fec2
--- /dev/null
+++ b/software/apps/svd/nrutil.h
@@ -0,0 +1,65 @@
+//#include <stdio.h>
+//#include <stddef.h>
+//#include <stdlib.h>
+
+#ifndef NR_UTILS_H
+#define NR_UTILS_H
+
+#define NR_END 1
+#define FREE_ARG char *
+// Helper macros: the static temporaries let each argument be evaluated only once.
+static int32_t sqrarg;
+#define SQR(a)     ((sqrarg = (a)) == 0 ? 0 : sqrarg *sqrarg)
+static int32_t dsqrarg;
+#define DSQR(a)    ((dsqrarg = (a)) == 0 ? 0 : dsqrarg *dsqrarg)
+static int32_t dmaxarg1, dmaxarg2;
+#define DMAX(a, b) (dmaxarg1 = (a), dmaxarg2 = (b), (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2))
+static int32_t dminarg1, dminarg2;
+#define DMIN(a, b) (dminarg1 = (a), dminarg2 = (b), (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2))
+static int32_t maxarg1, maxarg2;
+#define FMAX(a, b) (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
+static int32_t minarg1, minarg2;
+#define FMIN(a, b) (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2))
+static long lmaxarg1, lmaxarg2;
+#define LMAX(a, b) (lmaxarg1 = (a), lmaxarg2 = (b), (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2))
+static long lminarg1, lminarg2;
+#define LMIN(a, b) (lminarg1 = (a), lminarg2 = (b), (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2))
+static int32_t imaxarg1, imaxarg2;
+#define IMAX(a, b) (imaxarg1 = (a), imaxarg2 = (b), (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2))
+static int32_t iminarg1, iminarg2;
+#define IMIN(a, b) (iminarg1 = (a), iminarg2 = (b), (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
+#define ABS(a) ((a) < 0 ? -(a) : (a)) // argument parenthesised so ABS(x - y) works; a is evaluated twice
+#define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a)) // |a| when b >= 0, otherwise -|a|
+
+int32_t sqrt_q32  (   const int32_t number,
+                      const uint32_t fracBits);
+
+#define sqrt2 0b1011010100000100
+int32_t sqrt_q32  (   const int32_t number,
+                      const uint32_t fracBits) {
+    // Fixed-point square root by binary search over [0, 46341]: returns a root
+    // with (root * root) >> fracBits == number or, if there is none, the largest
+    // root whose shifted square is below number. fracBits must be below 64.
+    int32_t root = 0;
+    int32_t start = 0;
+    int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF)
+
+    if (number > 0) { // non-positive inputs return 0
+      while (start <= end) {
+          int32_t const mid = (start + end) >> 1;
+          int64_t const square = ((int64_t)mid * mid) >> fracBits; // 46341^2 overflows int32_t
+          if (square == number) {
+              root = mid;
+              break;
+          }
+          if (square < number) {
+              start = mid + 1;
+              root = mid;
+          } else {
+              end = mid - 1;
+          }
+      }
+    }
+    return root;
+}
+
+#endif
diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c
new file mode 100644
index 000000000..a53c2695b
--- /dev/null
+++ b/software/apps/svd/svd.c
@@ -0,0 +1,237 @@
+int32_t pythag(int32_t a, int32_t b);
+void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v);
+
+
+int32_t pythag(int32_t a, int32_t b) {
+    // larger(|a|, |b|) * sqrt_q32(1 + (smaller / larger)^2, 4); 0 when both are 0
+    int32_t const mag_a = ABS(a), mag_b = ABS(b);
+    if (mag_a > mag_b)
+        return mag_a * sqrt_q32(1 + SQR(mag_b / mag_a), 4);
+    if (mag_b == 0)
+        return 0;
+    return mag_b * sqrt_q32(1 + SQR(mag_a / mag_b), 4);
+}
+
+void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) {
+    int32_t flag, i, its, j, jj, k, l, nm; // flag, its, jj, nm are only used by the disabled code below
+    int32_t anorm, c, f, g, h, s, scale, x, y, z; // c, x, y, z are only used by the disabled code below
+    int32_t rv1[n]; // NOTE(review): indexed with i = 1..n below, so rv1[n] is one past the end
+    // Work-in-progress integer SVD: a, w and v are modified in place. Only the two
+    // loops below are active; the rest of the routine is commented out.
+    // NOTE(review): presumably ported from a 1-based routine (indices run 1..m / 1..n) -- confirm
+    g = scale = anorm = 0.0;
+    for (i = 1; i <= n; i++) {
+        l = i + 1;
+        rv1[i] = scale * g;
+        g = s = scale = 0.0;
+        if (i <= m) {
+            for (k = i; k <= m; k++) {
+                scale += ABS(a[k * m + i]); // sum of |a[k * m + i]| for k = i..m
+            }
+            if (scale) {
+                for (k = i; k <= m; k++) {
+                    a[k * m + i] /= scale; // integer division
+                    s += a[k * m + i] * a[k * m + i];
+                }
+                f = a[i * m + i];
+                g = -SIGN(sqrt_q32(s,4), f);
+                h = f * g - s;
+                a[i * m + i] = f - g;
+                for (j = l; j <= n; j++) { // NOTE(review): j is not referenced in this loop body -- confirm the intended index
+                    for (s = 0.0, k = i; k <= m; k++) {
+                        s += a[k * m + i] * a[k * m + i];
+                    }
+                    f = s / h;
+                    for (k = i; k <= m; k++) {
+                        a[k * m + i] += f * a[k * m + i];
+                    }
+                }
+                for (k = i; k <= m; k++) {
+                    a[k * m + i] *= scale;
+                }
+            }
+        }
+        w[i] = scale * g;
+        g = s = scale = 0.0;
+        if (i <= m && i != n) {
+            for (k = l; k <= n; k++) {
+                scale += ABS(a[k * m + i]);
+            }
+            if (scale) {
+                for (k = l; k <= n; k++) {
+                    a[k * m + i] /= scale; // NOTE(review): scales a[k * m + i] but sums a[i * m + k] below -- confirm
+                    s += a[i * m + k] * a[i * m + k];
+                }
+                f = a[i * m + l];
+                g = -SIGN(sqrt_q32(s,4), f);
+                h = f * g - s;
+                a[i * m + l] = f - g;
+                for (k = l; k <= n; k++) {
+                    rv1[k] = a[i * m + k] / h;
+                }
+                for (j = l; j <= m; j++) {
+                    for (s = 0, k = l; k <= n; k++) {
+                        s += a[j * m + k] * a[i * m + k];
+                    }
+                    for (k = l; k <= n; k++) {
+                        a[j * m + k] += s * rv1[k];
+                    }
+                }
+                for (k = l; k <= n; k++) {
+                    a[i * m + k] *= scale;
+                }
+            }
+        }
+        anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i]))); // running maximum of |w[i]| + |rv1[i]|
+    }
+
+    for (i = n; i >= 1; i--) { // second loop: writes v, walking i from n down to 1
+        if (i < n) {
+            if (g) {
+                for (j = l; j <= n; j++) {
+                    v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g; // NOTE(review): element divided by itself -- confirm
+                }
+                for (j = l; j <= n; j++) {
+                    for (s = 0, k = l; k <= n; k++) {
+                        s += a[i * m + k] * v[k * m + j];
+                    }
+                    for (k = l; k <= n; k++) {
+                        v[k * m + j] += s * v[k * m + i];
+                    }
+                }
+            }
+            for (j = l; j <= n; j++) {
+                v[i * m + j] = v[j * m + i] = 0;
+            }
+        }
+        v[i * m + i] = 1;
+        g = rv1[i];
+        l = i;
+    }
+    // Disabled remainder of the routine, kept commented out with its [i][j] indexing:
+//    for (i = IMIN(m, n); i >= 1; i--) {
+//        l = i + 1;
+//        g = w[i];
+//        for (j = l; j <= n; j++) {
+//            a[i][j] = 0;
+//        }
+//        if (g) {
+//            g = 1.0 / g;
+//            for (j = l; j <= n; j++) {
+//                for (s = 0.0, k = l; k <= m; k++) {
+//                    s += a[k][i] * a[k][j];
+//                }
+//                f = (s / a[i][i]) * g;
+//                for (k = i; k <= m; k++) {
+//                    a[k][j] += f * a[k][i];
+//                }
+//            }
+//            for (j = i; j <= m; j++) {
+//                a[j][i] *= g;
+//            }
+//        } else { for (j = i; j <= m; j++) {
+//                     a[j][i] = 0.0;
+//                 }
+//        }
+//        ++a[i][i];
+//    }
+//    for (k = n; k >= 1; k--) {
+//        for (its = 1; its <= 30; its++) {
+//            flag = 1;
+//            for (l = k; l >= 1; l--) {
+//                nm = l - 1;
+//                if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) {
+//                    flag = 0;
+//                    break;
+//                }
+//                if ((int32_t) (ABS(w[nm]) + anorm) == anorm) {
+//                    break;
+//                }
+//            }
+//            if (flag) {
+//                c = 0.0;
+//                s = 1.0;
+//                for (i = l; i <= k; i++) {
+//                    f = s * rv1[i];
+//                    rv1[i] = c * rv1[i];
+//                    if ((int32_t) (ABS(f) + anorm) == anorm) {
+//                        break;
+//                    }
+//                    g = w[i];
+//                    h = pythag(f, g);
+//                    w[i] = h;
+//                    h = 1.0 / h;
+//                    c = g * h;
+//                    s = -f * h;
+//                    for (j = 1; j <= m; j++) {
+//                        y = a[j][nm];
+//                        z = a[j][i];
+//                        a[j][nm] = y * c + z * s;
+//                        a[j][i] = z * c - y * s;
+//                    }
+//                }
+//            }
+//            z = w[k];
+//            if (l == k) {
+//                if (z < 0.0) {
+//                    w[k] = -z;
+//                    for (j = 1; j <= n; j++) {
+//                        v[j][k] = -v[j][k];
+//                    }
+//                }
+//                break;
+//            }
+//            if (its == 30) {
+//                exit(1);
+//            }
+//            x = w[l];
+//            nm = k - 1;
+//            y = w[nm];
+//            g = rv1[nm];
+//            h = rv1[k];
+//            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+//            g = pythag(f, 1.0);
+//            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
+//            c = s = 1.0;
+//            for (j = l; j <= nm; j++) {
+//                i = j + 1;
+//                g = rv1[i];
+//                y = w[i];
+//                h = s * g;
+//                g = c * g;
+//                z = pythag(f, h);
+//                rv1[j] = z;
+//                c = f / z;
+//                s = h / z;
+//                f = x * c + g * s;
+//                g = g * c - x * s;
+//                h = y * s;
+//                y *= c;
+//                for (jj = 1; jj <= n; jj++) {
+//                    x = v[jj][j];
+//                    z = v[jj][i];
+//                    v[jj][j] = x * c + z * s;
+//                    v[jj][i] = z * c - x * s;
+//                }
+//                z = pythag(f, h);
+//                w[j] = z;
+//                if (z) {
+//                    z = 1.0 / z;
+//                    c = f * z;
+//                    s = h * z;
+//                }
+//                f = c * g + s * y;
+//                x = c * y - s * g;
+//                for (jj = 1; jj <= m; jj++) {
+//                    y = a[jj][j];
+//                    z = a[jj][i];
+//                    a[jj][j] = y * c + z * s;
+//                    a[jj][i] = z * c - y * s;
+//                }
+//            }
+//            rv1[l] = 0.0;
+//            rv1[k] = f;
+//            w[k] = x;
+//        }
+//    }
+}

From 77becf1e3b431ac7a93ef7e8e15f7cfc35152a0f Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 14 Jul 2022 18:52:02 +0200
Subject: [PATCH 03/22] [software] Clean Gauss Jordan inverse function

---
 .../apps/MP_matrix_inverse/initialization.h   |  34 +--
 software/apps/MP_matrix_inverse/inverse.h     | 213 ++++++------------
 software/apps/MP_matrix_inverse/main.c        |  73 +++---
 software/runtime/serial.c                     |   2 +-
 4 files changed, 118 insertions(+), 204 deletions(-)

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
index e7e834de9..9046f4be9 100644
--- a/software/apps/MP_matrix_inverse/initialization.h
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -5,41 +5,24 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores);
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id);
 
 void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                         uint32_t core_id, uint32_t num_cores);
+                         uint32_t core_id);
 
 void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
+  if(core_id == 0) {
+    for(uint32_t i = 0; i < num_columns; i++) {
+      for(uint32_t j = 0; j < num_rows; j++) {
+          matrix[j * num_rows + i] = a * (int32_t)i + b * (int32_t)j + c;
       }
     }
   }
 }
 
 void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                         uint32_t core_id, uint32_t num_cores) {
+                         uint32_t core_id) {
 
   if(core_id == 0) {
     for(uint32_t i = 0; i < num_columns; i++) {
@@ -47,7 +30,6 @@ void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_column
           matrix[j * num_rows + i] = 0;
       }
     }
-    printf("SONO QUI\n");
   }
 
 }
diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h
index 19dfc9b1e..cb98aadac 100644
--- a/software/apps/MP_matrix_inverse/inverse.h
+++ b/software/apps/MP_matrix_inverse/inverse.h
@@ -4,21 +4,30 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+#define FIXED_POINT 0 // number of fractional bits (0: operands are plain integers)
+#define FIX_DIV(a,b) ((int32_t)(((a) * (1 << FIXED_POINT)) / (b))) // a / b; scaling by multiply is defined for negative a
+#define FIX_MUL(a,b) ((int32_t)(((a) * (b)) >> FIXED_POINT)) // a * b; arguments parenthesised against precedence slips
+
+dump(prova, 1);
+
 void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n);
 
 void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n);
 
+
+
 void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n);
 
 int32_t determinant(int32_t *A, int32_t n);
 
 void adjoint(int32_t *A,int32_t *adj, int32_t n);
 
-int32_t inverse(int32_t *A, int32_t *inverse, int32_t n);
+int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n);
 
-int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n);
 
- 
+
+int GJ_inverse(int32_t *pSrc, int32_t *pDst, uint32_t n);
+
 void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) {
     for (int i = 0; i < n; i++) {
         for (int j = 0; j < n; j++) {
@@ -26,7 +35,6 @@ void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) {
         }
     }
 }
- 
 void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) {
     int k;
     for (int i = 0; i < n; i++) {
@@ -39,6 +47,8 @@ void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, i
     }
 }
 
+/* CRAMER MATRIX INVERSION */
+
 // Function to get cofactor
 void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
     int32_t i = 0, j = 0;
@@ -112,7 +122,7 @@ void adjoint(int32_t *A,int32_t *adj, int32_t n) {
  
 // Function to calculate and store inverse, returns false if
 // matrix is singular
-int32_t inverse(int32_t *A, int32_t *inverse, int32_t n) {
+int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n) {
     // Find determinant of A[][]
     int32_t det = determinant(A, n);
     if (det == 0) {
@@ -127,263 +137,178 @@ int32_t inverse(int32_t *A, int32_t *inverse, int32_t n) {
     // Find Inverse using formula "inverse(A) = adj(A)/det(A)"
     for (int32_t i = 0; i < n; i++)
         for (int32_t j = 0; j < n; j++)
-            inverse[i * n + j]= adj[i * n + j] / det;
+            inverse[i * n + j]= FIX_DIV(adj[i * n + j], det);
     return 1;
 }
 
+/* GAUSS JORDAN INVERSION */
 
-int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n) {
+int GJ_inverse(int32_t * pSrc, int32_t * pDst, uint32_t n) {
 
     int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
     int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
     int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
     int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t Xchg, in = 0, in1;                      /* Temporary input values  */
-    uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
+    int32_t Xchg, x = 0, y;                    /* Temporary input values  */
+    uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;  /* loop counters */
 
     uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
+    pDstT1 = pDst;  /* Working pointer for destination matrix */
+    rowCnt = m;     /* Loop over the number of rows */
 
-    /* Working pointer for destination matrix */
-    pDstT1 = pDst;
-
-    /* Loop over the number of rows */
-    rowCnt = m;
-
-    /* Making the destination matrix as identity matrix */
+    /* CREATE THE IDENTITY MATRIX */
     while (rowCnt > 0U) {
-        /* Writing all zeroes in lower triangle of the destination matrix */
         j = m - rowCnt;
         while (j > 0U) {
             *pDstT1++ = 0;
             j--;
         }
-
-        /* Writing all ones in the diagonal of the destination matrix */
         *pDstT1++ = 1;
-
-        /* Writing all zeroes in upper triangle of the destination matrix */
         j = rowCnt - 1U;
         while (j > 0U) {
             *pDstT1++ = 0;
             j--;
         }
-
-        /* Decrement loop counter */
         rowCnt--;
     }
 
-    /* Loop over the number of columns of the input matrix.
-       All the elements in each column are processed by the row operations */
+    /* Loop over the number of columns of the input matrix. */
     loopCnt = n;
-
     /* Index modifier to navigate through the columns */
     l = 0U;
 
     while (loopCnt > 0U) {
-        /* Check if the pivot element is zero..
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO...
          * If it is zero then interchange the row with non zero row below.
          * If there is no non zero element to replace in the rows below,
          * then the matrix is Singular. */
 
-        /* Working pointer for the input matrix that points
-         * to the pivot element of the particular row  */
         pSrcT1 = pSrc + (l * n);
-
-        /* Working pointer for the destination matrix that points
-         * to the pivot element of the particular row  */
         pDstT1 = pDst + (l * n);
-
-        /* Temporary variable to hold the pivot value */
-        in = *pSrcT1;
-
-        /* Destination pointer modifier */
+        x = *pSrcT1;
         k = 1U;
-
-        /* Check if the pivot element is zero */
-        if (*pSrcT1 == 0) {
-            /* Loop over the number rows present below */
-
+        if (x == 0) {
+            /* Loop over the rows present below */
             for (i = (l + 1U); i < m; i++) {
-                /* Update the input and destination pointers */
                 pSrcT2 = pSrcT1 + (n * i);
                 pDstT2 = pDstT1 + (n * k);
 
-                /* Check if there is a non zero pivot element to
-                 * replace in the rows below */
+                /* Check if there is a non zero pivot element to replace in the rows below */
                 if (*pSrcT2 != 0) {
-                    /* Loop over number of columns
-                     * to the right of the pilot element */
+                    /* Exchange the row elements of the input matrix at the right of the pivot */
                     j = n - l;
-
                     while (j > 0U) {
-                        /* Exchange the row elements of the input matrix */
                         Xchg = *pSrcT2;
                         *pSrcT2++ = *pSrcT1;
                         *pSrcT1++ = Xchg;
-
-                        /* Decrement the loop counter */
                         j--;
                     }
-
-                    /* Loop over number of columns of the destination matrix */
+                    /* Exchange the row elements of the destination matrix */
                     j = n;
-
                     while (j > 0U) {
-                        /* Exchange the row elements of the destination matrix */
                         Xchg = *pDstT2;
                         *pDstT2++ = *pDstT1;
                         *pDstT1++ = Xchg;
-
-                        /* Decrement loop counter */
                         j--;
                     }
-
-                    /* Flag to indicate whether exchange is done or not */
                     flag = 1U;
-
-                    /* Break after exchange is done */
                     break;
                 }
-
-                /* Update the destination pointer modifier */
                 k++;
-
-                /* Decrement loop counter */
             }
         }
-
-        /* Update the status if the matrix is singular */
-        if ((flag != 1U) && (in == 0)) {
+        /* Return when the matrix is singular */
+        if ((flag != 1U) && (x == 0)) {
             return 1;
         }
 
+        /* DIVIDE BY THE PIVOT */
+
         /* Points to the pivot row of input and destination matrices */
         pPivotRowIn = pSrc + (l * n);
         pPivotRowDst = pDst + (l * n);
-
         /* Temporary pointers to the pivot row pointers */
         pSrcT1 = pPivotRowIn;
         pSrcT2 = pPivotRowDst;
-
         /* Pivot element of the row */
-        in = *pPivotRowIn;
+        x = *pPivotRowIn;
 
-        /* Loop over number of columns
-         * to the right of the pilot element */
+        /* Loop over number of columns to the right of the pivot element */
         j = (n - l);
-
         while (j > 0U) {
-            /* Divide each element of the row of the input matrix
-             * by the pivot element */
-            in1 = *pSrcT1;
-            *pSrcT1++ = in1 / in;
-
-            /* Decrement the loop counter */
+            y = *pSrcT1;
+            *pSrcT1++ = FIX_DIV(y, x);
             j--;
         }
-
         /* Loop over number of columns of the destination matrix */
         j = n;
-
         while (j > 0U) {
-            /* Divide each element of the row of the destination matrix
-             * by the pivot element */
-            in1 = *pSrcT2;
-            *pSrcT2++ = in1 / in;
-
-            /* Decrement the loop counter */
+            y = *pSrcT2;
+            *pSrcT2++ = FIX_DIV(y, x);
             j--;
         }
 
+        /* SUM THE MULTIPLE OF A BOTTOM ROW */
         /* Replace the rows with the sum of that row and a multiple of row i
          * so that each new element in column i above row i is zero.*/
-
         /* Temporary pointers for input and destination matrices */
+
         pSrcT1 = pSrc;
         pSrcT2 = pDst;
 
-        /* index used to check for pivot element */
-        i = 0U;
-
-        /* Loop over number of rows */
-        /*  to be replaced by the sum of that row and a multiple of row i */
-        k = m;
-
+        i = 0U; /* pivot index */
+        k = m; /* row index */
         while (k > 0U) {
-            /* Check for the pivot element */
+
+            /* Only the columns to the right of the pivot are to be processed */
             if (i == l) {
-                /* If the processing element is the pivot element,
-                   only the columns to the right are to be processed */
                 pSrcT1 += n - l;
-
                 pSrcT2 += n;
+
             } else {
-                /* Element of the reference row */
-                in = *pSrcT1;
 
-                /* Working pointers for input and destination pivot rows */
+                /* Element of the reference row */
+                x = *pSrcT1;
+                /* Reference row pointers */
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
 
-                /* Loop over the number of columns to the right of the pivot element,
-                   to replace the elements in the input matrix */
-                j = (n - l);
-
+                j = (n - l); /* Replace the elements to the right of the pivot */
                 while (j > 0U) {
-                    /* Replace the element by the sum of that row
-                       and a multiple of the reference row  */
-                    in1 = *pSrcT1;
-                    *pSrcT1++ = in1 - (in * *pPRT_in++);
-
-                    /* Decrement the loop counter */
+                    y = *pSrcT1;
+                    *pSrcT1++ = y - FIX_MUL(x, *pPRT_in++);
                     j--;
                 }
-
-                /* Loop over the number of columns to
-                   replace the elements in the destination matrix */
-                j = n;
-
+                j = n; /* Replace the elements in the destination matrix */
                 while (j > 0U) {
-                    /* Replace the element by the sum of that row
-                       and a multiple of the reference row  */
-                    in1 = *pSrcT2;
-                    *pSrcT2++ = in1 - (in * *pPRT_pDst++);
-
-                    /* Decrement loop counter */
+                    y = *pSrcT2;
+                    *pSrcT2++ = y - FIX_MUL(x, *pPRT_pDst++);
                     j--;
                 }
             }
-
             /* Increment temporary input pointer */
             pSrcT1 = pSrcT1 + l;
-
             /* Decrement loop counter */
             k--;
-
             /* Increment pivot index */
             i++;
         }
 
-        /* Increment the input pointer */
-        pSrc++;
-
-        /* Decrement the loop counter */
-        loopCnt--;
-
-        /* Increment the index modifier */
-        l++;
+        pSrc++; /* Increment the input pointer */
+        loopCnt--; /* Decrement the loop counter */
+        l++; /* Increment the index modifier */
     }
 
-    if ((flag != 1U) && (in == 0)) {
-        for (i = 0; i < m * n; i++) {
-            if (pSrc[i] != 0)
-                break;
-        }
-
-        if (i == m * n)
-            return 1;
-    }
+//    if ((flag != 1U) && (x == 0)) {
+//        for (i = 0; i < m * n; i++) {
+//            if (pSrc[i] != 0)
+//                break;
+//        }
+//        if (i == m * n)
+//            return 1;
+//    }
 
     return 0;
 }
diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c
index 587ee06b0..f1f771d89 100644
--- a/software/apps/MP_matrix_inverse/main.c
+++ b/software/apps/MP_matrix_inverse/main.c
@@ -7,7 +7,7 @@
 //#include <stdint.h>
 //#include <string.h>
 
-#define N 5
+#define N 6
 
 #include "encoding.h"
 #include "printf.h"
@@ -17,18 +17,18 @@
 #include "inverse.h"
 #include "initialization.h"
 
-// C++ program to find Moore-Penrose inverse  matrix
- 
+#define GAUSS_JORDAN
+// #define CRAMER
+
 // Generic function to display the matrix. We use it to display
 // both adjoin and inverse. adjoin is integer matrix and inverse
 // is a int32_t.
-void display(int32_t *A, int32_t n)
-{
-    for (int i = 0; i < n; i++)
-    {
-        for (int j = 0; j < n; j++)
-            printf("%4d ", A[i * n + j]);
-        printf("\n");
+void display(int32_t *A, int32_t n) {
+    int32_t volatile i = 0;
+    while (i < n * n) {
+        // printf("ciao mamma\n");
+        printf("Value %d: %d\n", i, A[i]);
+        i++;
     }
 }
 
@@ -41,40 +41,47 @@ int main()
     // Initialize barrier and synchronize
     mempool_barrier_init(core_id);
 
-    int32_t matrix[N * N] = {  -2, 2, 7, 9, 4, 0, 8,
-                                1, 0, 0, 3, 1, 0, 9,
-                               -3, 1, 5, 0, 2, 1, 7,
-                                3,-1,-9, 4, 6, 5, 2,
-                                1, 0, 4, 4, 1, 0, 9,
-                                8, 0, 3, 8, 6, 5, 2,
-                                5, 6, 4, 1, 3, 2, 0  };
+//    int32_t matrix[N * N] = {  -2, 2, 7, 9, 4, 0, 8,
+//                                1, 0, 0, 3, 1, 0, 9,
+//                               -3, 1, 5, 0, 2, 1, 7,
+//                                3,-1,-9, 4, 6, 5, 2,
+//                                1, 0, 4, 4, 1, 0, 9,
+//                                8, 0, 3, 8, 6, 5, 2,
+//                                5, 6, 4, 1, 3, 2, 0  };
 
     int32_t t_matrix[N * N];
     int32_t matrix_mult[N * N];
     int32_t pseudoinverse[N * N];
     int32_t inv[N * N]; // To store inverse 
 
-//    init_matrix_zeros(t_matrix, N, N, core_id, num_cores);
-//    init_matrix_zeros(matrix_mult, N, N, core_id, num_cores);
-//    init_matrix_zeros(pseudoinverse, N, N, core_id, num_cores);
-//    init_matrix_zeros(adj, N, N, core_id, num_cores);
-//    init_matrix_zeros(inv, N, N, core_id, num_cores);
-    if(core_id == 0)
-      display(matrix, N);
+    int32_t matrix[N * N];
+    init_matrix(matrix, N, N, -125, 2423, -1294, core_id);
+    init_matrix_zeros(t_matrix, N, N, core_id);
+    init_matrix_zeros(matrix_mult, N, N, core_id);
+    init_matrix_zeros(pseudoinverse, N, N, core_id);
+    init_matrix_zeros(inv, N, N, core_id);
 
     if(core_id == 0) {
+
+      //display(matrix, N);
       Transpose(matrix, t_matrix, N);
-      printf("\nThe Transpose is :\n");
-      display(t_matrix, N);
-      printf("The product of the matrix is: \n");
+      //printf("\nThe Transpose is :\n");
+      //display(t_matrix, N);
       MatrixMult(t_matrix,matrix,matrix_mult, N);
-      display(matrix_mult, N);
-      printf("\nThe Inverse is :\n");
-      if (inverse(matrix_mult, inv, N))
-          display(inv, N);
+      //printf("The product of the matrix is: \n");
+      //display(matrix_mult, N);
+      //printf("\nThe Inverse is :\n");
+      #if defined(CRAMER)
+        if (C_inverse(matrix_mult, inv, N))
+            //display(inv, N);
+      #elif defined(GAUSS_JORDAN)
+        GJ_inverse(matrix_mult, inv, N);
+        //display(inv, N);
+      #endif
       MatrixMult(inv,t_matrix,pseudoinverse, N);
-      printf("\nThe Monroe-penrose inverse is :\n");
-      display(pseudoinverse, N);
+      //printf("\nThe Moore-Penrose inverse is :\n");
+      //display(pseudoinverse, N);
+
     }
 
     mempool_barrier(num_cores);
diff --git a/software/runtime/serial.c b/software/runtime/serial.c
index a53ec2e1f..44aa30fe2 100644
--- a/software/runtime/serial.c
+++ b/software/runtime/serial.c
@@ -4,7 +4,7 @@
 
 #include <stdint.h>
 
-extern char fake_uart;
+extern volatile char fake_uart;
 
 void _putchar(char character) {
   // send char to console

From 157790692503799418e16c14bec49e8a61091a86 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Mon, 18 Jul 2022 15:30:26 +0200
Subject: [PATCH 04/22] [software] Allow use of input N by M rectangular
 matrices

---
 .../apps/MP_matrix_inverse/initialization.h   |  17 ++-
 software/apps/MP_matrix_inverse/inverse.h     |  58 +++++-----
 software/apps/MP_matrix_inverse/main.c        | 105 ++++++++++--------
 3 files changed, 99 insertions(+), 81 deletions(-)

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
index 9046f4be9..2ba789234 100644
--- a/software/apps/MP_matrix_inverse/initialization.h
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -4,30 +4,29 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
                  int32_t a, int32_t b, int32_t c, uint32_t core_id);
 
-void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
                          uint32_t core_id);
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
                  int32_t a, int32_t b, int32_t c, uint32_t core_id) {
   if(core_id == 0) {
-    for(uint32_t i = 0; i < num_columns; i++) {
-      for(uint32_t j = 0; j < num_rows; j++) {
-          matrix[j * num_rows + i] = a * (int32_t)i + b * (int32_t)j + c;
+    for(uint32_t j = 0; j < num_rows; j++) {
+      for(uint32_t i = 0; i < num_columns; i++) {
+          matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c;
       }
     }
   }
 }
 
-void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
                          uint32_t core_id) {
-
   if(core_id == 0) {
     for(uint32_t i = 0; i < num_columns; i++) {
       for(uint32_t j = 0; j < num_rows; j++) {
-          matrix[j * num_rows + i] = 0;
+          matrix[j * num_columns + i] = 0;
       }
     }
   }
diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h
index cb98aadac..4d230c422 100644
--- a/software/apps/MP_matrix_inverse/inverse.h
+++ b/software/apps/MP_matrix_inverse/inverse.h
@@ -4,53 +4,55 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
-#define FIXED_POINT 0
-#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT)/b))
-#define FIX_MUL(a,b) ((int32_t)((a*b) >> FIXED_POINT))
+#define FIXED_POINT 16
+#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b))
+#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
 
 dump(prova, 1);
 
-void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n);
+void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m);
 
-void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n);
+void MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o);
 
 
 
-void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n);
+void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n);
 
-int32_t determinant(int32_t *A, int32_t n);
+int32_t determinant(volatile int32_t *A, int32_t n);
 
-void adjoint(int32_t *A,int32_t *adj, int32_t n);
+void adjoint(volatile int32_t *A,int32_t *adj, int32_t n);
 
-int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n);
+int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n);
 
 
 
-int GJ_inverse(int32_t *pSrc, int32_t *pDst, uint32_t n);
+int GJ_inverse(volatile int32_t *pSrc, volatile int32_t *pDst, uint32_t n);
 
-void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) {
-    for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) {
-             t_matrix[j * n + i]=matrix[i * n + j];
+void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m) {
+  int32_t i, j;
+    for (i = 0; i < n; i++) {
+        for (j = 0; j < m; j++) {
+             t_matrix[j * n + i] = matrix[i * m + j];
         }
     }
 }
-void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) {
-    int k;
-    for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) {             // not j<M
-            matrix_product[i * n + j] = 0;
-            for (k = 0; k < n; k++) {
-                matrix_product[i * n + j] += matrix_1[i * n + k] * matrix_2[k * n + j];
-            }
-        }
+void MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o) {
+  int32_t i, j, k;
+  for (i = 0; i < n; i++) {
+      for (j = 0; j < o; j++) {
+        matrix_product[i * o + j] = 0;
+        for (k = 0; k < m; k++) {
+          matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
+      }
     }
+  }
+
 }
 
 /* CRAMER MATRIX INVERSION */
 
 // Function to get cofactor
-void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
+void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
     int32_t i = 0, j = 0;
     // Looping for each element of the matrix
     for (int32_t row = 0; row < n; row++) {
@@ -71,7 +73,7 @@ void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
 }
  
 // Recursive function for finding determinant of matrix.
-int32_t determinant(int32_t *A, int32_t n) {
+int32_t determinant(volatile int32_t *A, int32_t n) {
 
     int32_t D = 0; // Initialize result
     // Base case : if matrix contains single element
@@ -98,7 +100,7 @@ int32_t determinant(int32_t *A, int32_t n) {
 }
  
 // Function to get adjoint
-void adjoint(int32_t *A,int32_t *adj, int32_t n) {
+void adjoint(volatile int32_t *A,int32_t *adj, int32_t n) {
     if (n == 1) {
         adj[0] = 1;
         return;
@@ -122,7 +124,7 @@ void adjoint(int32_t *A,int32_t *adj, int32_t n) {
  
 // Function to calculate and store inverse, returns false if
 // matrix is singular
-int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n) {
+int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n) {
     // Find determinant of A[][]
     int32_t det = determinant(A, n);
     if (det == 0) {
@@ -143,7 +145,7 @@ int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n) {
 
 /* GAUSS JORDAN INVERSION */
 
-int GJ_inverse(int32_t * pSrc, int32_t * pDst, uint32_t n) {
+int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
 
     int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
     int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c
index f1f771d89..bb076ecd2 100644
--- a/software/apps/MP_matrix_inverse/main.c
+++ b/software/apps/MP_matrix_inverse/main.c
@@ -7,7 +7,9 @@
 //#include <stdint.h>
 //#include <string.h>
 
-#define N 6
+#define N 32
+#define M 4
+#define O 4
 
 #include "encoding.h"
 #include "printf.h"
@@ -19,19 +21,33 @@
 
 #define GAUSS_JORDAN
 // #define CRAMER
+// #define VERBOSE
 
 // Generic function to display the matrix. We use it to display
 // both adjoin and inverse. adjoin is integer matrix and inverse
 // is a int32_t.
-void display(int32_t *A, int32_t n) {
-    int32_t volatile i = 0;
-    while (i < n * n) {
-        // printf("ciao mamma\n");
-        printf("Value %d: %d\n", i, A[i]);
-        i++;
+void display(volatile int32_t *A, int32_t n, int32_t m) {
+    //int32_t volatile i = 0;
+    //while (i < n * m) {
+    //    // printf("ciao mamma\n");
+    //    printf("Value %d: %d\n", i, A[i]);
+    //    i++;
+    //}
+    int32_t volatile i, j;
+    for (i = 0; i < n; i++) {
+      for (j = 0; j < m; j++) {
+        printf("%5d ", A[i * m + j]);
+      }
+      printf("\n");
     }
 }
 
+volatile int32_t matrix[N * M];
+volatile int32_t t_matrix[M * N];
+volatile int32_t matrix_mult[M * M];
+volatile int32_t inv[M * M]; // To store inverse
+volatile int32_t pseudoinverse[M * N];
+
 // Driver program
 int main()
 {
@@ -41,47 +57,48 @@ int main()
     // Initialize barrier and synchronize
     mempool_barrier_init(core_id);
 
-//    int32_t matrix[N * N] = {  -2, 2, 7, 9, 4, 0, 8,
-//                                1, 0, 0, 3, 1, 0, 9,
-//                               -3, 1, 5, 0, 2, 1, 7,
-//                                3,-1,-9, 4, 6, 5, 2,
-//                                1, 0, 4, 4, 1, 0, 9,
-//                                8, 0, 3, 8, 6, 5, 2,
-//                                5, 6, 4, 1, 3, 2, 0  };
-
-    int32_t t_matrix[N * N];
-    int32_t matrix_mult[N * N];
-    int32_t pseudoinverse[N * N];
-    int32_t inv[N * N]; // To store inverse 
-
-    int32_t matrix[N * N];
-    init_matrix(matrix, N, N, -125, 2423, -1294, core_id);
-    init_matrix_zeros(t_matrix, N, N, core_id);
-    init_matrix_zeros(matrix_mult, N, N, core_id);
-    init_matrix_zeros(pseudoinverse, N, N, core_id);
-    init_matrix_zeros(inv, N, N, core_id);
+    init_matrix(matrix,N, M, -156, 427, -219, core_id);
+    init_matrix_zeros(t_matrix, M, N, core_id);
+    init_matrix_zeros(matrix_mult, M, M, core_id);
+    init_matrix_zeros(inv, M, M, core_id);
+    init_matrix_zeros(pseudoinverse, M, N, core_id);
 
     if(core_id == 0) {
 
-      //display(matrix, N);
-      Transpose(matrix, t_matrix, N);
-      //printf("\nThe Transpose is :\n");
-      //display(t_matrix, N);
-      MatrixMult(t_matrix,matrix,matrix_mult, N);
-      //printf("The product of the matrix is: \n");
-      //display(matrix_mult, N);
-      //printf("\nThe Inverse is :\n");
-      #if defined(CRAMER)
-        if (C_inverse(matrix_mult, inv, N))
-            //display(inv, N);
-      #elif defined(GAUSS_JORDAN)
-        GJ_inverse(matrix_mult, inv, N);
-        //display(inv, N);
+      #if defined(VERBOSE)
+          display(matrix, N, M);
+          Transpose(matrix, t_matrix, N,  M);
+          printf("\nThe Transpose is :\n");
+          display(t_matrix, M, N);
+          MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
+          printf("The product of the matrix is: \n");
+          display(matrix_mult, M, M);
+          printf("\nThe Inverse is :\n");
+          #if defined(CRAMER)
+            if (C_inverse(matrix_mult, inv, N));
+                display(inv, N, N);
+          #elif defined(GAUSS_JORDAN)
+            GJ_inverse(matrix_mult, inv, N);
+            display(inv, N, N);
+          #endif
+          MatrixMult(t_matrix, inv, pseudoinverse, M, N, N);
+          printf("\nThe Moore-Penrose inverse is :\n");
+          display(pseudoinverse, M, N);
+      #else
+          Transpose(matrix, t_matrix, N, M);
+          MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
+          mempool_start_benchmark();
+          #if defined(CRAMER)
+          C_inverse(matrix_mult, inv, M);
+          #elif defined(GAUSS_JORDAN)
+          GJ_inverse(matrix_mult, inv, M);
+          #endif
+          mempool_stop_benchmark();
+          MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
+
+          MatrixMult(pseudoinverse, matrix, inv, M, N, M);
+          display(inv, M, M);
       #endif
-      MatrixMult(inv,t_matrix,pseudoinverse, N);
-      //printf("\nThe Moore-Penrose inverse is :\n");
-      //display(pseudoinverse, N);
-
     }
 
     mempool_barrier(num_cores);

From 21ded46cf92d217f1dfd432c190480979f2bb59c Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 19 Jul 2022 08:51:09 +0200
Subject: [PATCH 05/22] [software] Parallelize Gauss-Jordan matrix inversion

---
 .../apps/MP_matrix_inverse/initialization.h   |  47 ++++-
 software/apps/MP_matrix_inverse/main.c        | 106 +++++++----
 .../MP_matrix_inverse/mempool_mat_inv_q16p.h  | 166 ++++++++++++++++
 .../{inverse.h => mempool_mat_inv_q16s.h}     | 178 +++---------------
 4 files changed, 292 insertions(+), 205 deletions(-)
 create mode 100644 software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
 rename software/apps/MP_matrix_inverse/{inverse.h => mempool_mat_inv_q16s.h} (51%)

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
index 2ba789234..1d9d6396f 100644
--- a/software/apps/MP_matrix_inverse/initialization.h
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -4,14 +4,44 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
-void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id);
+#define FIXED_POINT 16
+#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b))
+#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
 
-void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
-                         uint32_t core_id);
+dump(flag, 1);
 
-void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
+
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
+
+void MatrixMult(int32_t *matrix_1,  int32_t *matrix_2,  int32_t *matrix_product, int32_t n, int32_t m, int32_t o);
+
+void init_matrix(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id);
+
+void init_matrix_zeros(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id);
+
+
+void Transpose(int32_t *matrix,  int32_t *t_matrix, int32_t n, int32_t m) {
+  int32_t i, j;
+  for (i = 0; i < n; i++) {
+      for (j = 0; j < m; j++) {
+          t_matrix[j * n + i] = matrix[i * m + j];
+      }
+  }
+}
+
+void MatrixMult(int32_t *matrix_1,  int32_t *matrix_2,  int32_t *matrix_product, int32_t n, int32_t m, int32_t o) {
+  int32_t i, j, k;
+  for (i = 0; i < n; i++) {
+      for (j = 0; j < o; j++) {
+        matrix_product[i * o + j] = 0;
+        for (k = 0; k < m; k++) {
+          matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
+      }
+    }
+  }
+}
+
+void init_matrix(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id) {
   if(core_id == 0) {
     for(uint32_t j = 0; j < num_rows; j++) {
       for(uint32_t i = 0; i < num_columns; i++) {
@@ -21,8 +51,8 @@ void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_colum
   }
 }
 
-void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns,
-                         uint32_t core_id) {
+
+void init_matrix_zeros (int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id) {
   if(core_id == 0) {
     for(uint32_t i = 0; i < num_columns; i++) {
       for(uint32_t j = 0; j < num_rows; j++) {
@@ -30,5 +60,4 @@ void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t n
       }
     }
   }
-
 }
diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c
index bb076ecd2..4cb660e23 100644
--- a/software/apps/MP_matrix_inverse/main.c
+++ b/software/apps/MP_matrix_inverse/main.c
@@ -7,7 +7,7 @@
 //#include <stdint.h>
 //#include <string.h>
 
-#define N 32
+#define N 4
 #define M 4
 #define O 4
 
@@ -16,24 +16,29 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "inverse.h"
 #include "initialization.h"
+#include "mempool_mat_inv_q16s.h"
+#include "mempool_mat_inv_q16p.h"
 
-#define GAUSS_JORDAN
-// #define CRAMER
 // #define VERBOSE
+// #define SINGLE
+#define PARALLEL
 
-// Generic function to display the matrix. We use it to display
-// both adjoin and inverse. adjoin is integer matrix and inverse
-// is a int32_t.
-void display(volatile int32_t *A, int32_t n, int32_t m) {
+int32_t matrix[N * M]         __attribute__((section(".l1")));
+int32_t t_matrix[M * N]       __attribute__((section(".l1")));
+int32_t matrix_mult[M * M]    __attribute__((section(".l1")));
+int32_t inv[M * M]            __attribute__((section(".l1")));
+int32_t pseudoinverse[M * N]  __attribute__((section(".l1")));
+uint32_t flag                __attribute__((section(".l1")));
+
+void display(int32_t *A, int32_t n, int32_t m) {
     //int32_t volatile i = 0;
     //while (i < n * m) {
     //    // printf("ciao mamma\n");
     //    printf("Value %d: %d\n", i, A[i]);
     //    i++;
     //}
-    int32_t volatile i, j;
+    int32_t i, j;
     for (i = 0; i < n; i++) {
       for (j = 0; j < m; j++) {
         printf("%5d ", A[i * m + j]);
@@ -42,14 +47,8 @@ void display(volatile int32_t *A, int32_t n, int32_t m) {
     }
 }
 
-volatile int32_t matrix[N * M];
-volatile int32_t t_matrix[M * N];
-volatile int32_t matrix_mult[M * M];
-volatile int32_t inv[M * M]; // To store inverse
-volatile int32_t pseudoinverse[M * N];
-
 // Driver program
-int main()
+void single_core()
 {
 
     uint32_t core_id = mempool_get_core_id();
@@ -57,14 +56,14 @@ int main()
     // Initialize barrier and synchronize
     mempool_barrier_init(core_id);
 
-    init_matrix(matrix,N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(t_matrix, M, N, core_id);
-    init_matrix_zeros(matrix_mult, M, M, core_id);
-    init_matrix_zeros(inv, M, M, core_id);
-    init_matrix_zeros(pseudoinverse, M, N, core_id);
+    init_matrix(matrix, N, M, -156, 427, -219, core_id);
+    //init_matrix_zeros(t_matrix, M, N, core_id);
+    //init_matrix_zeros(matrix_mult, M, M, core_id);
+    //init_matrix_zeros(inv, M, M, core_id);
+    //init_matrix_zeros(pseudoinverse, M, N, core_id);
+    mempool_barrier(num_cores);
 
     if(core_id == 0) {
-
       #if defined(VERBOSE)
           display(matrix, N, M);
           Transpose(matrix, t_matrix, N,  M);
@@ -74,33 +73,62 @@ int main()
           printf("The product of the matrix is: \n");
           display(matrix_mult, M, M);
           printf("\nThe Inverse is :\n");
-          #if defined(CRAMER)
-            if (C_inverse(matrix_mult, inv, N));
-                display(inv, N, N);
-          #elif defined(GAUSS_JORDAN)
-            GJ_inverse(matrix_mult, inv, N);
-            display(inv, N, N);
-          #endif
+          mempool_mat_inv_q16s(matrix_mult, inv, N);
+          display(inv, N, N);
           MatrixMult(t_matrix, inv, pseudoinverse, M, N, N);
           printf("\nThe Moore-Penrose inverse is :\n");
           display(pseudoinverse, M, N);
       #else
-          Transpose(matrix, t_matrix, N, M);
-          MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
+          //Transpose(matrix, t_matrix, N, M);
+          //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
+
           mempool_start_benchmark();
-          #if defined(CRAMER)
-          C_inverse(matrix_mult, inv, M);
-          #elif defined(GAUSS_JORDAN)
-          GJ_inverse(matrix_mult, inv, M);
-          #endif
+          mempool_GJinv_q16s(matrix, inv, M);
           mempool_stop_benchmark();
-          MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
 
-          MatrixMult(pseudoinverse, matrix, inv, M, N, M);
-          display(inv, M, M);
+          //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
+          //MatrixMult(pseudoinverse, matrix, inv, M, N, M);
       #endif
     }
+    mempool_barrier(num_cores);
+}
+
+void multi_core()
+{
 
+    uint32_t core_id = mempool_get_core_id();
+    uint32_t num_cores = mempool_get_core_count();
+    // Initialize barrier and synchronize
+    mempool_barrier_init(core_id);
+
+    init_matrix(matrix, N, M, -156, 427, -219, core_id);
+    if (core_id == 0) {
+        flag = 0U;
+    }
+    //init_matrix_zeros(t_matrix, M, N, core_id);
+    //init_matrix_zeros(matrix_mult, M, M, core_id);
+    //init_matrix_zeros(inv, M, M, core_id);
+    //init_matrix_zeros(pseudoinverse, M, N, core_id);
     mempool_barrier(num_cores);
+
+    //Transpose(matrix, t_matrix, N, M);
+    //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
+
+    mempool_start_benchmark();
+    mempool_GJinv_q16p(matrix, inv, M, &flag);
+    mempool_stop_benchmark();
+
+    //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
+    //MatrixMult(pseudoinverse, matrix, inv, M, N, M);
+
+    mempool_barrier(num_cores);
+}
+
+int main() {
+    #if defined(SINGLE)
+    single_core();
+    #elif defined(PARALLEL)
+    multi_core();
+    #endif
     return 0;
 }
diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
new file mode 100644
index 000000000..c40bafe5b
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
@@ -0,0 +1,166 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* GAUSS JORDAN INVERSION */
+
+int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
+
+int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
+
+    int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
+    int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
+    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
+    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+    int32_t Xchg, in = 0, in1;                    /* Temporary input values  */
+
+    uint32_t core_id = mempool_get_core_id();
+    uint32_t i, j, loopCnt, k, l;  /* loop counters */
+    uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */
+
+    /* CREATE THE IDENTITY MATRIX */
+
+    pDstT1 = pDst;  /* Working pointer for destination matrix */
+    for (i = core_id; i < m; i += NUM_CORES) {
+        for (j = 0; j < m; j++) {
+            pDstT1[i * m + j] = (uint32_t) (i == j);
+        }
+    }
+    mempool_barrier(NUM_CORES);
+
+    /* Loop over the number of columns of the input matrix. */
+    loopCnt = n;
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U) {
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO */
+
+        pSrcT1 = pSrc + (l * n);
+        pDstT1 = pDst + (l * n);
+
+        in = *pSrcT1;
+        k = 1U;
+        /* Check if the pivot element is zero */
+        if (*pSrcT1 == 0U) {
+
+            /* Loop over the number of rows present below */
+            for (i = (l + 1U) + core_id; i < m; i += NUM_CORES) {
+                pSrcT2 = pSrcT1 + (n * i);
+                /* Check if there is element to exchange */
+                //if (*flag != 0U)
+                //    break;
+                if (*pSrcT2 != 0U)  {
+                    __atomic_fetch_add(flag, k, __ATOMIC_RELAXED);
+                }
+            }
+            mempool_barrier(NUM_CORES);
+
+            if (*flag != 0U) {
+                pSrcT2 = pSrcT1 + (n * *flag + l);
+                pDstT2 = pDstT1 + (n * *flag);
+                /* Loop over number of columns
+                 * to the right of the pivot element */
+                for (j = core_id; j < n - l; j += NUM_CORES) {
+                    /* Exchange the row elements of the input matrix */
+                    Xchg = pSrcT2[j];
+                    pSrcT2[j] = pSrcT1[j];
+                    pSrcT1[j] = Xchg;
+                }
+                pSrcT1 += n - l;
+                pSrcT2 += n - l;
+                /* Loop over number of columns of the destination matrix */
+                for(j = core_id; j < n; j += NUM_CORES) {
+                    /* Exchange the row elements of the destination matrix */
+                    Xchg = pDstT2[j];
+                    pDstT2[j] = pDstT1[j];
+                    pDstT1[j] = Xchg;
+                }
+                pDstT2 += n;
+                pDstT1 += n;
+            }
+            k++;
+            mempool_barrier(NUM_CORES);
+        }
+
+        /* Update the status if the matrix is singular */
+        if ((*flag == 0U) && (in == 0U)) {
+            return 1;
+        }
+
+        /* DIVIDE BY THE PIVOT */
+
+        /* Points to the pivot row of input and destination matrices */
+        pPivotRowIn = pSrc + (l * n);
+        pPivotRowDst = pDst + (l * n);
+        /* Temporary pointers to the pivot row pointers */
+        pSrcT1 = pPivotRowIn;
+        pSrcT2 = pPivotRowDst;
+        /* Pivot element of the row */
+        in = *pPivotRowIn;
+        /* Loop over number of columns to the right of the pivot element */
+        for(j = core_id; j < n - l; j += NUM_CORES) {
+            in1 = pSrcT1[j];
+            pSrcT1[j] = FIX_DIV(in1, in);
+        }
+        /* Loop over number of columns of the destination matrix */
+        for(j = core_id; j < n; j += NUM_CORES) {
+            in1 = pSrcT2[j];
+            pSrcT2[j] = FIX_DIV(in1, in);
+        }
+        mempool_barrier(NUM_CORES);
+
+        /*REPLACE ROWS */
+
+        pSrcT1 = pSrc + core_id * n;
+        pSrcT2 = pDst + core_id * n;
+        i = core_id;
+        k = m;
+        for(k = core_id; k < m; k += NUM_CORES) {
+            if (i != l) {
+                /* Element of the reference row */
+                in = *pSrcT1;
+                /* Working pointers for input and destination pivot rows */
+                pPRT_in = pPivotRowIn;
+                pPRT_pDst = pPivotRowDst;
+                /* Loop over the number of columns to the right of the pivot element,
+                   to replace the elements in the input matrix */
+                for (j = 0; j < n - l; j++) {
+                    in1 = pSrcT1[j];
+                    pSrcT1[j] = in1 - FIX_MUL(in, pPRT_in[j]);
+                }
+                /* Loop over the number of columns to
+                   replace the elements in the destination matrix */
+                for (j = 0; j < n; j++) {
+                    in1 = pSrcT2[j];
+                    pSrcT2[j] = in1 - FIX_MUL(in, pPRT_pDst[j]);
+                }
+            }
+            i += NUM_CORES;
+            pSrcT1 += NUM_CORES * n;
+            pSrcT2 += NUM_CORES * n;
+        }
+        /* Increment the input pointer */
+        pSrc++;
+        /* Decrement the loop counter */
+        loopCnt--;
+        /* Increment the index modifier */
+        l++;
+        mempool_barrier(NUM_CORES);
+    }
+
+//    if ((flag != 1U) && (x == 0)) {
+//        for (i = 0; i < m * n; i++) {
+//            if (pSrc[i] != 0)
+//                break;
+//        }
+//        if (i == m * n)
+//            return 1;
+//    }
+
+    return 0;
+}
diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
similarity index 51%
rename from software/apps/MP_matrix_inverse/inverse.h
rename to software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
index 4d230c422..9f2201224 100644
--- a/software/apps/MP_matrix_inverse/inverse.h
+++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
@@ -4,160 +4,25 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
-#define FIXED_POINT 16
-#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b))
-#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
-
-dump(prova, 1);
-
-void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m);
-
-void MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o);
-
-
-
-void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n);
-
-int32_t determinant(volatile int32_t *A, int32_t n);
-
-void adjoint(volatile int32_t *A,int32_t *adj, int32_t n);
-
-int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n);
-
-
-
-int GJ_inverse(volatile int32_t *pSrc, volatile int32_t *pDst, uint32_t n);
-
-void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m) {
-  int32_t i, j;
-    for (i = 0; i < n; i++) {
-        for (j = 0; j < m; j++) {
-             t_matrix[j * n + i] = matrix[i * m + j];
-        }
-    }
-}
-void MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o) {
-  int32_t i, j, k;
-  for (i = 0; i < n; i++) {
-      for (j = 0; j < o; j++) {
-        matrix_product[i * o + j] = 0;
-        for (k = 0; k < m; k++) {
-          matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
-      }
-    }
-  }
-
-}
-
-/* CRAMER MATRIX INVERSION */
-
-// Function to get cofactor
-void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) {
-    int32_t i = 0, j = 0;
-    // Looping for each element of the matrix
-    for (int32_t row = 0; row < n; row++) {
-      for (int32_t col = 0; col < n; col++) {
-        // Copying into temporary matrix only those element
-        // which are not in given row and column
-        if (row != p && col != q) {
-          temp[i * N + j++] = A[row * N + col];
-          // Row is filled, so increase row index and
-          // reset col index
-          if (j == n - 1) {
-              j = 0;
-              i++;
-          }
-        }
-      }
-    }
-}
- 
-// Recursive function for finding determinant of matrix.
-int32_t determinant(volatile int32_t *A, int32_t n) {
-
-    int32_t D = 0; // Initialize result
-    // Base case : if matrix contains single element
-    if (n == 1)
-        return A[0];
- 
-    int32_t temp[N * N]; // To store cofactors
-    for(int32_t i =0; i < N*N; i++)
-      temp[i] = 0;
-
-    int32_t sign = 1; // To store sign multiplier
-    // Iterate for each element of first row
-    for (int32_t f = 0; f < n; f++) {
-
-        // Getting Cofactor of A[0][f]
-        getCofactor(A, temp, 0, f, n);
-
-        D += sign * A[0 * N + f] * determinant(temp, n - 1);
-        // terms are to be added with alternate sign
-        sign = -sign;
-    }
-
-    return D;
-}
- 
-// Function to get adjoint
-void adjoint(volatile int32_t *A,int32_t *adj, int32_t n) {
-    if (n == 1) {
-        adj[0] = 1;
-        return;
-    }
-    // temp is used to store cofactors 
-    int32_t sign = 1;
-    int32_t temp[N * N];
-    for (int32_t i = 0; i < N; i++) {
-        for (int32_t j = 0; j < N; j++) {
-            // Get cofactor
-            getCofactor(A, temp, i, j, N);
-            // sign of adj positive if sum of row
-            // and column indexes is even.
-            sign = ((i + j) % 2 == 0) ? 1 : -1;
-            // Interchanging rows and columns to get the
-            // transpose of the cofactor matrix
-            adj[j * N + i] = (sign)*(determinant(temp, N - 1));
-        }
-    }
-}
- 
-// Function to calculate and store inverse, returns false if
-// matrix is singular
-int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n) {
-    // Find determinant of A[][]
-    int32_t det = determinant(A, n);
-    if (det == 0) {
-        printf("Singular matrix, can't find its inverse\n");
-        return 0;
-    }
- 
-    // Find adjoint
-    int32_t adj[n * n];
-    adjoint(A, adj, n);
- 
-    // Find Inverse using formula "inverse(A) = adj(A)/det(A)"
-    for (int32_t i = 0; i < n; i++)
-        for (int32_t j = 0; j < n; j++)
-            inverse[i * n + j]= FIX_DIV(adj[i * n + j], det);
-    return 1;
-}
-
 /* GAUSS JORDAN INVERSION */
 
-int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
+int mempool_GJinv_q16s(int32_t *pSrc, int32_t *pDst, uint32_t n);
+
+int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
 
     int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
     int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
     int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
     int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t Xchg, x = 0, y;                    /* Temporary input values  */
-    uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;  /* loop counters */
+    int32_t Xchg, in = 0, in1;                   /* Temporary input values  */
+    uint32_t i, rowCnt, j, loopCnt, k, l;        /* loop counters */
+    uint32_t flag;
 
     uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
     pDstT1 = pDst;  /* Working pointer for destination matrix */
     rowCnt = m;     /* Loop over the number of rows */
+    flag = 0U;
 
     /* CREATE THE IDENTITY MATRIX */
     while (rowCnt > 0U) {
@@ -181,7 +46,6 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
     l = 0U;
 
     while (loopCnt > 0U) {
-
         /* CHECK IF PIVOT ELEMENT IS ZERO...
          * If it is zero then interchange the row with non zero row below.
          * If there is no non zero element to replace in the rows below,
@@ -189,9 +53,8 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
 
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
-        x = *pSrcT1;
         k = 1U;
-        if (x == 0) {
+        if (*pSrcT1 == 0) {
             /* Loop over the rows present below */
             for (i = (l + 1U); i < m; i++) {
                 pSrcT2 = pSrcT1 + (n * i);
@@ -222,10 +85,11 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
             }
         }
         /* Return when the matrix is singular */
-        if ((flag != 1U) && (x == 0)) {
+        if ((flag == 0U) && (in == 0)) {
             return 1;
         }
 
+
         /* DIVIDE BY THE PIVOT */
 
         /* Points to the pivot row of input and destination matrices */
@@ -235,20 +99,20 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
         pSrcT1 = pPivotRowIn;
         pSrcT2 = pPivotRowDst;
         /* Pivot element of the row */
-        x = *pPivotRowIn;
+        in = *pPivotRowIn;
 
         /* Loop over number of columns to the right of the pilot element */
         j = (n - l);
         while (j > 0U) {
-            y = *pSrcT1;
-            *pSrcT1++ = FIX_DIV(y, x);
+            in1 = *pSrcT1;
+            *pSrcT1++ = FIX_DIV(in1, in);
             j--;
         }
         /* Loop over number of columns of the destination matrix */
         j = n;
         while (j > 0U) {
-            y = *pSrcT2;
-            *pSrcT2++ = FIX_DIV(y, x);
+            in1 = *pSrcT2;
+            *pSrcT2++ = FIX_DIV(in1, in);
             j--;
         }
 
@@ -272,21 +136,21 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
             } else {
 
                 /* Element of the reference row */
-                x = *pSrcT1;
+                in = *pSrcT1;
                 /* Reference row pointers */
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
 
                 j = (n - l); /* Replace the elements to the right of the pivot */
                 while (j > 0U) {
-                    y = *pSrcT1;
-                    *pSrcT1++ = y - FIX_MUL(x, *pPRT_in++);
+                    in1 = *pSrcT1;
+                    *pSrcT1++ = in1 - FIX_MUL(in, *pPRT_in++);
                     j--;
                 }
                 j = n; /* Replace the elements in the destination matrix */
                 while (j > 0U) {
-                    y = *pSrcT2;
-                    *pSrcT2++ = y - FIX_MUL(x, *pPRT_pDst++);
+                    in1 = *pSrcT2;
+                    *pSrcT2++ = in1 - FIX_MUL(in, *pPRT_pDst++);
                     j--;
                 }
             }
@@ -303,7 +167,7 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) {
         l++; /* Increment the index modifier */
     }
 
-//    if ((flag != 1U) && (x == 0)) {
+//    if ((flag != 1U) && (in == 0)) {
 //        for (i = 0; i < m * n; i++) {
 //            if (pSrc[i] != 0)
 //                break;

From faeca5086e77ed8a96a3e7bec043f8f581d93432 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 19 Jul 2022 18:09:48 +0200
Subject: [PATCH 06/22] [software] Unroll single core

---
 .../MP_matrix_inverse/mempool_mat_inv_q16s.h  | 167 ++++++++++++++----
 1 file changed, 137 insertions(+), 30 deletions(-)

diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
index 9f2201224..e217119cd 100644
--- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
+++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
@@ -15,11 +15,15 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
     int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
     int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t Xchg, in = 0, in1;                   /* Temporary input values  */
+    int32_t in = 0;
+    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+    int32_t in1, in2, in3, in4;
+    int32_t out1, out2, out3, out4;
+
     uint32_t i, rowCnt, j, loopCnt, k, l;        /* loop counters */
     uint32_t flag;
-
     uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
+
     pDstT1 = pDst;  /* Working pointer for destination matrix */
     rowCnt = m;     /* Loop over the number of rows */
     flag = 0U;
@@ -54,29 +58,69 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
         k = 1U;
-        if (*pSrcT1 == 0) {
+
+        in = *pSrcT1;
+        if (in == 0) {
             /* Loop over the rows present below */
             for (i = (l + 1U); i < m; i++) {
-                pSrcT2 = pSrcT1 + (n * i);
+                pSrcT2 = pSrc + (n * i);
                 pDstT2 = pDstT1 + (n * k);
 
                 /* Check if there is a non zero pivot element to replace in the rows below */
                 if (*pSrcT2 != 0) {
                     /* Exchange the row elements of the input matrix at the right of the pivot */
-                    j = n - l;
-                    while (j > 0U) {
-                        Xchg = *pSrcT2;
-                        *pSrcT2++ = *pSrcT1;
-                        *pSrcT1++ = Xchg;
-                        j--;
+                    j = 0;
+                    while (j < (n - l) - (n - l) % 4) {
+                        Xchg1 = *(pSrcT2);
+                        Xchg2 = *(pSrcT2 + 1);
+                        Xchg3 = *(pSrcT2 + 2);
+                        Xchg4 = *(pSrcT2 + 3);
+                        out1 = *(pSrcT1);
+                        out2 = *(pSrcT1 + 1);
+                        out3 = *(pSrcT1 + 2);
+                        out4 = *(pSrcT1 + 3);
+                        *pSrcT2++ = out1;
+                        *pSrcT2++ = out2;
+                        *pSrcT2++ = out3;
+                        *pSrcT2++ = out4;
+                        *pSrcT1++ = Xchg1;
+                        *pSrcT1++ = Xchg2;
+                        *pSrcT1++ = Xchg3;
+                        *pSrcT1++ = Xchg4;
+                        j += 4;
+                    }
+                    while (j < n - l) {
+                      Xchg1 = *pSrcT2;
+                      *pSrcT2++ = *pSrcT1;
+                      *pSrcT1++ = Xchg1;
+                      j++;
                     }
                     /* Exchange the row elements of the destination matrix */
-                    j = n;
-                    while (j > 0U) {
-                        Xchg = *pDstT2;
+                    j = 0;
+                    while (j < n - n % 4) {
+                        Xchg1 = *(pDstT2);
+                        Xchg2 = *(pDstT2 + 1);
+                        Xchg3 = *(pDstT2 + 2);
+                        Xchg4 = *(pDstT2 + 3);
+                        out1 = *(pDstT1);
+                        out2 = *(pDstT1 + 1);
+                        out3 = *(pDstT1 + 2);
+                        out4 = *(pDstT1 + 3);
+                        *pDstT2++ = out1;
+                        *pDstT2++ = out2;
+                        *pDstT2++ = out3;
+                        *pDstT2++ = out4;
+                        *pDstT1++ = Xchg1;
+                        *pDstT1++ = Xchg2;
+                        *pDstT1++ = Xchg3;
+                        *pDstT1++ = Xchg4;
+                        j += 4;
+                    }
+                    while (j < n) {
+                        Xchg1 = *pDstT2;
                         *pDstT2++ = *pDstT1;
-                        *pDstT1++ = Xchg;
-                        j--;
+                        *pDstT1++ = Xchg1;
+                        j++;
                     }
                     flag = 1U;
                     break;
@@ -102,20 +146,49 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
         in = *pPivotRowIn;
 
         /* Loop over number of columns to the right of the pilot element */
-        j = (n - l);
-        while (j > 0U) {
+        j = 0;
+        while (j < (n - l) - (n - l) % 4) {
+            in1 = *pSrcT1;
+            in2 = *(pSrcT1 + 1);
+            in3 = *(pSrcT1 + 2);
+            in4 = *(pSrcT1 + 3);
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            *pSrcT1++ = out1;
+            *pSrcT1++ = out2;
+            *pSrcT1++ = out3;
+            *pSrcT1++ = out4;
+            j += 4;
+        }
+        while (j < n - l) {
             in1 = *pSrcT1;
             *pSrcT1++ = FIX_DIV(in1, in);
-            j--;
+            j++;
         }
         /* Loop over number of columns of the destination matrix */
-        j = n;
-        while (j > 0U) {
+        j = 0;
+        while (j < n - n % 4) {
+            in1 = *pSrcT2;
+            in2 = *(pSrcT2 + 1);
+            in3 = *(pSrcT2 + 2);
+            in4 = *(pSrcT2 + 3);
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            *pSrcT2++ = out1;
+            *pSrcT2++ = out2;
+            *pSrcT2++ = out3;
+            *pSrcT2++ = out4;
+            j += 4;
+        }
+        while (j < n) {
             in1 = *pSrcT2;
             *pSrcT2++ = FIX_DIV(in1, in);
-            j--;
+            j++;
         }
-
         /* SUM THE MULTIPLE OF A BOTTOM ROW */
         /* Replace the rows with the sum of that row and a multiple of row i
          * so that each new element in column i above row i is zero.*/
@@ -141,17 +214,51 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
 
-                j = (n - l); /* Replace the elements to the right of the pivot */
-                while (j > 0U) {
+                j = 0;
+                while (j < (n - l) - (n - l) % 2) {
                     in1 = *pSrcT1;
-                    *pSrcT1++ = in1 - FIX_MUL(in, *pPRT_in++);
-                    j--;
+                    in2 = *(pSrcT1 + 1);
+                    // in3 = *(pSrcT1 + 2);
+                    // in4 = *(pSrcT1 + 3);
+                    out1 = *pPRT_in++;
+                    out2 = *pPRT_in++;
+                    // out3 = *pPRT_in++;
+                    // out4 = *pPRT_in++;
+                    *pSrcT1++ = in1 - FIX_MUL(in, out1);
+                    *pSrcT1++ = in2 - FIX_MUL(in, out2);
+                    // *pSrcT1++ = in3 - FIX_MUL(in, out3);
+                    // *pSrcT1++ = in4 - FIX_MUL(in, out4);
+                    j += 2;
+                }
+                while (j < n - l) {
+                    in1 = *pSrcT1;
+                    out1 = *pPRT_in++;
+                    *pSrcT1++ = in1 - FIX_MUL(in, out1);
+                    j++;
+                }
+                /* Loop over the number of columns to
+                   replace the elements in the destination matrix */
+                j = 0;
+                while (j < n - n % 4) {
+                    in1 = *pSrcT2;
+                    in2 = *(pSrcT2 + 1);
+                    in3 = *(pSrcT2 + 2);
+                    in4 = *(pSrcT2 + 3);
+                    out1 = *pPRT_pDst++;
+                    out2 = *pPRT_pDst++;
+                    out3 = *pPRT_pDst++;
+                    out4 = *pPRT_pDst++;
+                    *pSrcT2++ = in1 - FIX_MUL(in, out1);
+                    *pSrcT2++ = in2 - FIX_MUL(in, out2);
+                    *pSrcT2++ = in3 - FIX_MUL(in, out3);
+                    *pSrcT2++ = in4 - FIX_MUL(in, out4);
+                    j += 4;
                 }
-                j = n; /* Replace the elements in the destination matrix */
-                while (j > 0U) {
+                while (j < n) {
                     in1 = *pSrcT2;
-                    *pSrcT2++ = in1 - FIX_MUL(in, *pPRT_pDst++);
-                    j--;
+                    out1 = *pPRT_pDst++; /* advance the pivot-row pointer together with pSrcT2 */
+                    *pSrcT2++ = in1 - FIX_MUL(in, out1);
+                    j++;
                 }
             }
             /* Increment temporary input pointer */

From 972802348515d0a9295c1fd210c392341f153f4e Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 19 Jul 2022 18:42:29 +0200
Subject: [PATCH 07/22] [software] Unroll parallel core

---
 .../apps/MP_matrix_inverse/initialization.h   |   4 +-
 .../MP_matrix_inverse/mempool_mat_inv_q16p.h  | 295 +++++++++++++-----
 2 files changed, 213 insertions(+), 86 deletions(-)

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
index 1d9d6396f..c8c874ea3 100644
--- a/software/apps/MP_matrix_inverse/initialization.h
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -7,9 +7,7 @@
 #define FIXED_POINT 16
 #define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b))
 #define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
-
-dump(flag, 1);
-
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) /* fully parenthesized; the selected argument is evaluated twice */
 
 void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
 
diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
index c40bafe5b..445d70cd7 100644
--- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
+++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
@@ -6,6 +6,10 @@
 
 /* GAUSS JORDAN INVERSION */
 
+dump(l, 1);
+dump(loopCnt, 2);
+dump(i, 3);
+
 int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
 
 int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
@@ -15,7 +19,10 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
     int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
     int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t Xchg, in = 0, in1;                    /* Temporary input values  */
+    int32_t in = 0;
+    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+    int32_t in1, in2, in3, in4;
+    int32_t out1, out2, out3, out4;
 
     uint32_t core_id = mempool_get_core_id();
     uint32_t i, j, loopCnt, k, l;  /* loop counters */
@@ -23,13 +30,16 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
 
     /* CREATE THE IDENTITY MATRIX */
 
-    pDstT1 = pDst;  /* Working pointer for destination matrix */
-    for (i = core_id; i < m; i += NUM_CORES) {
+    pDstT1 = pDst;
+    for (i = core_id * 4; i < m; i += 4 * NUM_CORES) {
         for (j = 0; j < m; j++) {
             pDstT1[i * m + j] = (uint32_t) (i == j);
+            pDstT1[(i + 1) * m + j] = (uint32_t) ((i + 1) == j);
+            pDstT1[(i + 2) * m + j] = (uint32_t) ((i + 2) == j);
+            pDstT1[(i + 3) * m + j] = (uint32_t) ((i + 3) == j);
         }
     }
-    mempool_barrier(NUM_CORES);
+    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
     /* Loop over the number of columns of the input matrix. */
     loopCnt = n;
@@ -38,53 +48,101 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
 
     while (loopCnt > 0U) {
 
-        /* CHECK IF PIVOT ELEMENT IS ZERO */
-
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
-
         in = *pSrcT1;
-        k = 1U;
-        /* Check if the pivot element is zero */
-        if (*pSrcT1 == 0U) {
-
-            /* Loop over the number rows present below */
-            for (i = (l + 1U) + core_id; i < m; i += NUM_CORES) {
-                pSrcT2 = pSrcT1 + (n * i);
-                /* Check if there is element to exchange */
-                //if (*flag != 0U)
-                //    break;
-                if (*pSrcT2 != 0U)  {
-                    __atomic_fetch_add(flag, k, __ATOMIC_RELAXED);
+
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO */
+
+        if (in == 0U) {
+
+            //if (core_id == 0) {
+            //    k = 1U;
+            //    while (k < m - l) {
+            //        pSrcT2 = pSrcT1 + k * n;
+            //        if (*pSrcT2 != 0) {
+            //            *flag = k;
+            //            break;
+            //        }
+            //        k++;
+            //    }
+            //}
+            //mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+
+            k = 1U + core_id;
+            while ((k < m - l) && (*flag == 0)) {
+                pSrcT2 = pSrcT1 + k * n;
+                if (*pSrcT2 != 0) {
+                    __atomic_store_n(flag, k, __ATOMIC_RELAXED);
                 }
+                k += MIN(n / 4, NUM_CORES);
+                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
             }
-            mempool_barrier(NUM_CORES);
+
+            /* EXCHANGE */
 
             if (*flag != 0U) {
-                pSrcT2 = pSrcT1 + (n * *flag + l);
+                pSrcT2 = pSrcT1 + (n * *flag);
                 pDstT2 = pDstT1 + (n * *flag);
-                /* Loop over number of columns
-                 * to the right of the pilot element */
-                for (j = core_id; j < n - l; j += NUM_CORES) {
-                    /* Exchange the row elements of the input matrix */
-                    Xchg = pSrcT2[j];
-                    pSrcT2[j] = pSrcT1[j];
-                    pSrcT1[j] = Xchg;
+                /* Loop over columns to the right of pivot */
+                for (j = core_id * 4; j < (n - l) - (n - l) % 4; j += 4 * NUM_CORES) {
+                    Xchg1 = pSrcT2[j];
+                    Xchg2 = pSrcT2[j + 1];
+                    Xchg3 = pSrcT2[j + 2];
+                    Xchg4 = pSrcT2[j + 3];
+                    out1 = pSrcT1[j];
+                    out2 = pSrcT1[j + 1];
+                    out3 = pSrcT1[j + 2];
+                    out4 = pSrcT1[j + 3];
+                    pSrcT2[j] = out1;
+                    pSrcT2[j + 1] = out2;
+                    pSrcT2[j + 2] = out3;
+                    pSrcT2[j + 3] = out4;
+                    pSrcT1[j] = Xchg1;
+                    pSrcT1[j + 1] = Xchg2;
+                    pSrcT1[j + 2] = Xchg3;
+                    pSrcT1[j + 3] = Xchg4;
+                }
+                if (core_id == (n - l) / 4) {
+                    j = (n - l) - (n - l) % 4;
+                    while (j < n - l) {
+                        Xchg1 = pSrcT2[j];
+                        pSrcT2[j] = pSrcT1[j];
+                        pSrcT1[j] = Xchg1;
+                        j++;
+                    }
+                }
+                /* Loop over columns */
+                for (j = core_id * 4; j < n - n % 4; j += 4 * NUM_CORES) {
+                    Xchg1 = pDstT2[j];
+                    Xchg2 = pDstT2[j + 1];
+                    Xchg3 = pDstT2[j + 2];
+                    Xchg4 = pDstT2[j + 3];
+                    out1 = pDstT1[j];
+                    out2 = pDstT1[j + 1];
+                    out3 = pDstT1[j + 2];
+                    out4 = pDstT1[j + 3];
+                    pDstT2[j] = out1;
+                    pDstT2[j + 1] = out2;
+                    pDstT2[j + 2] = out3;
+                    pDstT2[j + 3] = out4;
+                    pDstT1[j] = Xchg1;
+                    pDstT1[j + 1] = Xchg2;
+                    pDstT1[j + 2] = Xchg3;
+                    pDstT1[j + 3] = Xchg4;
                 }
-                pSrcT1 += n - l;
-                pSrcT2 += n - l;
-                /* Loop over number of columns of the destination matrix */
-                for(j = core_id; j < n; j += NUM_CORES) {
-                    /* Exchange the row elements of the destination matrix */
-                    Xchg = pDstT2[j];
-                    pDstT2[j] = pDstT1[j];
-                    pDstT1[j] = Xchg;
+                if (core_id == n / 4) {
+                    j = n - n % 4;
+                    while (j < n) {
+                        Xchg1 = pDstT2[j];
+                        pDstT2[j] = pDstT1[j];
+                        pDstT1[j] = Xchg1;
+                        j++;
+                    }
                 }
-                pDstT2 += n;
-                pDstT1 += n;
             }
-            k++;
-            mempool_barrier(NUM_CORES);
+            mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
         }
 
         /* Update the status if the matrix is singular */
@@ -92,6 +150,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
             return 1;
         }
 
+
         /* DIVIDE BY THE PIVOT */
 
         /* Points to the pivot row of input and destination matrices */
@@ -102,55 +161,126 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         pSrcT2 = pPivotRowDst;
         /* Pivot element of the row */
         in = *pPivotRowIn;
-        /* Loop over number of columns to the right of the pilot element */
-        for(j = core_id; j < n - l; j += NUM_CORES) {
+
+        ///* Loop over columns to the right of pivot */
+        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
             in1 = pSrcT1[j];
-            pSrcT1[j] = FIX_DIV(in1, in);
+            in2 = pSrcT1[j + 1];
+            in3 = pSrcT1[j + 2];
+            in4 = pSrcT1[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT1[j] = out1;
+            pSrcT1[j + 1] = out2;
+            pSrcT1[j + 2] = out3;
+            pSrcT1[j + 3] = out4;
+        }
+        if (core_id == (n - l) / 4) {
+            j = 4 * ((n - l) >> 2U);
+            while (j < n - l) {
+                in1 = pSrcT1[j];
+                pSrcT1[j] = FIX_DIV(in1, in);
+                j++;
+            }
         }
-        /* Loop over number of columns of the destination matrix */
-        for(j = core_id; j < n; j += NUM_CORES) {
+        /* Loop over columns */
+        for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
             in1 = pSrcT2[j];
-            pSrcT2[j] = FIX_DIV(in1, in);
+            in2 = pSrcT2[j + 1];
+            in3 = pSrcT2[j + 2];
+            in4 = pSrcT2[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT2[j] = out1;
+            pSrcT2[j + 1] = out2;
+            pSrcT2[j + 2] = out3;
+            pSrcT2[j + 3] = out4;
         }
-        mempool_barrier(NUM_CORES);
-
-        /*REPLACE ROWS */
-
-        pSrcT1 = pSrc + core_id * n;
-        pSrcT2 = pDst + core_id * n;
-        i = core_id;
-        k = m;
-        for(k = core_id; k < m; k += NUM_CORES) {
-            if (i != l) {
-                /* Element of the reference row */
-                in = *pSrcT1;
-                /* Working pointers for input and destination pivot rows */
-                pPRT_in = pPivotRowIn;
-                pPRT_pDst = pPivotRowDst;
-                /* Loop over the number of columns to the right of the pivot element,
-                   to replace the elements in the input matrix */
-                for (j = 0; j < n - l; j++) {
-                    in1 = pSrcT1[j];
-                    pSrcT1[j] = in1 - FIX_MUL(in, pPRT_in[j]);
-                }
-                /* Loop over the number of columns to
-                   replace the elements in the destination matrix */
-                for (j = 0; j < n; j++) {
-                    in1 = pSrcT2[j];
-                    pSrcT2[j] = in1 - FIX_MUL(in, pPRT_pDst[j]);
+        if (core_id == n / 4) { /* one core handles the n % 4 remainder columns of the destination row */
+            j = 4 * (n >> 2U);
+            while (j < n) {
+                in1 = pSrcT2[j]; /* destination pivot row, same row as the unrolled loop above */
+                pSrcT2[j] = FIX_DIV(in1, in);
+                j++;
+            }
+        }
+        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+
+
+        /* REPLACE ROWS */
+
+        pSrcT1 = pSrc;
+        pSrcT2 = pDst;
+
+        /* Loop over rows */
+        for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+            i = 0;
+            while (i < 4) {
+                if ((i + k) != l) {
+                    pSrcT1 = pSrc + (i + k) * n;
+                    pSrcT2 = pDst + (i + k) * n;
+                    /* Element of the reference row */
+                    in = *pSrcT1;
+                    pPRT_in = pPivotRowIn;
+                    pPRT_pDst = pPivotRowDst;
+                    j = 0; /* Loop over columns to the right of pivot; restart at column 0 for every row */
+                    while (j < (n - l) - (n - l) % 4) {
+                        in1 = pSrcT1[j];
+                        in2 = pSrcT1[j + 1];
+                        in3 = pSrcT1[j + 2];
+                        in4 = pSrcT1[j + 3];
+                        out1 = pPRT_in[j];
+                        out2 = pPRT_in[j + 1];
+                        out3 = pPRT_in[j + 2];
+                        out4 = pPRT_in[j + 3];
+                        pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+                        pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+                        pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+                        pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+                        j += 4;
+                    }
+                    while (j < n - l) {
+                        in1 = pSrcT1[j];
+                        out1 = pPRT_in[j];
+                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
+                    /* Loop over columns */
+                    j = 0;
+                    while (j < n - n % 4) {
+                        in1 = pSrcT2[j];
+                        in2 = pSrcT2[j + 1];
+                        in3 = pSrcT2[j + 2];
+                        in4 = pSrcT2[j + 3];
+                        out1 = pPRT_pDst[j];
+                        out2 = pPRT_pDst[j + 1];
+                        out3 = pPRT_pDst[j + 2];
+                        out4 = pPRT_pDst[j + 3];
+                        pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+                        pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+                        pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+                        pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+                        j += 4;
+                    }
+                    while (j < n) {
+                        in1 = pSrcT2[j];
+                        out1 = pPRT_pDst[j];
+                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
                 }
+                i++;
             }
-            i += NUM_CORES;
-            pSrcT1 += NUM_CORES * n;
-            pSrcT2 += NUM_CORES * n;
         }
-        /* Increment the input pointer */
-        pSrc++;
-        /* Decrement the loop counter */
-        loopCnt--;
-        /* Increment the index modifier */
-        l++;
-        mempool_barrier(NUM_CORES);
+        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+
+        pSrc++;     /* Increment the input pointer */
+        loopCnt--;  /* Decrement the loop counter */
+        l++;        /* Increment the index modifier */
     }
 
 //    if ((flag != 1U) && (x == 0)) {
@@ -161,6 +291,5 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
 //        if (i == m * n)
 //            return 1;
 //    }
-
     return 0;
 }

From ee0119ca63814b204dda840c6a157a5a2c7c7f67 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 21 Jul 2022 08:31:44 +0200
Subject: [PATCH 08/22] [software] Clean comments on single-core

---
 .../MP_matrix_inverse/mempool_mat_inv_q16s.h  | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
index e217119cd..83c5a3c21 100644
--- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
+++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
@@ -29,6 +29,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
     flag = 0U;
 
     /* CREATE THE IDENTITY MATRIX */
+
     while (rowCnt > 0U) {
         j = m - rowCnt;
         while (j > 0U) {
@@ -50,25 +51,24 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
     l = 0U;
 
     while (loopCnt > 0U) {
-        /* CHECK IF PIVOT ELEMENT IS ZERO...
-         * If it is zero then interchange the row with non zero row below.
-         * If there is no non zero element to replace in the rows below,
-         * then the matrix is Singular. */
 
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
         k = 1U;
-
         in = *pSrcT1;
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO */
+
         if (in == 0) {
             /* Loop over the rows present below */
             for (i = (l + 1U); i < m; i++) {
                 pSrcT2 = pSrc + (n * i);
                 pDstT2 = pDstT1 + (n * k);
 
-                /* Check if there is a non zero pivot element to replace in the rows below */
+                /* EXCHANGE */
+
                 if (*pSrcT2 != 0) {
-                    /* Exchange the row elements of the input matrix at the right of the pivot */
+                    /* Loop over colums to the right of the pivot */
                     j = 0;
                     while (j < (n - l) - (n - l) % 4) {
                         Xchg1 = *(pSrcT2);
@@ -95,7 +95,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
                       *pSrcT1++ = Xchg1;
                       j++;
                     }
-                    /* Exchange the row elements of the destination matrix */
+                    /* Loop over colums */
                     j = 0;
                     while (j < n - n % 4) {
                         Xchg1 = *(pDstT2);
@@ -133,7 +133,6 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
             return 1;
         }
 
-
         /* DIVIDE BY THE PIVOT */
 
         /* Points to the pivot row of input and destination matrices */
@@ -189,10 +188,8 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
             *pSrcT2++ = FIX_DIV(in1, in);
             j++;
         }
-        /* SUM THE MULTIPLE OF A BOTTOM ROW */
-        /* Replace the rows with the sum of that row and a multiple of row i
-         * so that each new element in column i above row i is zero.*/
-        /* Temporary pointers for input and destination matrices */
+
+        /* REPLACE ROWS */
 
         pSrcT1 = pSrc;
         pSrcT2 = pDst;

From b412c877395b3606e84a409c6a9ec94874f4b7ff Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 21 Jul 2022 11:22:30 +0200
Subject: [PATCH 09/22] [software] Change kernel name

---
 .../initialization.h                          |  0
 .../{MP_matrix_inverse => mat_inv}/main.c     | 33 ++++++++---------
 .../mempool_mat_inv_q32p.h}                   | 35 ++++++++++++++++++-
 .../mempool_mat_inv_q32s.h}                   |  0
 4 files changed, 49 insertions(+), 19 deletions(-)
 rename software/apps/{MP_matrix_inverse => mat_inv}/initialization.h (100%)
 rename software/apps/{MP_matrix_inverse => mat_inv}/main.c (86%)
 rename software/apps/{MP_matrix_inverse/mempool_mat_inv_q16p.h => mat_inv/mempool_mat_inv_q32p.h} (86%)
 rename software/apps/{MP_matrix_inverse/mempool_mat_inv_q16s.h => mat_inv/mempool_mat_inv_q32s.h} (100%)

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/mat_inv/initialization.h
similarity index 100%
rename from software/apps/MP_matrix_inverse/initialization.h
rename to software/apps/mat_inv/initialization.h
diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/mat_inv/main.c
similarity index 86%
rename from software/apps/MP_matrix_inverse/main.c
rename to software/apps/mat_inv/main.c
index 4cb660e23..6c14707f2 100644
--- a/software/apps/MP_matrix_inverse/main.c
+++ b/software/apps/mat_inv/main.c
@@ -7,9 +7,9 @@
 //#include <stdint.h>
 //#include <string.h>
 
-#define N 4
-#define M 4
-#define O 4
+#define N 16
+#define M 16
+#define O 16
 
 #include "encoding.h"
 #include "printf.h"
@@ -17,12 +17,13 @@
 #include "synchronization.h"
 
 #include "initialization.h"
-#include "mempool_mat_inv_q16s.h"
-#include "mempool_mat_inv_q16p.h"
+#include "mempool_mat_inv_q32p.h"
+#include "mempool_mat_inv_q32s.h"
+
 
 // #define VERBOSE
-// #define SINGLE
-#define PARALLEL
+#define SINGLE
+// #define PARALLEL
 
 int32_t matrix[N * M]         __attribute__((section(".l1")));
 int32_t t_matrix[M * N]       __attribute__((section(".l1")));
@@ -32,12 +33,6 @@ int32_t pseudoinverse[M * N]  __attribute__((section(".l1")));
 uint32_t flag                __attribute__((section(".l1")));
 
 void display(int32_t *A, int32_t n, int32_t m) {
-    //int32_t volatile i = 0;
-    //while (i < n * m) {
-    //    // printf("ciao mamma\n");
-    //    printf("Value %d: %d\n", i, A[i]);
-    //    i++;
-    //}
     int32_t i, j;
     for (i = 0; i < n; i++) {
       for (j = 0; j < m; j++) {
@@ -56,10 +51,10 @@ void single_core()
     // Initialize barrier and synchronize
     mempool_barrier_init(core_id);
 
-    init_matrix(matrix, N, M, -156, 427, -219, core_id);
+    init_matrix(matrix, N, M, -156, 2000, -219, core_id);
     //init_matrix_zeros(t_matrix, M, N, core_id);
     //init_matrix_zeros(matrix_mult, M, M, core_id);
-    //init_matrix_zeros(inv, M, M, core_id);
+    init_matrix_zeros(inv, M, M, core_id);
     //init_matrix_zeros(pseudoinverse, M, N, core_id);
     mempool_barrier(num_cores);
 
@@ -114,9 +109,11 @@ void multi_core()
     //Transpose(matrix, t_matrix, N, M);
     //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
 
-    mempool_start_benchmark();
-    mempool_GJinv_q16p(matrix, inv, M, &flag);
-    mempool_stop_benchmark();
+    if (core_id < MIN(NUM_CORES, N / 4)) {
+      mempool_start_benchmark();
+      mempool_GJinv_q16p(matrix, inv, M, &flag);
+      mempool_stop_benchmark();
+    }
 
     //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
     //MatrixMult(pseudoinverse, matrix, inv, M, N, M);
diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
similarity index 86%
rename from software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
rename to software/apps/mat_inv/mempool_mat_inv_q32p.h
index 445d70cd7..4576cde37 100644
--- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h
@@ -228,6 +228,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                     pPRT_in = pPivotRowIn;
                     pPRT_pDst = pPivotRowDst;
                     /* Loop over columns to the right of pivot */
+                    j = 0;
                     while (j < (n - l) - (n - l) % 4) {
                         in1 = pSrcT1[j];
                         in2 = pSrcT1[j + 1];
@@ -249,6 +250,22 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                         pSrcT1[j] = in1 - FIX_MUL(in, out1);
                         j++;
                     }
+                    //j = 0;
+                    //while (j < MIN(core_id * 4, n - l)) {
+                    //    in1 = pSrcT1[j];
+                    //    in2 = pSrcT1[j + 1];
+                    //    in3 = pSrcT1[j + 2];
+                    //    in4 = pSrcT1[j + 3];
+                    //    out1 = pPRT_in[j];
+                    //    out2 = pPRT_in[j + 1];
+                    //    out3 = pPRT_in[j + 2];
+                    //    out4 = pPRT_in[j + 3];
+                    //    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+                    //    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+                    //    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+                    //    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+                    //    j += 4;
+                    //}
                     /* Loop over columns */
                     j = 0;
                     while (j < n - n % 4) {
@@ -266,12 +283,28 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                         pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
                         j += 4;
                     }
-                    while (j < n) {
+                    while (j < MIN(core_id * 4, n)) {
                         in1 = pSrcT2[j];
                         out1 = pPRT_pDst[j];
                         pSrcT2[j] = in1 - FIX_MUL(in, out1);
                         j++;
                     }
+                    //j = 0;
+                    //while (j < core_id * 4) {
+                    //    in1 = pSrcT2[j];
+                    //    in2 = pSrcT2[j + 1];
+                    //    in3 = pSrcT2[j + 2];
+                    //    in4 = pSrcT2[j + 3];
+                    //    out1 = pPRT_pDst[j];
+                    //    out2 = pPRT_pDst[j + 1];
+                    //    out3 = pPRT_pDst[j + 2];
+                    //    out4 = pPRT_pDst[j + 3];
+                    //    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+                    //    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+                    //    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+                    //    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+                    //    j += 4;
+                    //}
                 }
                 i++;
             }
diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h
similarity index 100%
rename from software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h
rename to software/apps/mat_inv/mempool_mat_inv_q32s.h

From 91380245b2b11d5763d00a66ab5e7a5ad28e006d Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 27 Jul 2022 11:03:39 +0200
Subject: [PATCH 10/22] [software] Add different parallelization schemes

---
 software/apps/mat_inv/mempool_mat_inv_q32p.h | 192 +++++++++++--------
 software/apps/mat_inv/mempool_mat_inv_q32s.h |  17 +-
 2 files changed, 122 insertions(+), 87 deletions(-)

diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
index 4576cde37..320fe709b 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h
@@ -10,9 +10,9 @@ dump(l, 1);
 dump(loopCnt, 2);
 dump(i, 3);
 
-int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
+int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
 
-int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
+int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
 
     int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
     int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
@@ -31,12 +31,12 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
     /* CREATE THE IDENTITY MATRIX */
 
     pDstT1 = pDst;
-    for (i = core_id * 4; i < m; i += 4 * NUM_CORES) {
+    for (k = core_id * 4; k < m; k += 4 * NUM_CORES) {
         for (j = 0; j < m; j++) {
-            pDstT1[i * m + j] = (uint32_t) (i == j);
-            pDstT1[(i + 1) * m + j] = (uint32_t) ((i + 1) == j);
-            pDstT1[(i + 2) * m + j] = (uint32_t) ((i + 2) == j);
-            pDstT1[(i + 3) * m + j] = (uint32_t) ((i + 3) == j);
+            pDstT1[k * m + j] = (uint32_t) (k == j);
+            pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j);
+            pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j);
+            pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j);
         }
     }
     mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
@@ -52,33 +52,32 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         pDstT1 = pDst + (l * n);
         in = *pSrcT1;
 
-
         /* CHECK IF PIVOT ELEMENT IS ZERO */
 
         if (in == 0U) {
 
-            //if (core_id == 0) {
-            //    k = 1U;
-            //    while (k < m - l) {
-            //        pSrcT2 = pSrcT1 + k * n;
-            //        if (*pSrcT2 != 0) {
-            //            *flag = k;
-            //            break;
-            //        }
-            //        k++;
-            //    }
-            //}
-            //mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-
-            k = 1U + core_id;
-            while ((k < m - l) && (*flag == 0)) {
-                pSrcT2 = pSrcT1 + k * n;
-                if (*pSrcT2 != 0) {
-                    __atomic_store_n(flag, k, __ATOMIC_RELAXED);
+            if (core_id == 0) {
+                k = 1U;
+                while (k < m - l) {
+                    pSrcT2 = pSrcT1 + k * n;
+                    if (*pSrcT2 != 0) {
+                        *flag = k;
+                        break;
+                    }
+                    k++;
                 }
-                k += MIN(n / 4, NUM_CORES);
-                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
             }
+            mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+
+            //k = 1U + core_id;
+            //while ((k < m - l) && (*flag == 0)) {
+            //    pSrcT2 = pSrcT1 + k * n;
+            //    if (*pSrcT2 != 0) {
+            //        __atomic_store_n(flag, k, __ATOMIC_RELAXED);
+            //    }
+            //    k += MIN(n / 4, NUM_CORES);
+            //    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+            //}
 
             /* EXCHANGE */
 
@@ -86,7 +85,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                 pSrcT2 = pSrcT1 + (n * *flag);
                 pDstT2 = pDstT1 + (n * *flag);
                 /* Loop over columns to the right of pivot */
-                for (j = core_id * 4; j < (n - l) - (n - l) % 4; j += 4 * NUM_CORES) {
+                //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
+                //while (j < 4 * ((n - l) >> 2U)) {
+                for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) {
                     Xchg1 = pSrcT2[j];
                     Xchg2 = pSrcT2[j + 1];
                     Xchg3 = pSrcT2[j + 2];
@@ -103,9 +104,10 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                     pSrcT1[j + 1] = Xchg2;
                     pSrcT1[j + 2] = Xchg3;
                     pSrcT1[j + 3] = Xchg4;
+                    // j += 4 * NUM_CORES;
                 }
-                if (core_id == (n - l) / 4) {
-                    j = (n - l) - (n - l) % 4;
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * ((n - l) >> 2U);
                     while (j < n - l) {
                         Xchg1 = pSrcT2[j];
                         pSrcT2[j] = pSrcT1[j];
@@ -114,7 +116,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                     }
                 }
                 /* Loop over columns */
-                for (j = core_id * 4; j < n - n % 4; j += 4 * NUM_CORES) {
+                for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) {
                     Xchg1 = pDstT2[j];
                     Xchg2 = pDstT2[j + 1];
                     Xchg3 = pDstT2[j + 2];
@@ -132,8 +134,8 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                     pDstT1[j + 2] = Xchg3;
                     pDstT1[j + 3] = Xchg4;
                 }
-                if (core_id == n / 4) {
-                    j = n - n % 4;
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * (n >> 2U);
                     while (j < n) {
                         Xchg1 = pDstT2[j];
                         pDstT2[j] = pDstT1[j];
@@ -144,7 +146,6 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
             }
             mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
         }
-
         /* Update the status if the matrix is singular */
         if ((*flag == 0U) && (in == 0U)) {
             return 1;
@@ -163,6 +164,8 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         in = *pPivotRowIn;
 
         ///* Loop over columns to the right of pivot */
+        // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
+        // while (j < 4 * ((n - l) >> 2U)) {
         for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
             in1 = pSrcT1[j];
             in2 = pSrcT1[j + 1];
@@ -176,8 +179,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
             pSrcT1[j + 1] = out2;
             pSrcT1[j + 2] = out3;
             pSrcT1[j + 3] = out4;
+            // j += NUM_CORES * 4;
         }
-        if (core_id == (n - l) / 4) {
+        if (core_id == (n >> 2U) - 1) {
             j = 4 * ((n - l) >> 2U);
             while (j < n - l) {
                 in1 = pSrcT1[j];
@@ -200,11 +204,11 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
             pSrcT2[j + 2] = out3;
             pSrcT2[j + 3] = out4;
         }
-        if (core_id == n / 4) {
+        if (core_id == (n >> 2U) - 1) {
             j = 4 * (n >> 2U);
             while (j < n) {
-                in1 = pSrcT1[j];
-                pSrcT1[j] = FIX_DIV(in1, in);
+                in1 = pSrcT2[j];
+                pSrcT2[j] = FIX_DIV(in1, in);
                 j++;
             }
         }
@@ -215,10 +219,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
 
         pSrcT1 = pSrc;
         pSrcT2 = pDst;
-
         /* Loop over rows */
         for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
-            i = 0;
+            i = 0U;
             while (i < 4) {
                 if ((i + k) != l) {
                     pSrcT1 = pSrc + (i + k) * n;
@@ -229,7 +232,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                     pPRT_pDst = pPivotRowDst;
                     /* Loop over columns to the right of pivot */
                     j = 0;
-                    while (j < (n - l) - (n - l) % 4) {
+                    while (j < 4 * ((n - l) >> 2U)) {
                         in1 = pSrcT1[j];
                         in2 = pSrcT1[j + 1];
                         in3 = pSrcT1[j + 2];
@@ -250,25 +253,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                         pSrcT1[j] = in1 - FIX_MUL(in, out1);
                         j++;
                     }
-                    //j = 0;
-                    //while (j < MIN(core_id * 4, n - l)) {
-                    //    in1 = pSrcT1[j];
-                    //    in2 = pSrcT1[j + 1];
-                    //    in3 = pSrcT1[j + 2];
-                    //    in4 = pSrcT1[j + 3];
-                    //    out1 = pPRT_in[j];
-                    //    out2 = pPRT_in[j + 1];
-                    //    out3 = pPRT_in[j + 2];
-                    //    out4 = pPRT_in[j + 3];
-                    //    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-                    //    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-                    //    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-                    //    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-                    //    j += 4;
-                    //}
                     /* Loop over columns */
                     j = 0;
-                    while (j < n - n % 4) {
+                    while (j < 4 * (n >> 2U)) {
                         in1 = pSrcT2[j];
                         in2 = pSrcT2[j + 1];
                         in3 = pSrcT2[j + 2];
@@ -283,34 +270,87 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
                         pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
                         j += 4;
                     }
-                    while (j < MIN(core_id * 4, n)) {
+                    while (j < n) {
                         in1 = pSrcT2[j];
                         out1 = pPRT_pDst[j];
                         pSrcT2[j] = in1 - FIX_MUL(in, out1);
                         j++;
                     }
-                    //j = 0;
-                    //while (j < core_id * 4) {
-                    //    in1 = pSrcT2[j];
-                    //    in2 = pSrcT2[j + 1];
-                    //    in3 = pSrcT2[j + 2];
-                    //    in4 = pSrcT2[j + 3];
-                    //    out1 = pPRT_pDst[j];
-                    //    out2 = pPRT_pDst[j + 1];
-                    //    out3 = pPRT_pDst[j + 2];
-                    //    out4 = pPRT_pDst[j + 3];
-                    //    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-                    //    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-                    //    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-                    //    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-                    //    j += 4;
-                    //}
                 }
                 i++;
             }
         }
         mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
+//        pSrcT1 = pSrc;
+//        pSrcT2 = pDst;
+//        /* Loop over rows */
+//        for (k = 0; k < m; k++) {
+//            if (k != l) {
+//                pSrcT1 = pSrc + k * n;
+//                pSrcT2 = pDst + k * n;
+//                /* Element of the reference row */
+//                in = *pSrcT1;
+//                pPRT_in = pPivotRowIn;
+//                pPRT_pDst = pPivotRowDst;
+//                /* Loop over columns to the right of pivot */
+//                j = core_id * 4;
+//                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
+//                while (j < 4 * ((n - l) >> 2U)) {
+//                    in1 = pSrcT1[j];
+//                    in2 = pSrcT1[j + 1];
+//                    in3 = pSrcT1[j + 2];
+//                    in4 = pSrcT1[j + 3];
+//                    out1 = pPRT_in[j];
+//                    out2 = pPRT_in[j + 1];
+//                    out3 = pPRT_in[j + 2];
+//                    out4 = pPRT_in[j + 3];
+//                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4 * NUM_CORES;
+//                }
+//                if (core_id == (n >> 2U) - 1) {
+//                    j = 4 * ((n - l) >> 2U);
+//                    while (j < n - l) {
+//                        in1 = pSrcT1[j];
+//                        out1 = pPRT_in[j];
+//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                        j++;
+//                    }
+//                }
+//                /* Loop over columns */
+//                j = core_id * 4;
+//                while (j < 4 * (n >> 2U)) {
+//                    in1 = pSrcT2[j];
+//                    in2 = pSrcT2[j + 1];
+//                    in3 = pSrcT2[j + 2];
+//                    in4 = pSrcT2[j + 3];
+//                    out1 = pPRT_pDst[j];
+//                    out2 = pPRT_pDst[j + 1];
+//                    out3 = pPRT_pDst[j + 2];
+//                    out4 = pPRT_pDst[j + 3];
+//                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4 * NUM_CORES;
+//                }
+//                if (core_id == (n >> 2U) - 1) {
+//                    j = 4 * (n >> 2U);
+//                    while (j < n) {
+//                        in1 = pSrcT2[j];
+//                        out1 = pPRT_pDst[j];
+//                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+//                        j++;
+//                    }
+//                }
+//                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+//            }
+//        }
+//        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+
         pSrc++;     /* Increment the input pointer */
         loopCnt--;  /* Decrement the loop counter */
         l++;        /* Increment the index modifier */
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h
index 83c5a3c21..70fff05a2 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32s.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h
@@ -6,9 +6,9 @@
 
 /* GAUSS JORDAN INVERSION */
 
-int mempool_GJinv_q16s(int32_t *pSrc, int32_t *pDst, uint32_t n);
+int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
-int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
+int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
 
     int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
     int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
@@ -146,7 +146,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
 
         /* Loop over number of columns to the right of the pilot element */
         j = 0;
-        while (j < (n - l) - (n - l) % 4) {
+        while (j < 4 * ((n - l) >> 2U)) {
             in1 = *pSrcT1;
             in2 = *(pSrcT1 + 1);
             in3 = *(pSrcT1 + 2);
@@ -168,7 +168,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
         }
         /* Loop over number of columns of the destination matrix */
         j = 0;
-        while (j < n - n % 4) {
+        while (j < 4 * (n >> 2U)) {
             in1 = *pSrcT2;
             in2 = *(pSrcT2 + 1);
             in3 = *(pSrcT2 + 2);
@@ -193,26 +193,21 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
 
         pSrcT1 = pSrc;
         pSrcT2 = pDst;
-
         i = 0U; /* pivot index */
         k = m; /* row index */
         while (k > 0U) {
-
             /* Only the columns to the right of the pivot are to be processed */
             if (i == l) {
                 pSrcT1 += n - l;
                 pSrcT2 += n;
-
             } else {
-
                 /* Element of the reference row */
                 in = *pSrcT1;
                 /* Reference row pointers */
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
-
                 j = 0;
-                while (j < (n - l) - (n - l) % 2) {
+                while (j < 2 * ((n - l) >> 1U)) {
                     in1 = *pSrcT1;
                     in2 = *(pSrcT1 + 1);
                     // in3 = *(pSrcT1 + 2);
@@ -236,7 +231,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
                 /* Loop over the number of columns to
                    replace the elements in the destination matrix */
                 j = 0;
-                while (j < n - n % 4) {
+                while (j < 4 * (n >> 2U)) {
                     in1 = *pSrcT2;
                     in2 = *(pSrcT2 + 1);
                     in3 = *(pSrcT2 + 2);

From 3aad7fdd3000c472f833e230a221711b1e9716b3 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 27 Jul 2022 11:04:12 +0200
Subject: [PATCH 11/22] [software] Shape memory accesses to mempool

---
 software/apps/mat_inv/main.c                  | 114 +++---
 .../mat_inv/mempool_mat_inv_q32p_memsized.h   | 358 ++++++++++++++++++
 2 files changed, 419 insertions(+), 53 deletions(-)
 create mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h

diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index 6c14707f2..a00f66b99 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -10,6 +10,7 @@
 #define N 16
 #define M 16
 #define O 16
+#define N_BANKS (1024)
 
 #include "encoding.h"
 #include "printf.h"
@@ -18,27 +19,30 @@
 
 #include "initialization.h"
 #include "mempool_mat_inv_q32p.h"
+#include "mempool_mat_inv_q32p_memsized.h"
 #include "mempool_mat_inv_q32s.h"
 
 
-// #define VERBOSE
-#define SINGLE
+#define VERBOSE
+// #define SINGLE
 // #define PARALLEL
+#define MEMSIZED
 
-int32_t matrix[N * M]         __attribute__((section(".l1")));
-int32_t t_matrix[M * N]       __attribute__((section(".l1")));
-int32_t matrix_mult[M * M]    __attribute__((section(".l1")));
-int32_t inv[M * M]            __attribute__((section(".l1")));
-int32_t pseudoinverse[M * N]  __attribute__((section(".l1")));
-uint32_t flag                __attribute__((section(".l1")));
+int32_t matrix[N * M]         __attribute__((aligned(N), section(".l1")));
+int32_t inv[M * M]            __attribute__((aligned(N), section(".l1")));
+uint32_t flag                 __attribute__((section(".l1")));
 
 void display(int32_t *A, int32_t n, int32_t m) {
-    int32_t i, j;
-    for (i = 0; i < n; i++) {
-      for (j = 0; j < m; j++) {
-        printf("%5d ", A[i * m + j]);
-      }
-      printf("\n");
+    //int32_t i, j;
+    //for (i = 0; i < n; i++) {
+    //  for (j = 0; j < m; j++) {
+    //    printf("%8d ", A[i * m + j]);
+    //  }
+    //  printf("\n");
+    //}
+    int32_t i;
+    for (i = 0; i < n * m; i++) {
+      printf("Output[%d] = %8d\n", i, A[i]);
     }
 }
 
@@ -51,41 +55,21 @@ void single_core()
     // Initialize barrier and synchronize
     mempool_barrier_init(core_id);
 
-    init_matrix(matrix, N, M, -156, 2000, -219, core_id);
-    //init_matrix_zeros(t_matrix, M, N, core_id);
-    //init_matrix_zeros(matrix_mult, M, M, core_id);
+    init_matrix(matrix, N, M, -156, 427, -219, core_id);
     init_matrix_zeros(inv, M, M, core_id);
-    //init_matrix_zeros(pseudoinverse, M, N, core_id);
     mempool_barrier(num_cores);
 
     if(core_id == 0) {
-      #if defined(VERBOSE)
-          display(matrix, N, M);
-          Transpose(matrix, t_matrix, N,  M);
-          printf("\nThe Transpose is :\n");
-          display(t_matrix, M, N);
-          MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
-          printf("The product of the matrix is: \n");
-          display(matrix_mult, M, M);
-          printf("\nThe Inverse is :\n");
-          mempool_mat_inv_q16s(matrix_mult, inv, N);
-          display(inv, N, N);
-          MatrixMult(t_matrix, inv, pseudoinverse, M, N, N);
-          printf("\nThe Moore-Penrose inverse is :\n");
-          display(pseudoinverse, M, N);
-      #else
-          //Transpose(matrix, t_matrix, N, M);
-          //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
-
-          mempool_start_benchmark();
-          mempool_GJinv_q16s(matrix, inv, M);
-          mempool_stop_benchmark();
-
-          //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
-          //MatrixMult(pseudoinverse, matrix, inv, M, N, M);
-      #endif
+        mempool_start_benchmark();
+        mempool_GJinv_q32s(matrix, inv, M);
+        mempool_stop_benchmark();
     }
     mempool_barrier(num_cores);
+    #ifdef VERBOSE
+    if (core_id == 0)
+      display(inv, N, M);
+    #endif
+    mempool_barrier(num_cores);
 }
 
 void multi_core()
@@ -97,27 +81,49 @@ void multi_core()
     mempool_barrier_init(core_id);
 
     init_matrix(matrix, N, M, -156, 427, -219, core_id);
+    init_matrix_zeros(inv, M, M, core_id);
     if (core_id == 0) {
         flag = 0U;
     }
-    //init_matrix_zeros(t_matrix, M, N, core_id);
-    //init_matrix_zeros(matrix_mult, M, M, core_id);
-    //init_matrix_zeros(inv, M, M, core_id);
-    //init_matrix_zeros(pseudoinverse, M, N, core_id);
     mempool_barrier(num_cores);
 
-    //Transpose(matrix, t_matrix, N, M);
-    //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O);
-
     if (core_id < MIN(NUM_CORES, N / 4)) {
       mempool_start_benchmark();
-      mempool_GJinv_q16p(matrix, inv, M, &flag);
+      mempool_GJinv_q32p(matrix, inv, M, &flag);
       mempool_stop_benchmark();
     }
+    mempool_barrier(num_cores);
+    #ifdef VERBOSE
+    if (core_id == 0)
+      display(inv, M, N);
+    #endif
+    mempool_barrier(num_cores);
+}
 
-    //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N);
-    //MatrixMult(pseudoinverse, matrix, inv, M, N, M);
+void multi_core_memsized() // driver for mempool_GJinv_q32p_memsized
+{
 
+    uint32_t core_id = mempool_get_core_id();
+    uint32_t num_cores = mempool_get_core_count();
+    // Initialize barrier and synchronize
+    mempool_barrier_init(core_id);
+
+    init_matrix(matrix, N, M, -156, 427, -219, core_id);
+    init_matrix_zeros(inv, M, M, core_id);
+    if (core_id == 0) {
+        flag = 0U;
+    }
+    mempool_barrier(num_cores);
+
+    mempool_start_benchmark();
+    mempool_GJinv_q32p_memsized(matrix, inv, M, &flag); // entered by every core; the return value is not checked
+    mempool_stop_benchmark();
+
+    mempool_barrier(num_cores);
+    #ifdef VERBOSE
+    if (core_id == 0) // only core 0 prints the result
+      display(inv, M, N);
+    #endif
     mempool_barrier(num_cores);
 }
 
@@ -126,6 +132,8 @@ int main() {
     single_core();
     #elif defined(PARALLEL)
     multi_core();
+    #elif defined(MEMSIZED)
+    multi_core_memsized();
     #endif
     return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
new file mode 100644
index 000000000..496459e19
--- /dev/null
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
@@ -0,0 +1,358 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* GAUSS JORDAN INVERSION (parallel; the column split of each row is rotated by the row offset) */
+/* pSrc (n x n) is overwritten during elimination; pDst (n x n) receives the inverse. */
+int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
+/* Returns 1 when a zero pivot cannot be replaced (singular matrix), 0 otherwise. */
+int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
+    /* flag: word shared by all cores, used by core 0 to publish the row offset of a replacement pivot */
+    int32_t volatile *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
+    int32_t volatile *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
+    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
+    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+    int32_t in = 0;
+    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+    int32_t in1, in2, in3, in4;
+    int32_t out1, out2, out3, out4;
+
+    uint32_t absolute_core_id = mempool_get_core_id();
+    uint32_t core_id = absolute_core_id;
+    uint32_t i, j, loopCnt, k, l;  /* loop counters */
+    uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */
+
+    /* CREATE THE IDENTITY MATRIX */
+
+    pDstT1 = pDst;
+    for (k = 0; k < m; k++) {
+        core_id = absolute_core_id - ((n * k) / 4) % N_BANKS;
+        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
+        for (j = core_id * 4; j < m; j += 4 * NUM_CORES) {
+            /* Fill (up to) four consecutive columns of row k; stay inside the row */
+            for (i = 0; (i < 4U) && ((j + i) < m); i++) {
+                pDstT1[k * m + j + i] = (uint32_t) (k == (j + i));
+            }
+        }
+    }
+    mempool_log_barrier(2, absolute_core_id);
+
+    /* Loop over the number of columns of the input matrix. */
+    loopCnt = n;
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U) {
+
+        pSrcT1 = pSrc + (l * n);
+        pDstT1 = pDst + (l * n);
+        core_id = absolute_core_id - ((l * n) / 4) % N_BANKS;
+        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
+        in = *pSrcT1;
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO */
+
+        if (in == 0U) {
+
+            if (absolute_core_id == 0) {
+                /* Forget the offset of an earlier exchange: no replacement row must read as singular */
+                *flag = 0U;
+                for (k = 1U; k < m - l; k++) {
+                    pSrcT2 = pSrcT1 + k * n;
+                    if (*pSrcT2 != 0) {
+                        *flag = k;
+                        break;
+                    }
+                }
+            }
+            mempool_log_barrier(2, absolute_core_id);
+
+            /* EXCHANGE */
+
+            if (*flag != 0U) {
+                pSrcT2 = pSrcT1 + (n * *flag);
+                pDstT2 = pDstT1 + (n * *flag);
+
+                /* Loop over columns to the right of pivot */
+                //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
+                //while (j < 4 * ((n - l) >> 2U)) {
+                for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) {
+                    Xchg1 = pSrcT2[j];
+                    Xchg2 = pSrcT2[j + 1];
+                    Xchg3 = pSrcT2[j + 2];
+                    Xchg4 = pSrcT2[j + 3];
+                    out1 = pSrcT1[j];
+                    out2 = pSrcT1[j + 1];
+                    out3 = pSrcT1[j + 2];
+                    out4 = pSrcT1[j + 3];
+                    pSrcT2[j] = out1;
+                    pSrcT2[j + 1] = out2;
+                    pSrcT2[j + 2] = out3;
+                    pSrcT2[j + 3] = out4;
+                    pSrcT1[j] = Xchg1;
+                    pSrcT1[j + 1] = Xchg2;
+                    pSrcT1[j + 2] = Xchg3;
+                    pSrcT1[j + 3] = Xchg4;
+                    // j += 4 * NUM_CORES;
+                }
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * ((n - l) >> 2U);
+                    while (j < n - l) {
+                        Xchg1 = pSrcT2[j];
+                        pSrcT2[j] = pSrcT1[j];
+                        pSrcT1[j] = Xchg1;
+                        j++;
+                    }
+                }
+                /* Loop over columns */
+                for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) {
+                    Xchg1 = pDstT2[j];
+                    Xchg2 = pDstT2[j + 1];
+                    Xchg3 = pDstT2[j + 2];
+                    Xchg4 = pDstT2[j + 3];
+                    out1 = pDstT1[j];
+                    out2 = pDstT1[j + 1];
+                    out3 = pDstT1[j + 2];
+                    out4 = pDstT1[j + 3];
+                    pDstT2[j] = out1;
+                    pDstT2[j + 1] = out2;
+                    pDstT2[j + 2] = out3;
+                    pDstT2[j + 3] = out4;
+                    pDstT1[j] = Xchg1;
+                    pDstT1[j + 1] = Xchg2;
+                    pDstT1[j + 2] = Xchg3;
+                    pDstT1[j + 3] = Xchg4;
+                }
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * (n >> 2U);
+                    while (j < n) {
+                        Xchg1 = pDstT2[j];
+                        pDstT2[j] = pDstT1[j];
+                        pDstT1[j] = Xchg1;
+                        j++;
+                    }
+                }
+            }
+            mempool_log_barrier(2, absolute_core_id);
+        }
+        /* Update the status if the matrix is singular */
+        if ((*flag == 0U) && (in == 0U)) {
+            return 1;
+        }
+
+
+        /* DIVIDE BY THE PIVOT */
+
+        /* Points to the pivot row of input and destination matrices */
+        pPivotRowIn = pSrc + (l * n);
+        pPivotRowDst = pDst + (l * n);
+
+        /* Temporary pointers to the pivot row pointers */
+        pSrcT1 = pPivotRowIn;
+        pSrcT2 = pPivotRowDst;
+        /* Pivot element of the row */
+        in = *pPivotRowIn;
+
+        ///* Loop over columns to the right of pivot */
+        // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
+        // while (j < 4 * ((n - l) >> 2U)) {
+        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+            in1 = pSrcT1[j];
+            in2 = pSrcT1[j + 1];
+            in3 = pSrcT1[j + 2];
+            in4 = pSrcT1[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT1[j] = out1;
+            pSrcT1[j + 1] = out2;
+            pSrcT1[j + 2] = out3;
+            pSrcT1[j + 3] = out4;
+            // j += NUM_CORES * 4;
+        }
+        if (core_id == (n >> 2U) - 1) {
+            j = 4 * ((n - l) >> 2U);
+            while (j < n - l) {
+                in1 = pSrcT1[j];
+                pSrcT1[j] = FIX_DIV(in1, in);
+                j++;
+            }
+        }
+        /* Loop over columns */
+        for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+            in1 = pSrcT2[j];
+            in2 = pSrcT2[j + 1];
+            in3 = pSrcT2[j + 2];
+            in4 = pSrcT2[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT2[j] = out1;
+            pSrcT2[j + 1] = out2;
+            pSrcT2[j + 2] = out3;
+            pSrcT2[j + 3] = out4;
+        }
+        if (core_id == (n >> 2U) - 1) {
+            j = 4 * (n >> 2U);
+            while (j < n) {
+                in1 = pSrcT2[j];
+                pSrcT2[j] = FIX_DIV(in1, in);
+                j++;
+            }
+        }
+        mempool_log_barrier(2, absolute_core_id);
+
+
+        /* REPLACE ROWS */
+        core_id = absolute_core_id;
+        pSrcT1 = pSrc;
+        pSrcT2 = pDst;
+        /* Loop over rows */
+//        for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+//            i = 0U;
+//            while (i < 4) {
+//                if ((i + k) != l) {
+//                    pSrcT1 = pSrc + (i + k) * n;
+//                    pSrcT2 = pDst + (i + k) * n;
+//                    /* Element of the reference row */
+//                    in = *pSrcT1;
+//                    pPRT_in = pPivotRowIn;
+//                    pPRT_pDst = pPivotRowDst;
+//                    /* Loop over columns to the right of pivot */
+//                    j = 0;
+//                    while (j < 4 * ((n - l) >> 2U)) {
+//                        in1 = pSrcT1[j];
+//                        in2 = pSrcT1[j + 1];
+//                        in3 = pSrcT1[j + 2];
+//                        in4 = pSrcT1[j + 3];
+//                        out1 = pPRT_in[j];
+//                        out2 = pPRT_in[j + 1];
+//                        out3 = pPRT_in[j + 2];
+//                        out4 = pPRT_in[j + 3];
+//                        pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+//                        pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+//                        pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+//                        pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+//                        j += 4;
+//                    }
+//                    while (j < n - l) {
+//                        in1 = pSrcT1[j];
+//                        out1 = pPRT_in[j];
+//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                        j++;
+//                    }
+//                    /* Loop over columns */
+//                    j = 0;
+//                    while (j < 4 * (n >> 2U)) {
+//                        in1 = pSrcT2[j];
+//                        in2 = pSrcT2[j + 1];
+//                        in3 = pSrcT2[j + 2];
+//                        in4 = pSrcT2[j + 3];
+//                        out1 = pPRT_pDst[j];
+//                        out2 = pPRT_pDst[j + 1];
+//                        out3 = pPRT_pDst[j + 2];
+//                        out4 = pPRT_pDst[j + 3];
+//                        pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+//                        pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+//                        pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+//                        pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+//                        j += 4;
+//                    }
+//                    while (j < n) {
+//                        in1 = pSrcT2[j];
+//                        out1 = pPRT_pDst[j];
+//                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+//                        j++;
+//                    }
+//                }
+//                i++;
+//            }
+//        }
+//        mempool_log_barrier(2, absolute_core_id);
+        /* NOTE(review): 'in' is read from pSrcT1[0], which the core with core_id 0 rewrites below with no barrier in between - confirm ordering */
+        for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
+            dump_i(absolute_core_id);
+            if (k != l) {
+                pSrcT1 = pSrc + k * n;
+                pSrcT2 = pDst + k * n;
+                core_id = absolute_core_id % (n >> 2U);
+                /* Element of the reference row */
+                in = *pSrcT1;
+                pPRT_in = pPivotRowIn;
+                pPRT_pDst = pPivotRowDst;
+                j = core_id * 4;
+                while (j < 4 * ((n - l) >> 2U)) {
+                    in1 = pSrcT1[j];
+                    in2 = pSrcT1[j + 1];
+                    in3 = pSrcT1[j + 2];
+                    in4 = pSrcT1[j + 3];
+                    out1 = pPRT_in[j];
+                    out2 = pPRT_in[j + 1];
+                    out3 = pPRT_in[j + 2];
+                    out4 = pPRT_in[j + 3];
+                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+                    j += 4 * NUM_CORES;
+                }
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * ((n - l) >> 2U);
+                    while (j < n - l) {
+                        in1 = pSrcT1[j];
+                        out1 = pPRT_in[j];
+                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
+                }
+                /* Loop over columns */
+                j = core_id * 4;
+                while (j < 4 * (n >> 2U)) {
+                    in1 = pSrcT2[j];
+                    in2 = pSrcT2[j + 1];
+                    in3 = pSrcT2[j + 2];
+                    in4 = pSrcT2[j + 3];
+                    out1 = pPRT_pDst[j];
+                    out2 = pPRT_pDst[j + 1];
+                    out3 = pPRT_pDst[j + 2];
+                    out4 = pPRT_pDst[j + 3];
+                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+                    j += 4 * NUM_CORES;
+                }
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * (n >> 2U);
+                    while (j < n) {
+                        in1 = pSrcT2[j];
+                        out1 = pPRT_pDst[j];
+                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
+                }
+            }
+        }
+        mempool_log_barrier(2, absolute_core_id);
+
+        pSrc++;     /* Increment the input pointer */
+        loopCnt--;  /* Decrement the loop counter */
+        l++;        /* Increment the index modifier */
+    }
+    mempool_log_barrier(2, absolute_core_id);
+
+//    if ((flag != 1U) && (x == 0)) {
+//        for (i = 0; i < m * n; i++) {
+//            if (pSrc[i] != 0)
+//                break;
+//        }
+//        if (i == m * n)
+//            return 1;
+//    }
+    return 0;
+}

From a045b42cae6eb0abc26aa7e1b2e589ebfd3febba Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 2 Aug 2022 18:20:46 +0200
Subject: [PATCH 12/22] [software] Add folded kernel

---
 software/apps/mat_inv/initialization.h        |  35 +++
 software/apps/mat_inv/main.c                  |  86 ++++--
 .../mat_inv/mempool_mat_inv_q32p_folded.h     | 287 ++++++++++++++++++
 3 files changed, 380 insertions(+), 28 deletions(-)
 create mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_folded.h

diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h
index c8c874ea3..ec330e766 100644
--- a/software/apps/mat_inv/initialization.h
+++ b/software/apps/mat_inv/initialization.h
@@ -9,6 +9,16 @@
 #define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
 #define MIN(a,b) (a < b ? a : b)
 
+dump(l, 1);        /* NOTE(review): 'dump' is defined outside this file; presumably each line declares a dump_<name>() helper - confirm */
+dump(loopCnt, 2);
+dump(i, 3);        /* presumably provides the dump_i() called from mempool_GJinv_q32p_memsized - confirm */
+
+void display(int32_t *A, int32_t n, int32_t m);
+
+#ifdef FOLDED
+void display_folded(int32_t *A, int32_t n, int32_t m);
+#endif
+
 void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
 
 void MatrixMult(int32_t *matrix_1,  int32_t *matrix_2,  int32_t *matrix_product, int32_t n, int32_t m, int32_t o);
@@ -17,6 +27,31 @@ void init_matrix(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, int3
 
 void init_matrix_zeros(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id);
 
+/*
+ * Print the first n * m words of A, one per line, each tagged with its
+ * linear index.
+ */
+void display(int32_t *A, int32_t n, int32_t m) {
+    int32_t const total = n * m;
+    int32_t idx = 0;
+    /* The matrix is dumped as a flat list rather than as a 2D grid. */
+    while (idx < total) {
+      printf("Output[%d] = %8d\n", idx, A[idx]);
+      idx++;
+    }
+}
+
+#ifdef FOLDED
+void display_folded(int32_t *A, int32_t n, int32_t m) {
+    /* Walk the logical n * m words and translate each to its folded address. */
+    for (int32_t idx = 0; idx < n * m; idx++) {
+      int32_t const row_off = (idx / n) * n;  /* dense offset of the row start */
+      int32_t const base = N_BANKS * (row_off / N_USED_BANKS) + row_off % N_USED_BANKS;
+      int32_t const col = idx % n;
+      printf("Output[%d] = %8d\n", idx, A[base + col]);
+    }
+}
+#endif
 
 void Transpose(int32_t *matrix,  int32_t *t_matrix, int32_t n, int32_t m) {
   int32_t i, j;
diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index a00f66b99..f39cd6ac8 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -4,47 +4,39 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
-//#include <stdint.h>
-//#include <string.h>
-
-#define N 16
-#define M 16
-#define O 16
-#define N_BANKS (1024)
-
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "initialization.h"
-#include "mempool_mat_inv_q32p.h"
-#include "mempool_mat_inv_q32p_memsized.h"
-#include "mempool_mat_inv_q32s.h"
-
+#define N 16
+#define M 16
+#define O 16
+#define N_BANKS (1024)
+#define N_USED_BANKS (64)
 
 #define VERBOSE
 // #define SINGLE
 // #define PARALLEL
 #define MEMSIZED
+// #define FOLDED
 
+#include "initialization.h"
+#include "mempool_mat_inv_q32s.h"
+#include "mempool_mat_inv_q32p.h"
+#include "mempool_mat_inv_q32p_memsized.h"
+#include "mempool_mat_inv_q32p_folded.h"
+
+#ifdef FOLDED
+int32_t matrix[N * M]                                       __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)]   __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)]             __attribute__((aligned(N_BANKS), section(".l1")));
+uint32_t flag                                               __attribute__((section(".l1")));
+#else
 int32_t matrix[N * M]         __attribute__((aligned(N), section(".l1")));
 int32_t inv[M * M]            __attribute__((aligned(N), section(".l1")));
 uint32_t flag                 __attribute__((section(".l1")));
-
-void display(int32_t *A, int32_t n, int32_t m) {
-    //int32_t i, j;
-    //for (i = 0; i < n; i++) {
-    //  for (j = 0; j < m; j++) {
-    //    printf("%8d ", A[i * m + j]);
-    //  }
-    //  printf("\n");
-    //}
-    int32_t i;
-    for (i = 0; i < n * m; i++) {
-      printf("Output[%d] = %8d\n", i, A[i]);
-    }
-}
+#endif
 
 // Driver program
 void single_core()
@@ -109,7 +101,7 @@ void multi_core_memsized()
     mempool_barrier_init(core_id);
 
     init_matrix(matrix, N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(inv, M, M, core_id);
+    init_matrix_zeros(inv, N, M, core_id);
     if (core_id == 0) {
         flag = 0U;
     }
@@ -127,6 +119,42 @@ void multi_core_memsized()
     mempool_barrier(num_cores);
 }
 
+#ifdef FOLDED
+/* Driver for the folded kernel: set up the operands, fold the input and let
+ * the first N_USED_BANKS / 4 cores invert it. */
+void multi_core_folded()
+{
+    uint32_t const id = mempool_get_core_id();
+    uint32_t const total_cores = mempool_get_core_count();
+    uint32_t const workers = N_USED_BANKS >> 2U;
+    uint32_t const folded_rows = (N * M) / N_USED_BANKS;
+    mempool_barrier_init(id);
+
+    /* Operand setup; each call receives the caller's core id */
+    init_matrix(matrix, N, M, -156, 427, -219, id);
+    init_matrix_zeros(folded_matrix, folded_rows, N_BANKS, id);
+    init_matrix_zeros(inv, folded_rows, N_BANKS, id);
+    if (id == 0)
+        flag = 0U;
+    mempool_barrier(total_cores);
+
+    /* The folding step is benchmarked on its own */
+    mempool_start_benchmark();
+    fold_matrix(matrix, folded_matrix, N);
+    mempool_stop_benchmark();
+    if (id < workers) {
+        mempool_start_benchmark();
+        mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, workers);
+        mempool_stop_benchmark();
+    }
+    mempool_barrier(total_cores);
+    #ifdef VERBOSE
+    if (id == 0) display_folded(inv, M, N);
+    #endif
+    mempool_barrier(total_cores);
+}
+#endif
+
 int main() {
     #if defined(SINGLE)
     single_core();
@@ -134,6 +162,8 @@ int main() {
     multi_core();
     #elif defined(MEMSIZED)
     multi_core_memsized();
+    #elif defined(FOLDED)
+    multi_core_folded();
     #endif
     return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
new file mode 100644
index 000000000..5dc0aefc8
--- /dev/null
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
@@ -0,0 +1,287 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* GAUSS JORDAN INVERSION */
+
+int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE);
+void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n);
+
+
+void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n) {
+    /* Copy the dense n x n matrix into the folded layout, four words per step. */
+    uint32_t const id = mempool_get_core_id();
+    for (uint32_t idx = id * 4; idx < n * n; idx += NUM_CORES * 4) {
+        uint32_t const row_start = (idx / n) * n; /* dense word offset of the row */
+        /* Dense offsets are cut into N_USED_BANKS-word pieces placed N_BANKS words apart */
+        uint32_t const base = N_BANKS * (row_start / N_USED_BANKS) +
+                              row_start % N_USED_BANKS + idx % n;
+        for (uint32_t w = 0; w < 4; w++) {
+            pDst[base + w] = pSrc[idx + w];
+        }
+    }
+    mempool_log_barrier(2, id);
+}
+
+int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE) {
+    /* Gauss-Jordan inversion on the folded layout, run by nPE cores; returns 1 if singular, else 0; *flag is set to 1 on a row exchange */
+    int32_t volatile *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
+    int32_t volatile *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
+    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
+    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+    int32_t in = 0;
+    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+    int32_t in1, in2, in3, in4;
+    int32_t out1, out2, out3, out4;
+
+    uint32_t absolute_core_id = mempool_get_core_id();
+    uint32_t core_id = absolute_core_id;
+    uint32_t shift = 0;
+    uint32_t i, j, k, l;  /* loop counters */
+    uint32_t m = n;       /* M is the number of rows. However, the matrices must be square. */
+
+    /* CREATE THE IDENTITY MATRIX */
+    pDstT1 = pDst;
+    for (i = core_id * 4; i < n * m; i += nPE * 4) {
+        k = i / n;
+        j = i % n;
+        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+        pDstT1[shift + j] = (uint32_t) (k == j);
+        pDstT1[shift + j + 1] = (uint32_t) (k == (j + 1));
+        pDstT1[shift + j + 2] = (uint32_t) (k == (j + 2));
+        pDstT1[shift + j + 3] = (uint32_t) (k == (j + 3));
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+    while (l < n) {
+
+        shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+        pSrcT1 = pSrc + shift;
+        pDstT1 = pDst + shift;
+        in = *pSrcT1;
+
+        /* CHECK IF PIVOT ELEMENT IS ZERO */
+        if (absolute_core_id == 0) {
+            if (in == 0U) {
+                /* Loop over the rows present below */
+                for (k = l + 1U; k < m; k++) {
+                    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+                    pSrcT2 = pSrc + shift;
+                    pDstT2 = pDst + shift;
+                    /* EXCHANGE */
+                    if (*pSrcT2 != 0) {
+                        /* Loop over columns to the right of the pivot */
+                        j = 0;
+                        while (j < 4 * ((n - l) >> 2U)) {
+                            Xchg1 = pSrcT2[j];
+                            Xchg2 = pSrcT2[j + 1];
+                            Xchg3 = pSrcT2[j + 2];
+                            Xchg4 = pSrcT2[j + 3];
+                            out1 = pSrcT1[j];
+                            out2 = pSrcT1[j + 1];
+                            out3 = pSrcT1[j + 2];
+                            out4 = pSrcT1[j + 3];
+                            pSrcT2[j] = out1;
+                            pSrcT2[j + 1] = out2;
+                            pSrcT2[j + 2] = out3;
+                            pSrcT2[j + 3] = out4;
+                            pSrcT1[j] = Xchg1;
+                            pSrcT1[j + 1] = Xchg2;
+                            pSrcT1[j + 2] = Xchg3;
+                            pSrcT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n - l) {
+                            Xchg1 = pSrcT2[j];
+                            pSrcT2[j] = pSrcT1[j];
+                            pSrcT1[j] = Xchg1;
+                            j++;
+                        }
+                        /* Loop over columns */
+                        j = 0;
+                        while (j < 4 * (n >> 2U)) {
+                            Xchg1 = pDstT2[j];
+                            Xchg2 = pDstT2[j + 1];
+                            Xchg3 = pDstT2[j + 2];
+                            Xchg4 = pDstT2[j + 3];
+                            out1 = pDstT1[j];
+                            out2 = pDstT1[j + 1];
+                            out3 = pDstT1[j + 2];
+                            out4 = pDstT1[j + 3];
+                            pDstT2[j] = out1;
+                            pDstT2[j + 1] = out2;
+                            pDstT2[j + 2] = out3;
+                            pDstT2[j + 3] = out4;
+                            pDstT1[j] = Xchg1;
+                            pDstT1[j + 1] = Xchg2;
+                            pDstT1[j + 2] = Xchg3;
+                            pDstT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n) {
+                            Xchg1 = pDstT2[j];
+                            pDstT2[j] = pDstT1[j];
+                            pDstT1[j] = Xchg1;
+                            j++;
+                        }
+                        *flag = 1U;
+                        break;
+                    }
+                }
+            }
+        }
+        mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+        /* DIVIDE BY THE PIVOT */
+        /* Points to the pivot row of input and destination matrices */
+        shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+        pPivotRowIn = pSrc + shift;
+        pPivotRowDst = pDst + shift;
+        /* Temporary pointers to the pivot row pointers */
+        pSrcT1 = pPivotRowIn;
+        pSrcT2 = pPivotRowDst;
+        /* Pivot element of the row */
+        in = *pPivotRowIn;
+        /* Pivot still zero after the exchange step: singular. Every core leaves here, so none waits at a barrier */
+        if (in == 0) {
+            return 1;
+        }
+
+        /* Loop over columns to the right of pivot */
+        core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
+        core_id = core_id > nPE ? core_id + nPE : core_id;
+        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
+            in1 = pSrcT1[j];
+            in2 = pSrcT1[j + 1];
+            in3 = pSrcT1[j + 2];
+            in4 = pSrcT1[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT1[j] = out1;
+            pSrcT1[j + 1] = out2;
+            pSrcT1[j + 2] = out3;
+            pSrcT1[j + 3] = out4;
+        }
+        if (core_id == 0) {
+            j = 4 * ((n - l) >> 2U);
+            while (j < n - l) {
+                in1 = pSrcT1[j];
+                pSrcT1[j] = FIX_DIV(in1, in);
+                j++;
+            }
+        }
+
+        /* Loop over columns */
+        core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
+        core_id = core_id > nPE ? core_id + nPE : core_id;
+        for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
+            in1 = pSrcT2[j];
+            in2 = pSrcT2[j + 1];
+            in3 = pSrcT2[j + 2];
+            in4 = pSrcT2[j + 3];
+            out1 = FIX_DIV(in1, in);
+            out2 = FIX_DIV(in2, in);
+            out3 = FIX_DIV(in3, in);
+            out4 = FIX_DIV(in4, in);
+            pSrcT2[j] = out1;
+            pSrcT2[j + 1] = out2;
+            pSrcT2[j + 2] = out3;
+            pSrcT2[j + 3] = out4;
+        }
+        if (core_id == (n >> 2U) - 1) {
+            j = 4 * (n >> 2U);
+            while (j < n) {
+                in1 = pSrcT2[j];
+                pSrcT2[j] = FIX_DIV(in1, in);
+                j++;
+            }
+        }
+        mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+        /* REPLACE ROWS */
+        pSrcT1 = pSrc;
+        pSrcT2 = pDst;
+        for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
+            /* Only the columns to the right of the pivot are to be processed */
+            if (k != l) {
+                shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+                pSrcT1 = pSrc + shift;
+                pSrcT2 = pDst + shift;
+                /* Element of the reference row */
+                in = *pSrcT1;
+                /* Reference row pointers */
+                pPRT_in = pPivotRowIn;
+                pPRT_pDst = pPivotRowDst;
+                /* Loop over the columns */
+                core_id = absolute_core_id % (n >> 2U);
+                core_id = core_id - (l >> 2U);
+                j = core_id * 4;
+                while (j < 4 * ((n - l) >> 2U)) {
+                    out1 = pPRT_in[j];
+                    out2 = pPRT_in[j + 1];
+                    out3 = pPRT_in[j + 2];
+                    out4 = pPRT_in[j + 3];
+                    in1 = pSrcT1[j];
+                    in2 = pSrcT1[j + 1];
+                    in3 = pSrcT1[j + 2];
+                    in4 = pSrcT1[j + 3];
+                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+                    j += 4 * (n >> 2U);
+                }
+                if (core_id == 0) {
+                    j = 4 * ((n - l) >> 2U);
+                    while (j < n - l) {
+                        in1 = pSrcT1[j];
+                        out1 = pPRT_in[j];
+                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
+                }
+                core_id = absolute_core_id % (n >> 2U);
+                /* Loop over the columns */
+                j = core_id * 4;
+                while (j < 4 * (n >> 2U)) {
+                    out1 = pPRT_pDst[j];
+                    out2 = pPRT_pDst[j + 1];
+                    out3 = pPRT_pDst[j + 2];
+                    out4 = pPRT_pDst[j + 3];
+                    in1 = pSrcT2[j];
+                    in2 = pSrcT2[j + 1];
+                    in3 = pSrcT2[j + 2];
+                    in4 = pSrcT2[j + 3];
+                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+                    j += 4 * (n >> 2U);
+                }
+                if (core_id == (n >> 2U) - 1) {
+                    j = 4 * (n >> 2U);
+                    while (j < n) {
+                        in1 = pSrcT2[j];
+                        out1 = pPRT_pDst[j];
+                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+                        j++;
+                    }
+                }
+            }
+        }
+        mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+        pSrc++;     /* Increment the input pointer */
+        l++;        /* Increment the index modifier */
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    return 0;
+}

From 4dca2cf15e00e3b486adffe9591dedde9b79edbb Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 3 Aug 2022 11:00:33 +0200
Subject: [PATCH 13/22] [software] Let single core handle exchange in parallel
 implementation

---
 software/apps/mat_inv/mempool_mat_inv_q32p.h | 165 ++++++++-----------
 1 file changed, 67 insertions(+), 98 deletions(-)

diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
index 320fe709b..952d06fc4 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h
@@ -6,10 +6,6 @@
 
 /* GAUSS JORDAN INVERSION */
 
-dump(l, 1);
-dump(loopCnt, 2);
-dump(i, 3);
-
 int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
 
 int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
@@ -53,107 +49,82 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         in = *pSrcT1;
 
         /* CHECK IF PIVOT ELEMENT IS ZERO */
-
-        if (in == 0U) {
-
-            if (core_id == 0) {
-                k = 1U;
-                while (k < m - l) {
-                    pSrcT2 = pSrcT1 + k * n;
+        if (core_id == 0) {
+            if (in == 0U) {
+                /* Loop over the rows present below */
+                for (k = l + 1U; k < m; k++) {
+                    pSrcT2 = pSrc + (n * k);
+                    pDstT2 = pDst + (n * k);
+                    /* EXCHANGE */
                     if (*pSrcT2 != 0) {
-                        *flag = k;
+                        /* Loop over colums to the right of the pivot */
+                        j = 0;
+                        while (j < 4 * ((n - l) >> 2U)) {
+                            Xchg1 = pSrcT2[j];
+                            Xchg2 = pSrcT2[j + 1];
+                            Xchg3 = pSrcT2[j + 2];
+                            Xchg4 = pSrcT2[j + 3];
+                            out1 = pSrcT1[j];
+                            out2 = pSrcT1[j + 1];
+                            out3 = pSrcT1[j + 2];
+                            out4 = pSrcT1[j + 3];
+                            pSrcT2[j] = out1;
+                            pSrcT2[j + 1] = out2;
+                            pSrcT2[j + 2] = out3;
+                            pSrcT2[j + 3] = out4;
+                            pSrcT1[j] = Xchg1;
+                            pSrcT1[j + 1] = Xchg2;
+                            pSrcT1[j + 2] = Xchg3;
+                            pSrcT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n - l) {
+                            Xchg1 = pSrcT2[j];
+                            pSrcT2[j] = pSrcT1[j];
+                            pSrcT1[j] = Xchg1;
+                            j++;
+                        }
+                        /* Loop over colums */
+                        j = 0;
+                        while (j < 4 * (n >> 2U)) {
+                            Xchg1 = pDstT2[j];
+                            Xchg2 = pDstT2[j + 1];
+                            Xchg3 = pDstT2[j + 2];
+                            Xchg4 = pDstT2[j + 3];
+                            out1 = pDstT1[j];
+                            out2 = pDstT1[j + 1];
+                            out3 = pDstT1[j + 2];
+                            out4 = pDstT1[j + 3];
+                            pDstT2[j] = out1;
+                            pDstT2[j + 1] = out2;
+                            pDstT2[j + 2] = out3;
+                            pDstT2[j + 3] = out4;
+                            pDstT1[j] = Xchg1;
+                            pDstT1[j + 1] = Xchg2;
+                            pDstT1[j + 2] = Xchg3;
+                            pDstT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n) {
+                            Xchg1 = pDstT2[j];
+                            pDstT2[j] = pDstT1[j];
+                            pDstT1[j] = Xchg1;
+                            j++;
+                        }
+                        *flag = 1U;
                         break;
                     }
-                    k++;
                 }
             }
-            mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-
-            //k = 1U + core_id;
-            //while ((k < m - l) && (*flag == 0)) {
-            //    pSrcT2 = pSrcT1 + k * n;
-            //    if (*pSrcT2 != 0) {
-            //        __atomic_store_n(flag, k, __ATOMIC_RELAXED);
-            //    }
-            //    k += MIN(n / 4, NUM_CORES);
-            //    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-            //}
-
-            /* EXCHANGE */
-
-            if (*flag != 0U) {
-                pSrcT2 = pSrcT1 + (n * *flag);
-                pDstT2 = pDstT1 + (n * *flag);
-                /* Loop over columns to the right of pivot */
-                //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
-                //while (j < 4 * ((n - l) >> 2U)) {
-                for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) {
-                    Xchg1 = pSrcT2[j];
-                    Xchg2 = pSrcT2[j + 1];
-                    Xchg3 = pSrcT2[j + 2];
-                    Xchg4 = pSrcT2[j + 3];
-                    out1 = pSrcT1[j];
-                    out2 = pSrcT1[j + 1];
-                    out3 = pSrcT1[j + 2];
-                    out4 = pSrcT1[j + 3];
-                    pSrcT2[j] = out1;
-                    pSrcT2[j + 1] = out2;
-                    pSrcT2[j + 2] = out3;
-                    pSrcT2[j + 3] = out4;
-                    pSrcT1[j] = Xchg1;
-                    pSrcT1[j + 1] = Xchg2;
-                    pSrcT1[j + 2] = Xchg3;
-                    pSrcT1[j + 3] = Xchg4;
-                    // j += 4 * NUM_CORES;
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * ((n - l) >> 2U);
-                    while (j < n - l) {
-                        Xchg1 = pSrcT2[j];
-                        pSrcT2[j] = pSrcT1[j];
-                        pSrcT1[j] = Xchg1;
-                        j++;
-                    }
-                }
-                /* Loop over columns */
-                for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) {
-                    Xchg1 = pDstT2[j];
-                    Xchg2 = pDstT2[j + 1];
-                    Xchg3 = pDstT2[j + 2];
-                    Xchg4 = pDstT2[j + 3];
-                    out1 = pDstT1[j];
-                    out2 = pDstT1[j + 1];
-                    out3 = pDstT1[j + 2];
-                    out4 = pDstT1[j + 3];
-                    pDstT2[j] = out1;
-                    pDstT2[j + 1] = out2;
-                    pDstT2[j + 2] = out3;
-                    pDstT2[j + 3] = out4;
-                    pDstT1[j] = Xchg1;
-                    pDstT1[j + 1] = Xchg2;
-                    pDstT1[j + 2] = Xchg3;
-                    pDstT1[j + 3] = Xchg4;
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * (n >> 2U);
-                    while (j < n) {
-                        Xchg1 = pDstT2[j];
-                        pDstT2[j] = pDstT1[j];
-                        pDstT1[j] = Xchg1;
-                        j++;
-                    }
-                }
+            /* Update the status if the matrix is singular */
+            if ((*flag == 0U) && (in == 0U)) {
+                return 1;
             }
-            mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-        }
-        /* Update the status if the matrix is singular */
-        if ((*flag == 0U) && (in == 0U)) {
-            return 1;
         }
+        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
 
         /* DIVIDE BY THE PIVOT */
-
         /* Points to the pivot row of input and destination matrices */
         pPivotRowIn = pSrc + (l * n);
         pPivotRowDst = pDst + (l * n);
@@ -164,8 +135,6 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         in = *pPivotRowIn;
 
         ///* Loop over columns to the right of pivot */
-        // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
-        // while (j < 4 * ((n - l) >> 2U)) {
         for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
             in1 = pSrcT1[j];
             in2 = pSrcT1[j + 1];
@@ -216,7 +185,6 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
 
 
         /* REPLACE ROWS */
-
         pSrcT1 = pSrc;
         pSrcT2 = pDst;
         /* Loop over rows */
@@ -282,6 +250,7 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla
         }
         mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
+//        /* REPLACE ROWS */
 //        pSrcT1 = pSrc;
 //        pSrcT2 = pDst;
 //        /* Loop over rows */

From 0ca5b681ebe3d6281d20094a7540f491abddead3 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 3 Aug 2022 11:01:34 +0200
Subject: [PATCH 14/22] [software] Add code for unrolling in single-core

---
 software/apps/mat_inv/mempool_mat_inv_q32s.h | 111 ++++++++++++-------
 1 file changed, 70 insertions(+), 41 deletions(-)

diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h
index 70fff05a2..21aadbe39 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32s.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h
@@ -20,37 +20,24 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
     int32_t in1, in2, in3, in4;
     int32_t out1, out2, out3, out4;
 
-    uint32_t i, rowCnt, j, loopCnt, k, l;        /* loop counters */
-    uint32_t flag;
-    uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
+    uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */
+    uint32_t i, j, k, l; /* loop counters */
+    uint32_t flag = 0U; /* Flag to check if the matrix is singular */
 
     pDstT1 = pDst;  /* Working pointer for destination matrix */
-    rowCnt = m;     /* Loop over the number of rows */
-    flag = 0U;
-
     /* CREATE THE IDENTITY MATRIX */
-
-    while (rowCnt > 0U) {
-        j = m - rowCnt;
-        while (j > 0U) {
-            *pDstT1++ = 0;
-            j--;
-        }
-        *pDstT1++ = 1;
-        j = rowCnt - 1U;
-        while (j > 0U) {
-            *pDstT1++ = 0;
-            j--;
+    for (k = 0; k < m; k += 4) {
+        for (j = 0; j < n; j++) {
+            pDstT1[k * m + j] = (uint32_t) (k == j);
+            pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j);
+            pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j);
+            pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j);
         }
-        rowCnt--;
     }
 
-    /* Loop over the number of columns of the input matrix. */
-    loopCnt = n;
     /* Index modifier to navigate through the columns */
     l = 0U;
-
-    while (loopCnt > 0U) {
+    while (l < n) {
 
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
@@ -166,6 +153,32 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
             *pSrcT1++ = FIX_DIV(in1, in);
             j++;
         }
+        //switch ((n - l) % 4) {
+        //    case 3:
+        //        in1 = *pSrcT1;
+        //        in2 = *(pSrcT1 + 1);
+        //        in3 = *(pSrcT1 + 2);
+        //        out1 = FIX_DIV(in1, in);
+        //        out2 = FIX_DIV(in2, in);
+        //        out3 = FIX_DIV(in3, in);
+        //        *pSrcT1++ = out1;
+        //        *pSrcT1++ = out2;
+        //        *pSrcT1++ = out3;
+        //        break;
+        //    case 2:
+        //        in1 = *pSrcT1;
+        //        in2 = *(pSrcT1 + 1);
+        //        out1 = FIX_DIV(in1, in);
+        //        out2 = FIX_DIV(in2, in);
+        //        *pSrcT1++ = out1;
+        //        *pSrcT1++ = out2;
+        //        break;
+        //    case 1:
+        //        in1 = *pSrcT1;
+        //        out1 = FIX_DIV(in1, in);
+        //        *pSrcT1++ = out1;
+        //        break;
+        //}
         /* Loop over number of columns of the destination matrix */
         j = 0;
         while (j < 4 * (n >> 2U)) {
@@ -207,20 +220,20 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
                 j = 0;
-                while (j < 2 * ((n - l) >> 1U)) {
+                while (j < 4 * ((n - l) >> 2U)) {
                     in1 = *pSrcT1;
                     in2 = *(pSrcT1 + 1);
-                    // in3 = *(pSrcT1 + 2);
-                    // in4 = *(pSrcT1 + 3);
+                    in3 = *(pSrcT1 + 2);
+                    in4 = *(pSrcT1 + 3);
                     out1 = *pPRT_in++;
                     out2 = *pPRT_in++;
-                    // out3 = *pPRT_in++;
-                    // out4 = *pPRT_in++;
+                    out3 = *pPRT_in++;
+                    out4 = *pPRT_in++;
                     *pSrcT1++ = in1 - FIX_MUL(in, out1);
                     *pSrcT1++ = in2 - FIX_MUL(in, out2);
-                    // *pSrcT1++ = in3 - FIX_MUL(in, out3);
-                    // *pSrcT1++ = in4 - FIX_MUL(in, out4);
-                    j += 2;
+                    *pSrcT1++ = in3 - FIX_MUL(in, out3);
+                    *pSrcT1++ = in4 - FIX_MUL(in, out4);
+                    j += 4;
                 }
                 while (j < n - l) {
                     in1 = *pSrcT1;
@@ -228,6 +241,32 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
                     *pSrcT1++ = in1 - FIX_MUL(in, out1);
                     j++;
                 }
+                //switch ((n - l) % 4) {
+                //    case 3:
+                //        in1 = *pSrcT1;
+                //        in2 = *(pSrcT1 + 1);
+                //        in3 = *(pSrcT1 + 2);
+                //        out1 = *pPRT_in++;
+                //        out2 = *pPRT_in++;
+                //        out3 = *pPRT_in++;
+                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+                //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
+                //        *pSrcT1++ = in3 - FIX_MUL(in, out3);
+                //        break;
+                //    case 2:
+                //        in1 = *pSrcT1;
+                //        in2 = *(pSrcT1 + 1);
+                //        out1 = *pPRT_in++;
+                //        out2 = *pPRT_in++;
+                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+                //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
+                //        break;
+                //    case 1:
+                //        in1 = *pSrcT1;
+                //        out1 = *pPRT_in++;
+                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+                //        break;
+                //}
                 /* Loop over the number of columns to
                    replace the elements in the destination matrix */
                 j = 0;
@@ -262,19 +301,9 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
         }
 
         pSrc++; /* Increment the input pointer */
-        loopCnt--; /* Decrement the loop counter */
         l++; /* Increment the index modifier */
     }
 
-//    if ((flag != 1U) && (in == 0)) {
-//        for (i = 0; i < m * n; i++) {
-//            if (pSrc[i] != 0)
-//                break;
-//        }
-//        if (i == m * n)
-//            return 1;
-//    }
-
     return 0;
 }
  

From b42e968713d47bc536c03518f5eb6741eca1e9bc Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 3 Aug 2022 11:02:43 +0200
Subject: [PATCH 15/22] [software] Add parallelization schemes in memory shaped
 version

---
 .../mat_inv/mempool_mat_inv_q32p_memsized.h   | 638 +++++++++++-------
 1 file changed, 409 insertions(+), 229 deletions(-)

diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
index 496459e19..961aefd58 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
@@ -22,167 +22,230 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
 
     uint32_t absolute_core_id = mempool_get_core_id();
     uint32_t core_id = absolute_core_id;
-    uint32_t i, j, loopCnt, k, l;  /* loop counters */
+    uint32_t i, j, k, l;  /* loop counters */
     uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
 
     /* CREATE THE IDENTITY MATRIX */
 
     pDstT1 = pDst;
-    for (k = 0; k < m; k++) {
-        core_id = absolute_core_id - ((n * k) / 4) % N_BANKS;
-        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
-        for (j = core_id * 4; j < m; j += 4 * NUM_CORES) {
-            pDstT1[k * m + j] = (uint32_t) (k == j);
-            pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j);
-            pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j);
-            pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j);
+    for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+        for (j = 0; j < n; j++) {
+            pDstT1[k * n + j] = (uint32_t) (k == j);
+            pDstT1[(k + 1) * n + j] = (uint32_t) ((k + 1) == j);
+            pDstT1[(k + 2) * n + j] = (uint32_t) ((k + 2) == j);
+            pDstT1[(k + 3) * n + j] = (uint32_t) ((k + 3) == j);
         }
     }
-    mempool_log_barrier(2, absolute_core_id);
+//    pDstT1 = pDst;
+//    for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) {
+//        k = i / n;
+//        j = i % n;
+//        pDstT1[k * n + j] = (uint32_t) (k == j);
+//        pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1));
+//        pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2));
+//        pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3));
+//    }
+//    mempool_log_barrier(2, absolute_core_id);
 
-    /* Loop over the number of columns of the input matrix. */
-    loopCnt = n;
     /* Index modifier to navigate through the columns */
     l = 0U;
-
-    while (loopCnt > 0U) {
+    while (l < n) {
 
         pSrcT1 = pSrc + (l * n);
         pDstT1 = pDst + (l * n);
-        core_id = absolute_core_id - ((l * n) / 4) % N_BANKS;
-        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
         in = *pSrcT1;
 
         /* CHECK IF PIVOT ELEMENT IS ZERO */
-
-        if (in == 0U) {
-
-            if (absolute_core_id == 0) {
-                k = 1U;
-                while (k < m - l) {
-                    pSrcT2 = pSrcT1 + k * n;
+        if (absolute_core_id == 0) {
+            if (in == 0U) {
+                /* Loop over the rows present below */
+                for (k = l + 1U; k < m; k++) {
+                    pSrcT2 = pSrc + (n * k);
+                    pDstT2 = pDst + (n * k);
+                    /* EXCHANGE */
                     if (*pSrcT2 != 0) {
-                        *flag = k;
+                        /* Loop over colums to the right of the pivot */
+                        j = 0;
+                        while (j < 4 * ((n - l) >> 2U)) {
+                            Xchg1 = pSrcT2[j];
+                            Xchg2 = pSrcT2[j + 1];
+                            Xchg3 = pSrcT2[j + 2];
+                            Xchg4 = pSrcT2[j + 3];
+                            out1 = pSrcT1[j];
+                            out2 = pSrcT1[j + 1];
+                            out3 = pSrcT1[j + 2];
+                            out4 = pSrcT1[j + 3];
+                            pSrcT2[j] = out1;
+                            pSrcT2[j + 1] = out2;
+                            pSrcT2[j + 2] = out3;
+                            pSrcT2[j + 3] = out4;
+                            pSrcT1[j] = Xchg1;
+                            pSrcT1[j + 1] = Xchg2;
+                            pSrcT1[j + 2] = Xchg3;
+                            pSrcT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n - l) {
+                            Xchg1 = pSrcT2[j];
+                            pSrcT2[j] = pSrcT1[j];
+                            pSrcT1[j] = Xchg1;
+                            j++;
+                        }
+                        /* Loop over colums */
+                        j = 0;
+                        while (j < 4 * (n >> 2U)) {
+                            Xchg1 = pDstT2[j];
+                            Xchg2 = pDstT2[j + 1];
+                            Xchg3 = pDstT2[j + 2];
+                            Xchg4 = pDstT2[j + 3];
+                            out1 = pDstT1[j];
+                            out2 = pDstT1[j + 1];
+                            out3 = pDstT1[j + 2];
+                            out4 = pDstT1[j + 3];
+                            pDstT2[j] = out1;
+                            pDstT2[j + 1] = out2;
+                            pDstT2[j + 2] = out3;
+                            pDstT2[j + 3] = out4;
+                            pDstT1[j] = Xchg1;
+                            pDstT1[j + 1] = Xchg2;
+                            pDstT1[j + 2] = Xchg3;
+                            pDstT1[j + 3] = Xchg4;
+                            j += 4;
+                        }
+                        while (j < n) {
+                            Xchg1 = pDstT2[j];
+                            pDstT2[j] = pDstT1[j];
+                            pDstT1[j] = Xchg1;
+                            j++;
+                        }
+                        *flag = 1U;
                         break;
                     }
-                    k++;
                 }
             }
-            mempool_log_barrier(2, absolute_core_id);
-
-            /* EXCHANGE */
-
-            if (*flag != 0U) {
-                pSrcT2 = pSrcT1 + (n * *flag);
-                pDstT2 = pDstT1 + (n * *flag);
-
-                /* Loop over columns to the right of pivot */
-                //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
-                //while (j < 4 * ((n - l) >> 2U)) {
-                for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) {
-                    Xchg1 = pSrcT2[j];
-                    Xchg2 = pSrcT2[j + 1];
-                    Xchg3 = pSrcT2[j + 2];
-                    Xchg4 = pSrcT2[j + 3];
-                    out1 = pSrcT1[j];
-                    out2 = pSrcT1[j + 1];
-                    out3 = pSrcT1[j + 2];
-                    out4 = pSrcT1[j + 3];
-                    pSrcT2[j] = out1;
-                    pSrcT2[j + 1] = out2;
-                    pSrcT2[j + 2] = out3;
-                    pSrcT2[j + 3] = out4;
-                    pSrcT1[j] = Xchg1;
-                    pSrcT1[j + 1] = Xchg2;
-                    pSrcT1[j + 2] = Xchg3;
-                    pSrcT1[j + 3] = Xchg4;
-                    // j += 4 * NUM_CORES;
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * ((n - l) >> 2U);
-                    while (j < n - l) {
-                        Xchg1 = pSrcT2[j];
-                        pSrcT2[j] = pSrcT1[j];
-                        pSrcT1[j] = Xchg1;
-                        j++;
-                    }
-                }
-                /* Loop over columns */
-                for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) {
-                    Xchg1 = pDstT2[j];
-                    Xchg2 = pDstT2[j + 1];
-                    Xchg3 = pDstT2[j + 2];
-                    Xchg4 = pDstT2[j + 3];
-                    out1 = pDstT1[j];
-                    out2 = pDstT1[j + 1];
-                    out3 = pDstT1[j + 2];
-                    out4 = pDstT1[j + 3];
-                    pDstT2[j] = out1;
-                    pDstT2[j + 1] = out2;
-                    pDstT2[j + 2] = out3;
-                    pDstT2[j + 3] = out4;
-                    pDstT1[j] = Xchg1;
-                    pDstT1[j + 1] = Xchg2;
-                    pDstT1[j + 2] = Xchg3;
-                    pDstT1[j + 3] = Xchg4;
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * (n >> 2U);
-                    while (j < n) {
-                        Xchg1 = pDstT2[j];
-                        pDstT2[j] = pDstT1[j];
-                        pDstT1[j] = Xchg1;
-                        j++;
-                    }
-                }
+            /* Update the status if the matrix is singular */
+            if ((*flag == 0U) && (in == 0U)) {
+                return 1;
             }
-            mempool_log_barrier(2, absolute_core_id);
-        }
-        /* Update the status if the matrix is singular */
-        if ((*flag == 0U) && (in == 0U)) {
-            return 1;
+            //    /* DIVIDE BY THE PIVOT */
+            //    /* Points to the pivot row of input and destination matrices */
+            //    pPivotRowIn = pSrc + (l * n);
+            //    pPivotRowDst = pDst + (l * n);
+            //    /* Temporary pointers to the pivot row pointers */
+            //    pSrcT1 = pPivotRowIn;
+            //    pSrcT2 = pPivotRowDst;
+            //    /* Pivot element of the row */
+            //    in = *pPivotRowIn;
+            //    /* Loop over number of columns to the right of the pilot element */
+            //    j = 0;
+            //    while (j < 4 * ((n - l) >> 2U)) {
+            //        in1 = *pSrcT1;
+            //        in2 = *(pSrcT1 + 1);
+            //        in3 = *(pSrcT1 + 2);
+            //        in4 = *(pSrcT1 + 3);
+            //        out1 = FIX_DIV(in1, in);
+            //        out2 = FIX_DIV(in2, in);
+            //        out3 = FIX_DIV(in3, in);
+            //        out4 = FIX_DIV(in4, in);
+            //        *pSrcT1++ = out1;
+            //        *pSrcT1++ = out2;
+            //        *pSrcT1++ = out3;
+            //        *pSrcT1++ = out4;
+            //        j += 4;
+            //    }
+            //    while (j < n - l) {
+            //        in1 = *pSrcT1;
+            //        *pSrcT1++ = FIX_DIV(in1, in);
+            //        j++;
+            //    }
+            //    /* Loop over number of columns of the destination matrix */
+            //    j = 0;
+            //    while (j < 4 * (n >> 2U)) {
+            //        in1 = *pSrcT2;
+            //        in2 = *(pSrcT2 + 1);
+            //        in3 = *(pSrcT2 + 2);
+            //        in4 = *(pSrcT2 + 3);
+            //        out1 = FIX_DIV(in1, in);
+            //        out2 = FIX_DIV(in2, in);
+            //        out3 = FIX_DIV(in3, in);
+            //        out4 = FIX_DIV(in4, in);
+            //        *pSrcT2++ = out1;
+            //        *pSrcT2++ = out2;
+            //        *pSrcT2++ = out3;
+            //        *pSrcT2++ = out4;
+            //        j += 4;
+            //    }
+            //    while (j < n) {
+            //        in1 = *pSrcT2;
+            //        *pSrcT2++ = FIX_DIV(in1, in);
+            //        j++;
+            //    }
         }
-
+        mempool_log_barrier(2, absolute_core_id);
+        //pPivotRowIn = pSrc + (l * n);
+        //pPivotRowDst = pDst + (l * n);
 
         /* DIVIDE BY THE PIVOT */
-
         /* Points to the pivot row of input and destination matrices */
         pPivotRowIn = pSrc + (l * n);
         pPivotRowDst = pDst + (l * n);
-
         /* Temporary pointers to the pivot row pointers */
         pSrcT1 = pPivotRowIn;
         pSrcT2 = pPivotRowDst;
         /* Pivot element of the row */
         in = *pPivotRowIn;
-
-        ///* Loop over columns to the right of pivot */
-        // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
-        // while (j < 4 * ((n - l) >> 2U)) {
-        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
-            in1 = pSrcT1[j];
-            in2 = pSrcT1[j + 1];
-            in3 = pSrcT1[j + 2];
-            in4 = pSrcT1[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT1[j] = out1;
-            pSrcT1[j + 1] = out2;
-            pSrcT1[j + 2] = out3;
-            pSrcT1[j + 3] = out4;
-            // j += NUM_CORES * 4;
-        }
-        if (core_id == (n >> 2U) - 1) {
-            j = 4 * ((n - l) >> 2U);
-            while (j < n - l) {
+        /* Loop over columns to the right of pivot */
+        core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U);
+        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
+        //for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+        //    in1 = pSrcT1[j];
+        //    in2 = pSrcT1[j + 1];
+        //    in3 = pSrcT1[j + 2];
+        //    in4 = pSrcT1[j + 3];
+        //    out1 = FIX_DIV(in1, in);
+        //    out2 = FIX_DIV(in2, in);
+        //    out3 = FIX_DIV(in3, in);
+        //    out4 = FIX_DIV(in4, in);
+        //    pSrcT1[j] = out1;
+        //    pSrcT1[j + 1] = out2;
+        //    pSrcT1[j + 2] = out3;
+        //    pSrcT1[j + 3] = out4;
+        //}
+        //if (core_id == 0) {
+        //    j = 4 * ((n - l) >> 2U);
+        //    while (j < n - l) {
+        //        in1 = pSrcT1[j];
+        //        pSrcT1[j] = FIX_DIV(in1, in);
+        //        j++;
+        //    }
+        //}
+        if(core_id == 0) {
+            j = 0;
+            while (j < 4 - l % 4) {
                 in1 = pSrcT1[j];
-                // pSrcT1[j] = FIX_DIV(in1, in);
+                pSrcT1[j] = FIX_DIV(in1, in);
                 j++;
             }
+        } else {
+            j = core_id * 4 - l % 4;
+            if (j < (n - l)) {
+                in1 = pSrcT1[j];
+                in2 = pSrcT1[j + 1];
+                in3 = pSrcT1[j + 2];
+                in4 = pSrcT1[j + 3];
+                out1 = FIX_DIV(in1, in);
+                out2 = FIX_DIV(in2, in);
+                out3 = FIX_DIV(in3, in);
+                out4 = FIX_DIV(in4, in);
+                pSrcT1[j] = out1;
+                pSrcT1[j + 1] = out2;
+                pSrcT1[j + 2] = out3;
+                pSrcT1[j + 3] = out4;
+            }
         }
         /* Loop over columns */
+        core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U);
+        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
         for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
             in1 = pSrcT2[j];
             in2 = pSrcT2[j + 1];
@@ -197,111 +260,51 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
             pSrcT2[j + 2] = out3;
             pSrcT2[j + 3] = out4;
         }
-        if (core_id == (n >> 2U) - 1) {
-            j = 4 * (n >> 2U);
-            while (j < n) {
-                in1 = pSrcT2[j];
-                pSrcT2[j] = FIX_DIV(in1, in);
-                j++;
-            }
-        }
+        //if (core_id == (n >> 2U) - 1) {
+        //    j = 4 * (n >> 2U);
+        //    while (j < n) {
+        //        in1 = pSrcT2[j];
+        //        pSrcT2[j] = FIX_DIV(in1, in);
+        //        j++;
+        //    }
+        //}
         mempool_log_barrier(2, absolute_core_id);
 
-
         /* REPLACE ROWS */
-        core_id = absolute_core_id;
         pSrcT1 = pSrc;
         pSrcT2 = pDst;
-        /* Loop over rows */
-//        for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
-//            i = 0U;
-//            while (i < 4) {
-//                if ((i + k) != l) {
-//                    pSrcT1 = pSrc + (i + k) * n;
-//                    pSrcT2 = pDst + (i + k) * n;
-//                    /* Element of the reference row */
-//                    in = *pSrcT1;
-//                    pPRT_in = pPivotRowIn;
-//                    pPRT_pDst = pPivotRowDst;
-//                    /* Loop over columns to the right of pivot */
-//                    j = 0;
-//                    while (j < 4 * ((n - l) >> 2U)) {
-//                        in1 = pSrcT1[j];
-//                        in2 = pSrcT1[j + 1];
-//                        in3 = pSrcT1[j + 2];
-//                        in4 = pSrcT1[j + 3];
-//                        out1 = pPRT_in[j];
-//                        out2 = pPRT_in[j + 1];
-//                        out3 = pPRT_in[j + 2];
-//                        out4 = pPRT_in[j + 3];
-//                        pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-//                        pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-//                        pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-//                        pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-//                        j += 4;
-//                    }
-//                    while (j < n - l) {
-//                        in1 = pSrcT1[j];
-//                        out1 = pPRT_in[j];
-//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                        j++;
-//                    }
-//                    /* Loop over columns */
-//                    j = 0;
-//                    while (j < 4 * (n >> 2U)) {
-//                        in1 = pSrcT2[j];
-//                        in2 = pSrcT2[j + 1];
-//                        in3 = pSrcT2[j + 2];
-//                        in4 = pSrcT2[j + 3];
-//                        out1 = pPRT_pDst[j];
-//                        out2 = pPRT_pDst[j + 1];
-//                        out3 = pPRT_pDst[j + 2];
-//                        out4 = pPRT_pDst[j + 3];
-//                        pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-//                        pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-//                        pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-//                        pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-//                        j += 4;
-//                    }
-//                    while (j < n) {
-//                        in1 = pSrcT2[j];
-//                        out1 = pPRT_pDst[j];
-//                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-//                        j++;
-//                    }
-//                }
-//                i++;
-//            }
-//        }
-//        mempool_log_barrier(2, absolute_core_id);
-
         for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
-            dump_i(absolute_core_id);
+            /* Only the columns to the right of the pivot are to be processed */
             if (k != l) {
+
                 pSrcT1 = pSrc + k * n;
                 pSrcT2 = pDst + k * n;
-                core_id = absolute_core_id % (n >> 2U);
                 /* Element of the reference row */
                 in = *pSrcT1;
+                /* Reference row pointers */
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
+
+                /* Loop over the columns */
+                core_id = absolute_core_id % (n >> 2U);
+                core_id = core_id - (l >> 2U);
                 j = core_id * 4;
                 while (j < 4 * ((n - l) >> 2U)) {
-                    in1 = pSrcT1[j];
-                    in2 = pSrcT1[j + 1];
-                    in3 = pSrcT1[j + 2];
-                    in4 = pSrcT1[j + 3];
                     out1 = pPRT_in[j];
                     out2 = pPRT_in[j + 1];
                     out3 = pPRT_in[j + 2];
                     out4 = pPRT_in[j + 3];
-                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+                    in1 = pSrcT1[j];
+                    in2 = pSrcT1[j + 1];
+                    in3 = pSrcT1[j + 2];
+                    in4 = pSrcT1[j + 3];
+                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
                     pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
                     pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
                     pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * NUM_CORES;
+                    j += 4 * (n >> 2U);
                 }
-                if (core_id == (n >> 2U) - 1) {
+                if (core_id == 0) {
                     j = 4 * ((n - l) >> 2U);
                     while (j < n - l) {
                         in1 = pSrcT1[j];
@@ -310,49 +313,226 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
                         j++;
                     }
                 }
-                /* Loop over columns */
+                /* Loop over the columns */
+                core_id = absolute_core_id % (n >> 2U);
                 j = core_id * 4;
                 while (j < 4 * (n >> 2U)) {
-                    in1 = pSrcT2[j];
-                    in2 = pSrcT2[j + 1];
-                    in3 = pSrcT2[j + 2];
-                    in4 = pSrcT2[j + 3];
                     out1 = pPRT_pDst[j];
                     out2 = pPRT_pDst[j + 1];
                     out3 = pPRT_pDst[j + 2];
                     out4 = pPRT_pDst[j + 3];
+                    in1 = pSrcT2[j];
+                    in2 = pSrcT2[j + 1];
+                    in3 = pSrcT2[j + 2];
+                    in4 = pSrcT2[j + 3];
                     pSrcT2[j]     = in1 - FIX_MUL(in, out1);
                     pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
                     pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
                     pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * NUM_CORES;
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * (n >> 2U);
-                    while (j < n) {
-                        in1 = pSrcT2[j];
-                        out1 = pPRT_pDst[j];
-                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
+                    j += 4 * (n >> 2U);
                 }
+                //if (core_id == (n >> 2U) - 1) {
+                //    j = 4 * (n >> 2U);
+                //    while (j < n) {
+                //        in1 = pSrcT2[j];
+                //        out1 = pPRT_pDst[j];
+                //        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+                //        j++;
+                //    }
+                //}
+
+                //uint32_t core_id_in;
+                //uint32_t core_id_Dst;
+                //int32_t p1_in, p2_in, p3_in, p4_in;
+                //int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst;
+                //core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U);
+                //core_id_Dst = absolute_core_id % (n >> 2U);
+                //j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4);
+                //i = core_id_Dst * 4;
+                //p1_in = pPRT_in[j];
+                //p2_in = pPRT_in[j + 1];
+                //p3_in = pPRT_in[j + 2];
+                //p4_in = pPRT_in[j + 3];
+                //p1_Dst = pPRT_pDst[i];
+                //p2_Dst = pPRT_pDst[i + 1];
+                //p3_Dst = pPRT_pDst[i + 2];
+                //p4_Dst = pPRT_pDst[i + 3];
+                //if(core_id_in == 0) {
+                //    switch (4 - l % 4) {
+                //        case (1):
+                //            in1 = pSrcT1[j];
+                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+                //            break;
+                //        case (2):
+                //            in1 = pSrcT1[j];
+                //            in2 = pSrcT1[j + 1];
+                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+                //            break;
+                //        case (3):
+                //            in1 = pSrcT1[j];
+                //            in2 = pSrcT1[j + 1];
+                //            in3 = pSrcT1[j + 2];
+                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+                //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+                //            break;
+                //        case (4):
+                //            in1 = pSrcT1[j];
+                //            in2 = pSrcT1[j + 1];
+                //            in3 = pSrcT1[j + 2];
+                //            in4 = pSrcT1[j + 3];
+                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+                //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+                //            pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
+                //            break;
+                //    }
+                //} else {
+                //    in1 = pSrcT1[j];
+                //    in2 = pSrcT1[j + 1];
+                //    in3 = pSrcT1[j + 2];
+                //    in4 = pSrcT1[j + 3];
+                //    pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+                //    pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+                //    pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+                //    pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
+                //}
+                //in1 = pSrcT2[i];
+                //in2 = pSrcT2[i + 1];
+                //in3 = pSrcT2[i + 2];
+                //in4 = pSrcT2[i + 3];
+                //pSrcT2[i]     = in1 - FIX_MUL(in, p1_Dst);
+                //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst);
+                //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst);
+                //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst);
+
             }
         }
         mempool_log_barrier(2, absolute_core_id);
 
+//        /* REPLACE ROWS */
+//        pSrcT1 = pSrc;
+//        pSrcT2 = pDst;
+//        for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) {
+//            k = i / n;
+//            if (k != l) {
+//                in = *(pSrc + k * n);
+//                j = i - (k * n);
+//                if (j >= 4 * (l >> 2U)) {
+//                    if (j == 4 * (l >> 2U)) {
+//                        pSrcT1 = pSrc + k * n;
+//                        pPRT_in = pPivotRowIn;
+//                        uint32_t bound = j + 4 - l;
+//                        j = 0;
+//                        while (j < bound) {
+//                            in1 = *pSrcT1;
+//                            out1 = *pPRT_in++;
+//                            *pSrcT1++ = in1 - FIX_MUL(in, out1);
+//                            j++;
+//                        }
+//                    } else {
+//                        pSrcT1 = pSrc + (i - l);
+//                        pPRT_in = pPivotRowIn + (j - l);
+//                        in1 = *pSrcT1;
+//                        in2 = *(pSrcT1 + 1);
+//                        in3 = *(pSrcT1 + 2);
+//                        in4 = *(pSrcT1 + 3);
+//                        out1 = *pPRT_in++;
+//                        out2 = *pPRT_in++;
+//                        out3 = *pPRT_in++;
+//                        out4 = *pPRT_in++;
+//                        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+//                        *pSrcT1++ = in2 - FIX_MUL(in, out2);
+//                        *pSrcT1++ = in3 - FIX_MUL(in, out3);
+//                        *pSrcT1++ = in4 - FIX_MUL(in, out4);
+//                    }
+//                }
+//                pSrcT2 = pDst + i;
+//                pPRT_pDst = pPivotRowDst + j;
+//                in1 = *pSrcT2;
+//                in2 = *(pSrcT2 + 1);
+//                in3 = *(pSrcT2 + 2);
+//                in4 = *(pSrcT2 + 3);
+//                out1 = *pPRT_pDst++;
+//                out2 = *pPRT_pDst++;
+//                out3 = *pPRT_pDst++;
+//                out4 = *pPRT_pDst++;
+//                *pSrcT2++ = in1 - FIX_MUL(in, out1);
+//                *pSrcT2++ = in2 - FIX_MUL(in, out2);
+//                *pSrcT2++ = in3 - FIX_MUL(in, out3);
+//                *pSrcT2++ = in4 - FIX_MUL(in, out4);
+//            }
+//        }
+//        mempool_log_barrier(2, absolute_core_id);
+//        /* REPLACE ROWS */
+//        pSrcT1 = pSrc;
+//        pSrcT2 = pDst;
+//        core_id = absolute_core_id;
+//        for (k = core_id; k < m; k += NUM_CORES) {
+//            /* Only the columns to the right of the pivot are to be processed */
+//            if (k != l) {
+//                pSrcT1 = pSrc + k * n;
+//                pSrcT2 = pDst + k * n;
+//                /* Element of the reference row */
+//                in = *pSrcT1;
+//                /* Reference row pointers */
+//                pPRT_in = pPivotRowIn;
+//                pPRT_pDst = pPivotRowDst;
+//                /* Loop over the columns */
+//                j = 0;
+//                while (j < 4 * ((n - l) >> 2U)) {
+//                    in1 = pSrcT1[j];
+//                    in2 = pSrcT1[j + 1];
+//                    in3 = pSrcT1[j + 2];
+//                    in4 = pSrcT1[j + 3];
+//                    out1 = pPRT_in[j];
+//                    out2 = pPRT_in[j + 1];
+//                    out3 = pPRT_in[j + 2];
+//                    out4 = pPRT_in[j + 3];
+//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4;
+//                }
+//                while (j < n - l) {
+//                    in1 = pSrcT1[j];
+//                    out1 = pPRT_in[j];
+//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                    j++;
+//                }
+//                /* Loop over the columns */
+//                j = 0;
+//                while (j < 4 * (n >> 2U)) {
+//                    in1 = pSrcT2[j];
+//                    in2 = pSrcT2[j + 1];
+//                    in3 = pSrcT2[j + 2];
+//                    in4 = pSrcT2[j + 3];
+//                    out1 = pPRT_pDst[j];
+//                    out2 = pPRT_pDst[j + 1];
+//                    out3 = pPRT_pDst[j + 2];
+//                    out4 = pPRT_pDst[j + 3];
+//                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
+//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4;
+//                }
+//                while (j < n) {
+//                    in1 = pSrcT2[j];
+//                    out1 = pPRT_pDst[j];
+//                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
+//                    j++;
+//                }
+//            }
+//        }
+//        mempool_log_barrier(2, absolute_core_id);
+
         pSrc++;     /* Increment the input pointer */
-        loopCnt--;  /* Decrement the loop counter */
         l++;        /* Increment the index modifier */
     }
     mempool_log_barrier(2, absolute_core_id);
 
-//    if ((flag != 1U) && (x == 0)) {
-//        for (i = 0; i < m * n; i++) {
-//            if (pSrc[i] != 0)
-//                break;
-//        }
-//        if (i == m * n)
-//            return 1;
-//    }
     return 0;
 }

From 82f8f518cc6930b85959e54b716a74c9c4d148e2 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 4 Aug 2022 09:35:18 +0200
Subject: [PATCH 16/22] [software] Merge the two final steps of matrix
 inversion

---
 software/apps/mat_inv/main.c                  |   1 +
 .../mat_inv/mempool_mat_inv_q32p_memsized.h   | 203 +++++++++++++-----
 2 files changed, 145 insertions(+), 59 deletions(-)

diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index f39cd6ac8..e0cb8741b 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -134,6 +134,7 @@ void multi_core_folded()
     init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
     if (core_id == 0) {
         flag = 0U;
+        __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED);
     }
     mempool_barrier(num_cores);
 
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
index 961aefd58..6ec20a91b 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
@@ -6,6 +6,8 @@
 
 /* GAUSS JORDAN INVERSION */
 
+uint32_t volatile pivot_barrier __attribute__((section(".l1")));
+
 int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
 
 int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
@@ -127,63 +129,8 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
             if ((*flag == 0U) && (in == 0U)) {
                 return 1;
             }
-            //    /* DIVIDE BY THE PIVOT */
-            //    /* Points to the pivot row of input and destination matrices */
-            //    pPivotRowIn = pSrc + (l * n);
-            //    pPivotRowDst = pDst + (l * n);
-            //    /* Temporary pointers to the pivot row pointers */
-            //    pSrcT1 = pPivotRowIn;
-            //    pSrcT2 = pPivotRowDst;
-            //    /* Pivot element of the row */
-            //    in = *pPivotRowIn;
-            //    /* Loop over number of columns to the right of the pilot element */
-            //    j = 0;
-            //    while (j < 4 * ((n - l) >> 2U)) {
-            //        in1 = *pSrcT1;
-            //        in2 = *(pSrcT1 + 1);
-            //        in3 = *(pSrcT1 + 2);
-            //        in4 = *(pSrcT1 + 3);
-            //        out1 = FIX_DIV(in1, in);
-            //        out2 = FIX_DIV(in2, in);
-            //        out3 = FIX_DIV(in3, in);
-            //        out4 = FIX_DIV(in4, in);
-            //        *pSrcT1++ = out1;
-            //        *pSrcT1++ = out2;
-            //        *pSrcT1++ = out3;
-            //        *pSrcT1++ = out4;
-            //        j += 4;
-            //    }
-            //    while (j < n - l) {
-            //        in1 = *pSrcT1;
-            //        *pSrcT1++ = FIX_DIV(in1, in);
-            //        j++;
-            //    }
-            //    /* Loop over number of columns of the destination matrix */
-            //    j = 0;
-            //    while (j < 4 * (n >> 2U)) {
-            //        in1 = *pSrcT2;
-            //        in2 = *(pSrcT2 + 1);
-            //        in3 = *(pSrcT2 + 2);
-            //        in4 = *(pSrcT2 + 3);
-            //        out1 = FIX_DIV(in1, in);
-            //        out2 = FIX_DIV(in2, in);
-            //        out3 = FIX_DIV(in3, in);
-            //        out4 = FIX_DIV(in4, in);
-            //        *pSrcT2++ = out1;
-            //        *pSrcT2++ = out2;
-            //        *pSrcT2++ = out3;
-            //        *pSrcT2++ = out4;
-            //        j += 4;
-            //    }
-            //    while (j < n) {
-            //        in1 = *pSrcT2;
-            //        *pSrcT2++ = FIX_DIV(in1, in);
-            //        j++;
-            //    }
         }
         mempool_log_barrier(2, absolute_core_id);
-        //pPivotRowIn = pSrc + (l * n);
-        //pPivotRowDst = pDst + (l * n);
 
         /* DIVIDE BY THE PIVOT */
         /* Points to the pivot row of input and destination matrices */
@@ -276,7 +223,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
         for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
             /* Only the columns to the right of the pivot are to be processed */
             if (k != l) {
-
                 pSrcT1 = pSrc + k * n;
                 pSrcT2 = pDst + k * n;
                 /* Element of the reference row */
@@ -284,7 +230,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
                 /* Reference row pointers */
                 pPRT_in = pPivotRowIn;
                 pPRT_pDst = pPivotRowDst;
-
                 /* Loop over the columns */
                 core_id = absolute_core_id % (n >> 2U);
                 core_id = core_id - (l >> 2U);
@@ -340,7 +285,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
                 //        j++;
                 //    }
                 //}
-
                 //uint32_t core_id_in;
                 //uint32_t core_id_Dst;
                 //int32_t p1_in, p2_in, p3_in, p4_in;
@@ -406,11 +350,152 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint
                 //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst);
                 //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst);
                 //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst);
-
             }
         }
         mempool_log_barrier(2, absolute_core_id);
 
+//        /* REPLACE ROWS */
+//        pSrcT1 = pSrc;
+//        pSrcT2 = pDst;
+//        /* Reference row pointers */
+//        pPRT_in = pSrc + (l * n);
+//        pPRT_pDst = pDst + (l * n);
+//        int32_t pivot = *pPRT_in;
+//        uint32_t nPE = (n >> 2U);
+//        uint32_t check = 0;
+//        if (absolute_core_id >= m * nPE)
+//            mempool_wfi();
+//        for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) {
+//            /* Only the columns to the right of the pivot are to be processed */
+//            if (k != l) {
+//                pSrcT1 = pSrc + k * n;
+//                pSrcT2 = pDst + k * n;
+//                /* Element of the reference row */
+//                in = *pSrcT1;
+//                /* Loop over the columns */
+//                core_id = absolute_core_id % nPE;
+//                core_id = core_id - (l >> 2U);
+//                j = core_id * 4;
+//                while (j < 4 * ((n - l) >> 2U)) {
+//                    out1 = pPRT_in[j];
+//                    out2 = pPRT_in[j + 1];
+//                    out3 = pPRT_in[j + 2];
+//                    out4 = pPRT_in[j + 3];
+//                    out1 = FIX_DIV(out1, pivot);
+//                    out2 = FIX_DIV(out2, pivot);
+//                    out3 = FIX_DIV(out3, pivot);
+//                    out4 = FIX_DIV(out4, pivot);
+//                    in1 = pSrcT1[j];
+//                    in2 = pSrcT1[j + 1];
+//                    in3 = pSrcT1[j + 2];
+//                    in4 = pSrcT1[j + 3];
+//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4 * (n >> 2U);
+//                }
+//                if (core_id == 0) {
+//                    j = 4 * ((n - l) >> 2U);
+//                    while (j < n - l) {
+//                        out1 = pPRT_in[j];
+//                        out1 = FIX_DIV(out1, pivot);
+//                        in1 = pSrcT1[j];
+//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+//                        j++;
+//                    }
+//                }
+//                /* Loop over the columns */
+//                core_id = absolute_core_id % nPE;
+//                j = core_id * 4;
+//                while (j < 4 * (n >> 2U)) {
+//                    out1 = pPRT_pDst[j];
+//                    out2 = pPRT_pDst[j + 1];
+//                    out3 = pPRT_pDst[j + 2];
+//                    out4 = pPRT_pDst[j + 3];
+//                    out1 = FIX_DIV(out1, pivot);
+//                    out2 = FIX_DIV(out2, pivot);
+//                    out3 = FIX_DIV(out3, pivot);
+//                    out4 = FIX_DIV(out4, pivot);
+//                    in1 = pSrcT2[j];
+//                    in2 = pSrcT2[j + 1];
+//                    in3 = pSrcT2[j + 2];
+//                    in4 = pSrcT2[j + 3];
+//                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+//                    j += 4 * nPE;
+//                }
+//                __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED);
+//                mempool_wfi();
+//            } else {
+//                do {
+//                    check = __atomic_fetch_add(&pivot_barrier, 0, __ATOMIC_RELAXED);
+//                    mempool_wait(20);
+//                } while (check < ((m - 1) * nPE));
+//                /* Loop over the columns */
+//                core_id = absolute_core_id % (n >> 2U);
+//                core_id = core_id - (l >> 2U);
+//                j = core_id * 4;
+//                while (j < 4 * ((n - l) >> 2U)) {
+//                    in1 = pPRT_in[j];
+//                    in2 = pPRT_in[j + 1];
+//                    in3 = pPRT_in[j + 2];
+//                    in4 = pPRT_in[j + 3];
+//                    out1 = FIX_DIV(in1, pivot);
+//                    out2 = FIX_DIV(in2, pivot);
+//                    out3 = FIX_DIV(in3, pivot);
+//                    out4 = FIX_DIV(in4, pivot);
+//                    pPRT_in[j] = out1;
+//                    pPRT_in[j + 1] = out2;
+//                    pPRT_in[j + 2] = out3;
+//                    pPRT_in[j + 3] = out4;
+//                    j += 4 * (n >> 2U);
+//                }
+//                if (core_id == 0) {
+//                    j = 4 * ((n - l) >> 2U);
+//                    while (j < n - l) {
+//                        in1 = pPRT_in[j];
+//                        pPRT_in[j] = FIX_DIV(in1, pivot);
+//                        j++;
+//                    }
+//                }
+//                /* Loop over the columns */
+//                core_id = absolute_core_id % (n >> 2U);
+//                j = core_id * 4;
+//                while (j < 4 * (n >> 2U)) {
+//                    in1 = pPRT_pDst[j];
+//                    in2 = pPRT_pDst[j + 1];
+//                    in3 = pPRT_pDst[j + 2];
+//                    in4 = pPRT_pDst[j + 3];
+//                    out1 = FIX_DIV(in1, pivot);
+//                    out2 = FIX_DIV(in2, pivot);
+//                    out3 = FIX_DIV(in3, pivot);
+//                    out4 = FIX_DIV(in4, pivot);
+//                    pPRT_pDst[j] = out1;
+//                    pPRT_pDst[j + 1] = out2;
+//                    pPRT_pDst[j + 2] = out3;
+//                    pPRT_pDst[j + 3] = out4;
+//                    j += 4 * (n >> 2U);
+//                }
+//                if (core_id == (n >> 2U) - 1) {
+//                    j = 4 * (n >> 2U);
+//                    while (j < n) {
+//                        in1 = pPRT_pDst[j];
+//                        pPRT_pDst[j] = FIX_DIV(in1, pivot);
+//                        j++;
+//                    }
+//                }
+//                if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED)) {
+//                    __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED);
+//                    __sync_synchronize();
+//                    wake_up_all();
+//                }
+//                mempool_wfi();
+//            }
+//        }
+
 //        /* REPLACE ROWS */
 //        pSrcT1 = pSrc;
 //        pSrcT2 = pDst;

From 8acd2602fa37c713da6e028771a21e9c53763d99 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 27 Sep 2022 09:40:03 +0200
Subject: [PATCH 17/22] [software] Correct lint errors

---
 software/apps/mat_inv/initialization.h        |   94 +-
 software/apps/mat_inv/main.c                  |  235 ++--
 software/apps/mat_inv/mempool_mat_inv_q32p.h  |  601 ++++-----
 .../mat_inv/mempool_mat_inv_q32p_folded.h     |  511 +++----
 .../mat_inv/mempool_mat_inv_q32p_memsized.h   | 1175 +++++++++--------
 software/apps/mat_inv/mempool_mat_inv_q32s.h  |  517 ++++----
 software/apps/svd/main.c                      |   11 +-
 software/apps/svd/nrutil.h                    |   82 +-
 software/apps/svd/svd.c                       |  445 ++++---
 9 files changed, 1857 insertions(+), 1814 deletions(-)

diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h
index ec330e766..6e48e7951 100644
--- a/software/apps/mat_inv/initialization.h
+++ b/software/apps/mat_inv/initialization.h
@@ -5,9 +5,9 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 #define FIXED_POINT 16
-#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b))
-#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT))
-#define MIN(a,b) (a < b ? a : b)
+#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
+#define FIX_MUL(a, b) ((int32_t)((a * b) >> FIXED_POINT))
+#define MIN(a, b) (a < b ? a : b)
 
 dump(l, 1);
 dump(loopCnt, 2);
@@ -21,75 +21,81 @@ void display_folded(int32_t *A, int32_t n, int32_t m);
 
 void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
 
-void MatrixMult(int32_t *matrix_1,  int32_t *matrix_2,  int32_t *matrix_product, int32_t n, int32_t m, int32_t o);
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+                int32_t n, int32_t m, int32_t o);
 
-void init_matrix(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id);
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id);
 
-void init_matrix_zeros(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id);
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                       uint32_t core_id);
 
 void display(int32_t *A, int32_t n, int32_t m) {
-    //int32_t i, j;
-    //for (i = 0; i < n; i++) {
-    //  for (j = 0; j < m; j++) {
-    //    printf("%8d ", A[i * m + j]);
-    //  }
-    //  printf("\n");
-    //}
-    int32_t i;
-    for (i = 0; i < n * m; i++) {
-      printf("Output[%d] = %8d\n", i, A[i]);
-    }
+  // int32_t i, j;
+  // for (i = 0; i < n; i++) {
+  //  for (j = 0; j < m; j++) {
+  //    printf("%8d ", A[i * m + j]);
+  //  }
+  //  printf("\n");
+  //}
+  int32_t i;
+  for (i = 0; i < n * m; i++) {
+    printf("Output[%d] = %8d\n", i, A[i]);
+  }
 }
 
 #ifdef FOLDED
 void display_folded(int32_t *A, int32_t n, int32_t m) {
-    int32_t i, j, k, shift;
-    for (i = 0; i < n * m; i++) {
-      k = i / n;
-      j = i % n;
-      shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-      printf("Output[%d] = %8d\n", i, A[shift + j]);
-    }
+  int32_t i, j, k, shift;
+  for (i = 0; i < n * m; i++) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    printf("Output[%d] = %8d\n", i, A[shift + j]);
+  }
 }
 #endif
 
-void Transpose(int32_t *matrix,  int32_t *t_matrix, int32_t n, int32_t m) {
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) {
   int32_t i, j;
   for (i = 0; i < n; i++) {
-      for (j = 0; j < m; j++) {
-          t_matrix[j * n + i] = matrix[i * m + j];
-      }
+    for (j = 0; j < m; j++) {
+      t_matrix[j * n + i] = matrix[i * m + j];
+    }
   }
 }
 
-void MatrixMult(int32_t *matrix_1,  int32_t *matrix_2,  int32_t *matrix_product, int32_t n, int32_t m, int32_t o) {
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+                int32_t n, int32_t m, int32_t o) {
   int32_t i, j, k;
   for (i = 0; i < n; i++) {
-      for (j = 0; j < o; j++) {
-        matrix_product[i * o + j] = 0;
-        for (k = 0; k < m; k++) {
-          matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
+    for (j = 0; j < o; j++) {
+      matrix_product[i * o + j] = 0;
+      for (k = 0; k < m; k++) {
+        matrix_product[i * o + j] +=
+            FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]);
       }
     }
   }
 }
 
-void init_matrix(int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id) {
-  if(core_id == 0) {
-    for(uint32_t j = 0; j < num_rows; j++) {
-      for(uint32_t i = 0; i < num_columns; i++) {
-          matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c;
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
+  if (core_id == 0) {
+    for (uint32_t j = 0; j < num_rows; j++) {
+      for (uint32_t i = 0; i < num_columns; i++) {
+        matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c;
       }
     }
   }
 }
 
-
-void init_matrix_zeros (int32_t  *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id) {
-  if(core_id == 0) {
-    for(uint32_t i = 0; i < num_columns; i++) {
-      for(uint32_t j = 0; j < num_rows; j++) {
-          matrix[j * num_columns + i] = 0;
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                       uint32_t core_id) {
+  if (core_id == 0) {
+    for (uint32_t i = 0; i < num_columns; i++) {
+      for (uint32_t j = 0; j < num_rows; j++) {
+        matrix[j * num_columns + i] = 0;
       }
     }
   }
diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index e0cb8741b..ebe4eca06 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -22,149 +22,146 @@
 // #define FOLDED
 
 #include "initialization.h"
-#include "mempool_mat_inv_q32s.h"
 #include "mempool_mat_inv_q32p.h"
-#include "mempool_mat_inv_q32p_memsized.h"
 #include "mempool_mat_inv_q32p_folded.h"
+#include "mempool_mat_inv_q32p_memsized.h"
+#include "mempool_mat_inv_q32s.h"
 
 #ifdef FOLDED
-int32_t matrix[N * M]                                       __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)]   __attribute__((aligned(N_BANKS), section(".l1")));
-int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)]             __attribute__((aligned(N_BANKS), section(".l1")));
-uint32_t flag                                               __attribute__((section(".l1")));
+int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)]
+    __attribute__((aligned(N_BANKS), section(".l1")));
+int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)]
+    __attribute__((aligned(N_BANKS), section(".l1")));
+uint32_t flag __attribute__((section(".l1")));
 #else
-int32_t matrix[N * M]         __attribute__((aligned(N), section(".l1")));
-int32_t inv[M * M]            __attribute__((aligned(N), section(".l1")));
-uint32_t flag                 __attribute__((section(".l1")));
+int32_t matrix[N * M] __attribute__((aligned(N), section(".l1")));
+int32_t inv[M * M] __attribute__((aligned(N), section(".l1")));
+uint32_t flag __attribute__((section(".l1")));
 #endif
 
 // Driver program
-void single_core()
-{
-
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t num_cores = mempool_get_core_count();
-    // Initialize barrier and synchronize
-    mempool_barrier_init(core_id);
-
-    init_matrix(matrix, N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(inv, M, M, core_id);
-    mempool_barrier(num_cores);
-
-    if(core_id == 0) {
-        mempool_start_benchmark();
-        mempool_GJinv_q32s(matrix, inv, M);
-        mempool_stop_benchmark();
-    }
-    mempool_barrier(num_cores);
-    #ifdef VERBOSE
-    if (core_id == 0)
-      display(inv, N, M);
-    #endif
-    mempool_barrier(num_cores);
-}
+void single_core() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
 
-void multi_core()
-{
-
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t num_cores = mempool_get_core_count();
-    // Initialize barrier and synchronize
-    mempool_barrier_init(core_id);
-
-    init_matrix(matrix, N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(inv, M, M, core_id);
-    if (core_id == 0) {
-        flag = 0U;
-    }
-    mempool_barrier(num_cores);
-
-    if (core_id < MIN(NUM_CORES, N / 4)) {
-      mempool_start_benchmark();
-      mempool_GJinv_q32p(matrix, inv, M, &flag);
-      mempool_stop_benchmark();
-    }
-    mempool_barrier(num_cores);
-    #ifdef VERBOSE
-    if (core_id == 0)
-      display(inv, M, N);
-    #endif
-    mempool_barrier(num_cores);
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(inv, M, M, core_id);
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32s(matrix, inv, M);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#ifdef VERBOSE
+  if (core_id == 0)
+    display(inv, N, M);
+#endif
+  mempool_barrier(num_cores);
 }
 
-void multi_core_memsized()
-{
+void multi_core() {
 
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t num_cores = mempool_get_core_count();
-    // Initialize barrier and synchronize
-    mempool_barrier_init(core_id);
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
 
-    init_matrix(matrix, N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(inv, N, M, core_id);
-    if (core_id == 0) {
-        flag = 0U;
-    }
-    mempool_barrier(num_cores);
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(inv, M, M, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
 
+  if (core_id < MIN(NUM_CORES, N / 4)) {
     mempool_start_benchmark();
-    mempool_GJinv_q32p_memsized(matrix, inv, M, &flag);
+    mempool_GJinv_q32p(matrix, inv, M, &flag);
     mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#ifdef VERBOSE
+  if (core_id == 0)
+    display(inv, M, N);
+#endif
+  mempool_barrier(num_cores);
+}
+
+void multi_core_memsized() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(inv, N, M, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
+
+  mempool_start_benchmark();
+  mempool_GJinv_q32p_memsized(matrix, inv, M, &flag);
+  mempool_stop_benchmark();
 
-    mempool_barrier(num_cores);
-    #ifdef VERBOSE
-    if (core_id == 0)
-      display(inv, M, N);
-    #endif
-    mempool_barrier(num_cores);
+  mempool_barrier(num_cores);
+#ifdef VERBOSE
+  if (core_id == 0)
+    display(inv, M, N);
+#endif
+  mempool_barrier(num_cores);
 }
 
 #ifdef FOLDED
-void multi_core_folded()
-{
-
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t num_cores = mempool_get_core_count();
-    uint32_t nPE = N_USED_BANKS >> 2U;
-    // Initialize barrier and synchronize
-    mempool_barrier_init(core_id);
-
-    init_matrix(matrix, N, M, -156, 427, -219, core_id);
-    init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
-    init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
-    if (core_id == 0) {
-        flag = 0U;
-        __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED);
-    }
-    mempool_barrier(num_cores);
-
+void multi_core_folded() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t nPE = N_USED_BANKS >> 2U;
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+    __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED);
+  }
+  mempool_barrier(num_cores);
+
+  mempool_start_benchmark();
+  fold_matrix(matrix, folded_matrix, N);
+  mempool_stop_benchmark();
+  if (core_id < nPE) {
     mempool_start_benchmark();
-    fold_matrix(matrix, folded_matrix, N);
+    mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE);
     mempool_stop_benchmark();
-    if(core_id < nPE) {
-        mempool_start_benchmark();
-        mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE);
-        mempool_stop_benchmark();
-    }
-    mempool_barrier(num_cores);
-    #ifdef VERBOSE
-    if (core_id == 0)
-      display_folded(inv, M, N);
-    #endif
-    mempool_barrier(num_cores);
-
+  }
+  mempool_barrier(num_cores);
+#ifdef VERBOSE
+  if (core_id == 0)
+    display_folded(inv, M, N);
+#endif
+  mempool_barrier(num_cores);
 }
 #endif
 
 int main() {
-    #if defined(SINGLE)
-    single_core();
-    #elif defined(PARALLEL)
-    multi_core();
-    #elif defined(MEMSIZED)
-    multi_core_memsized();
-    #elif defined(FOLDED)
-    multi_core_folded();
-    #endif
-    return 0;
+#if defined(SINGLE)
+  single_core();
+#elif defined(PARALLEL)
+  multi_core();
+#elif defined(MEMSIZED)
+  multi_core_memsized();
+#elif defined(FOLDED)
+  multi_core_folded();
+#endif
+  return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
index 952d06fc4..09e2b449f 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h
@@ -6,332 +6,335 @@
 
 /* GAUSS JORDAN INVERSION */
 
-int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
+int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                       uint32_t *flag);
 
-int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
+int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                       uint32_t *flag) {
 
-    int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
-    int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
-    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
-    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t in = 0;
-    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-    int32_t in1, in2, in3, in4;
-    int32_t out1, out2, out3, out4;
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
 
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t i, j, loopCnt, k, l;  /* loop counters */
-    uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t i, j, loopCnt, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matirces must be square. */
 
-    /* CREATE THE IDENTITY MATRIX */
+  /* CREATE THE IDENTITY MATRIX */
 
-    pDstT1 = pDst;
-    for (k = core_id * 4; k < m; k += 4 * NUM_CORES) {
-        for (j = 0; j < m; j++) {
-            pDstT1[k * m + j] = (uint32_t) (k == j);
-            pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j);
-            pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j);
-            pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j);
-        }
+  pDstT1 = pDst;
+  for (k = core_id * 4; k < m; k += 4 * NUM_CORES) {
+    for (j = 0; j < m; j++) {
+      pDstT1[k * m + j] = (uint32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j);
     }
-    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+  }
+  mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
-    /* Loop over the number of columns of the input matrix. */
-    loopCnt = n;
-    /* Index modifier to navigate through the columns */
-    l = 0U;
+  /* Loop over the number of columns of the input matrix. */
+  loopCnt = n;
+  /* Index modifier to navigate through the columns */
+  l = 0U;
 
-    while (loopCnt > 0U) {
+  while (loopCnt > 0U) {
 
-        pSrcT1 = pSrc + (l * n);
-        pDstT1 = pDst + (l * n);
-        in = *pSrcT1;
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    in = *pSrcT1;
 
-        /* CHECK IF PIVOT ELEMENT IS ZERO */
-        if (core_id == 0) {
-            if (in == 0U) {
-                /* Loop over the rows present below */
-                for (k = l + 1U; k < m; k++) {
-                    pSrcT2 = pSrc + (n * k);
-                    pDstT2 = pDst + (n * k);
-                    /* EXCHANGE */
-                    if (*pSrcT2 != 0) {
-                        /* Loop over colums to the right of the pivot */
-                        j = 0;
-                        while (j < 4 * ((n - l) >> 2U)) {
-                            Xchg1 = pSrcT2[j];
-                            Xchg2 = pSrcT2[j + 1];
-                            Xchg3 = pSrcT2[j + 2];
-                            Xchg4 = pSrcT2[j + 3];
-                            out1 = pSrcT1[j];
-                            out2 = pSrcT1[j + 1];
-                            out3 = pSrcT1[j + 2];
-                            out4 = pSrcT1[j + 3];
-                            pSrcT2[j] = out1;
-                            pSrcT2[j + 1] = out2;
-                            pSrcT2[j + 2] = out3;
-                            pSrcT2[j + 3] = out4;
-                            pSrcT1[j] = Xchg1;
-                            pSrcT1[j + 1] = Xchg2;
-                            pSrcT1[j + 2] = Xchg3;
-                            pSrcT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n - l) {
-                            Xchg1 = pSrcT2[j];
-                            pSrcT2[j] = pSrcT1[j];
-                            pSrcT1[j] = Xchg1;
-                            j++;
-                        }
-                        /* Loop over colums */
-                        j = 0;
-                        while (j < 4 * (n >> 2U)) {
-                            Xchg1 = pDstT2[j];
-                            Xchg2 = pDstT2[j + 1];
-                            Xchg3 = pDstT2[j + 2];
-                            Xchg4 = pDstT2[j + 3];
-                            out1 = pDstT1[j];
-                            out2 = pDstT1[j + 1];
-                            out3 = pDstT1[j + 2];
-                            out4 = pDstT1[j + 3];
-                            pDstT2[j] = out1;
-                            pDstT2[j + 1] = out2;
-                            pDstT2[j + 2] = out3;
-                            pDstT2[j + 3] = out4;
-                            pDstT1[j] = Xchg1;
-                            pDstT1[j + 1] = Xchg2;
-                            pDstT1[j + 2] = Xchg3;
-                            pDstT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n) {
-                            Xchg1 = pDstT2[j];
-                            pDstT2[j] = pDstT1[j];
-                            pDstT1[j] = Xchg1;
-                            j++;
-                        }
-                        *flag = 1U;
-                        break;
-                    }
-                }
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          pSrcT2 = pSrc + (n * k);
+          pDstT2 = pDst + (n * k);
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over colums to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
             }
-            /* Update the status if the matrix is singular */
-            if ((*flag == 0U) && (in == 0U)) {
-                return 1;
+            /* Loop over colums */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
             }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
         }
-        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+      }
+      /* Update the status if the matrix is singular */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
 
-        /* DIVIDE BY THE PIVOT */
-        /* Points to the pivot row of input and destination matrices */
-        pPivotRowIn = pSrc + (l * n);
-        pPivotRowDst = pDst + (l * n);
-        /* Temporary pointers to the pivot row pointers */
-        pSrcT1 = pPivotRowIn;
-        pSrcT2 = pPivotRowDst;
-        /* Pivot element of the row */
-        in = *pPivotRowIn;
+    ///* Loop over columns to the right of pivot */
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+      // j += NUM_CORES * 4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    /* Loop over columns */
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
-        ///* Loop over columns to the right of pivot */
-        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    /* Loop over rows */
+    for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+      i = 0U;
+      while (i < 4) {
+        if ((i + k) != l) {
+          pSrcT1 = pSrc + (i + k) * n;
+          pSrcT2 = pDst + (i + k) * n;
+          /* Element of the reference row */
+          in = *pSrcT1;
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+          /* Loop over columns to the right of pivot */
+          j = 0;
+          while (j < 4 * ((n - l) >> 2U)) {
             in1 = pSrcT1[j];
             in2 = pSrcT1[j + 1];
             in3 = pSrcT1[j + 2];
             in4 = pSrcT1[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT1[j] = out1;
-            pSrcT1[j + 1] = out2;
-            pSrcT1[j + 2] = out3;
-            pSrcT1[j + 3] = out4;
-            // j += NUM_CORES * 4;
-        }
-        if (core_id == (n >> 2U) - 1) {
-            j = 4 * ((n - l) >> 2U);
-            while (j < n - l) {
-                in1 = pSrcT1[j];
-                pSrcT1[j] = FIX_DIV(in1, in);
-                j++;
-            }
-        }
-        /* Loop over columns */
-        for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+            out1 = pPRT_in[j];
+            out2 = pPRT_in[j + 1];
+            out3 = pPRT_in[j + 2];
+            out4 = pPRT_in[j + 3];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+          /* Loop over columns */
+          j = 0;
+          while (j < 4 * (n >> 2U)) {
             in1 = pSrcT2[j];
             in2 = pSrcT2[j + 1];
             in3 = pSrcT2[j + 2];
             in4 = pSrcT2[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT2[j] = out1;
-            pSrcT2[j + 1] = out2;
-            pSrcT2[j + 2] = out3;
-            pSrcT2[j + 3] = out4;
-        }
-        if (core_id == (n >> 2U) - 1) {
-            j = 4 * (n >> 2U);
-            while (j < n) {
-                in1 = pSrcT2[j];
-                pSrcT2[j] = FIX_DIV(in1, in);
-                j++;
-            }
-        }
-        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-
-
-        /* REPLACE ROWS */
-        pSrcT1 = pSrc;
-        pSrcT2 = pDst;
-        /* Loop over rows */
-        for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
-            i = 0U;
-            while (i < 4) {
-                if ((i + k) != l) {
-                    pSrcT1 = pSrc + (i + k) * n;
-                    pSrcT2 = pDst + (i + k) * n;
-                    /* Element of the reference row */
-                    in = *pSrcT1;
-                    pPRT_in = pPivotRowIn;
-                    pPRT_pDst = pPivotRowDst;
-                    /* Loop over columns to the right of pivot */
-                    j = 0;
-                    while (j < 4 * ((n - l) >> 2U)) {
-                        in1 = pSrcT1[j];
-                        in2 = pSrcT1[j + 1];
-                        in3 = pSrcT1[j + 2];
-                        in4 = pSrcT1[j + 3];
-                        out1 = pPRT_in[j];
-                        out2 = pPRT_in[j + 1];
-                        out3 = pPRT_in[j + 2];
-                        out4 = pPRT_in[j + 3];
-                        pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-                        pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-                        pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-                        pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-                        j += 4;
-                    }
-                    while (j < n - l) {
-                        in1 = pSrcT1[j];
-                        out1 = pPRT_in[j];
-                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
-                    /* Loop over columns */
-                    j = 0;
-                    while (j < 4 * (n >> 2U)) {
-                        in1 = pSrcT2[j];
-                        in2 = pSrcT2[j + 1];
-                        in3 = pSrcT2[j + 2];
-                        in4 = pSrcT2[j + 3];
-                        out1 = pPRT_pDst[j];
-                        out2 = pPRT_pDst[j + 1];
-                        out3 = pPRT_pDst[j + 2];
-                        out4 = pPRT_pDst[j + 3];
-                        pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-                        pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-                        pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-                        pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-                        j += 4;
-                    }
-                    while (j < n) {
-                        in1 = pSrcT2[j];
-                        out1 = pPRT_pDst[j];
-                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
-                }
-                i++;
-            }
+            out1 = pPRT_pDst[j];
+            out2 = pPRT_pDst[j + 1];
+            out3 = pPRT_pDst[j + 2];
+            out4 = pPRT_pDst[j + 3];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
         }
-        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+        i++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
-//        /* REPLACE ROWS */
-//        pSrcT1 = pSrc;
-//        pSrcT2 = pDst;
-//        /* Loop over rows */
-//        for (k = 0; k < m; k++) {
-//            if (k != l) {
-//                pSrcT1 = pSrc + k * n;
-//                pSrcT2 = pDst + k * n;
-//                /* Element of the reference row */
-//                in = *pSrcT1;
-//                pPRT_in = pPivotRowIn;
-//                pPRT_pDst = pPivotRowDst;
-//                /* Loop over columns to the right of pivot */
-//                j = core_id * 4;
-//                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U);
-//                while (j < 4 * ((n - l) >> 2U)) {
-//                    in1 = pSrcT1[j];
-//                    in2 = pSrcT1[j + 1];
-//                    in3 = pSrcT1[j + 2];
-//                    in4 = pSrcT1[j + 3];
-//                    out1 = pPRT_in[j];
-//                    out2 = pPRT_in[j + 1];
-//                    out3 = pPRT_in[j + 2];
-//                    out4 = pPRT_in[j + 3];
-//                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4 * NUM_CORES;
-//                }
-//                if (core_id == (n >> 2U) - 1) {
-//                    j = 4 * ((n - l) >> 2U);
-//                    while (j < n - l) {
-//                        in1 = pSrcT1[j];
-//                        out1 = pPRT_in[j];
-//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                        j++;
-//                    }
-//                }
-//                /* Loop over columns */
-//                j = core_id * 4;
-//                while (j < 4 * (n >> 2U)) {
-//                    in1 = pSrcT2[j];
-//                    in2 = pSrcT2[j + 1];
-//                    in3 = pSrcT2[j + 2];
-//                    in4 = pSrcT2[j + 3];
-//                    out1 = pPRT_pDst[j];
-//                    out2 = pPRT_pDst[j + 1];
-//                    out3 = pPRT_pDst[j + 2];
-//                    out4 = pPRT_pDst[j + 3];
-//                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4 * NUM_CORES;
-//                }
-//                if (core_id == (n >> 2U) - 1) {
-//                    j = 4 * (n >> 2U);
-//                    while (j < n) {
-//                        in1 = pSrcT2[j];
-//                        out1 = pPRT_pDst[j];
-//                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-//                        j++;
-//                    }
-//                }
-//                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
-//            }
-//        }
-//        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+    //        /* REPLACE ROWS */
+    //        pSrcT1 = pSrc;
+    //        pSrcT2 = pDst;
+    //        /* Loop over rows */
+    //        for (k = 0; k < m; k++) {
+    //            if (k != l) {
+    //                pSrcT1 = pSrc + k * n;
+    //                pSrcT2 = pDst + k * n;
+    //                /* Element of the reference row */
+    //                in = *pSrcT1;
+    //                pPRT_in = pPivotRowIn;
+    //                pPRT_pDst = pPivotRowDst;
+    //                /* Loop over columns to the right of pivot */
+    //                j = core_id * 4;
+    //                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n
+    //                - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) {
+    //                    in1 = pSrcT1[j];
+    //                    in2 = pSrcT1[j + 1];
+    //                    in3 = pSrcT1[j + 2];
+    //                    in4 = pSrcT1[j + 3];
+    //                    out1 = pPRT_in[j];
+    //                    out2 = pPRT_in[j + 1];
+    //                    out3 = pPRT_in[j + 2];
+    //                    out4 = pPRT_in[j + 3];
+    //                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * NUM_CORES;
+    //                }
+    //                if (core_id == (n >> 2U) - 1) {
+    //                    j = 4 * ((n - l) >> 2U);
+    //                    while (j < n - l) {
+    //                        in1 = pSrcT1[j];
+    //                        out1 = pPRT_in[j];
+    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                        j++;
+    //                    }
+    //                }
+    //                /* Loop over columns */
+    //                j = core_id * 4;
+    //                while (j < 4 * (n >> 2U)) {
+    //                    in1 = pSrcT2[j];
+    //                    in2 = pSrcT2[j + 1];
+    //                    in3 = pSrcT2[j + 2];
+    //                    in4 = pSrcT2[j + 3];
+    //                    out1 = pPRT_pDst[j];
+    //                    out2 = pPRT_pDst[j + 1];
+    //                    out3 = pPRT_pDst[j + 2];
+    //                    out4 = pPRT_pDst[j + 3];
+    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * NUM_CORES;
+    //                }
+    //                if (core_id == (n >> 2U) - 1) {
+    //                    j = 4 * (n >> 2U);
+    //                    while (j < n) {
+    //                        in1 = pSrcT2[j];
+    //                        out1 = pPRT_pDst[j];
+    //                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+    //                        j++;
+    //                    }
+    //                }
+    //                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n /
+    //                4));
+    //            }
+    //        }
+    //        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
 
-        pSrc++;     /* Increment the input pointer */
-        loopCnt--;  /* Decrement the loop counter */
-        l++;        /* Increment the index modifier */
-    }
+    pSrc++;    /* Increment the input pointer */
+    loopCnt--; /* Decrement the loop counter */
+    l++;       /* Increment the index modifier */
+  }
 
-//    if ((flag != 1U) && (x == 0)) {
-//        for (i = 0; i < m * n; i++) {
-//            if (pSrc[i] != 0)
-//                break;
-//        }
-//        if (i == m * n)
-//            return 1;
-//    }
-    return 0;
+  //    if ((flag != 1U) && (x == 0)) {
+  //        for (i = 0; i < m * n; i++) {
+  //            if (pSrc[i] != 0)
+  //                break;
+  //        }
+  //        if (i == m * n)
+  //            return 1;
+  //    }
+  return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
index 5dc0aefc8..6064a1faf 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
@@ -6,282 +6,285 @@
 
 /* GAUSS JORDAN INVERSION */
 
-int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE);
-void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n);
+int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                              uint32_t *flag, uint32_t nPE);
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
-
-void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n) {
-    uint32_t core_id = mempool_get_core_id();
-    uint32_t i, j, k, shift;
-    for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) {
-        k = i / n;
-        j = i % n;
-        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-        pDst[shift + j] = pSrc[i];
-        pDst[shift + j + 1] = pSrc[i + 1];
-        pDst[shift + j + 2] = pSrc[i + 2];
-        pDst[shift + j + 3] = pSrc[i + 3];
-    }
-    mempool_log_barrier(2, core_id);
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t i, j, k, shift;
+  for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDst[shift + j] = pSrc[i];
+    pDst[shift + j + 1] = pSrc[i + 1];
+    pDst[shift + j + 2] = pSrc[i + 2];
+    pDst[shift + j + 3] = pSrc[i + 3];
+  }
+  mempool_log_barrier(2, core_id);
 }
 
-int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE) {
+int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                              uint32_t *flag, uint32_t nPE) {
 
-    int32_t volatile *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
-    int32_t volatile *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
-    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
-    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t in = 0;
-    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-    int32_t in1, in2, in3, in4;
-    int32_t out1, out2, out3, out4;
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
 
-    uint32_t absolute_core_id = mempool_get_core_id();
-    uint32_t core_id = absolute_core_id;
-    uint32_t shift = 0;
-    uint32_t i, j, k, l;  /* loop counters */
-    uint32_t m = n;       /* M is the number of rows. However, the matrices must be square. */
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  uint32_t shift = 0;
+  uint32_t i, j, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matrices must be square. */
 
-    /* CREATE THE IDENTITY MATRIX */
-    pDstT1 = pDst;
-    for (i = core_id * 4; i < n * m; i += nPE * 4) {
-        k = i / n;
-        j = i % n;
-        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-        pDstT1[shift + j] = (uint32_t) (k == j);
-        pDstT1[shift + j + 1] = (uint32_t) (k == (j + 1));
-        pDstT1[shift + j + 2] = (uint32_t) (k == (j + 2));
-        pDstT1[shift + j + 3] = (uint32_t) (k == (j + 3));
-    }
-    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  /* CREATE THE IDENTITY MATRIX */
+  pDstT1 = pDst;
+  for (i = core_id * 4; i < n * m; i += nPE * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDstT1[shift + j] = (uint32_t)(k == j);
+    pDstT1[shift + j + 1] = (uint32_t)(k == (j + 1));
+    pDstT1[shift + j + 2] = (uint32_t)(k == (j + 2));
+    pDstT1[shift + j + 3] = (uint32_t)(k == (j + 3));
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
 
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-    while (l < n) {
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
 
-        shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
-        pSrcT1 = pSrc + shift;
-        pDstT1 = pDst + shift;
-        in = *pSrcT1;
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pSrcT1 = pSrc + shift;
+    pDstT1 = pDst + shift;
+    in = *pSrcT1;
 
-        /* CHECK IF PIVOT ELEMENT IS ZERO */
-        if (absolute_core_id == 0) {
-            if (in == 0U) {
-                /* Loop over the rows present below */
-                for (k = l + 1U; k < m; k++) {
-                    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-                    pSrcT2 = pSrc + shift;
-                    pDstT2 = pDst + shift;
-                    /* EXCHANGE */
-                    if (*pSrcT2 != 0) {
-                        /* Loop over colums to the right of the pivot */
-                        j = 0;
-                        while (j < 4 * ((n - l) >> 2U)) {
-                            Xchg1 = pSrcT2[j];
-                            Xchg2 = pSrcT2[j + 1];
-                            Xchg3 = pSrcT2[j + 2];
-                            Xchg4 = pSrcT2[j + 3];
-                            out1 = pSrcT1[j];
-                            out2 = pSrcT1[j + 1];
-                            out3 = pSrcT1[j + 2];
-                            out4 = pSrcT1[j + 3];
-                            pSrcT2[j] = out1;
-                            pSrcT2[j + 1] = out2;
-                            pSrcT2[j + 2] = out3;
-                            pSrcT2[j + 3] = out4;
-                            pSrcT1[j] = Xchg1;
-                            pSrcT1[j + 1] = Xchg2;
-                            pSrcT1[j + 2] = Xchg3;
-                            pSrcT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n - l) {
-                            Xchg1 = pSrcT2[j];
-                            pSrcT2[j] = pSrcT1[j];
-                            pSrcT1[j] = Xchg1;
-                            j++;
-                        }
-                        /* Loop over colums */
-                        j = 0;
-                        while (j < 4 * (n >> 2U)) {
-                            Xchg1 = pDstT2[j];
-                            Xchg2 = pDstT2[j + 1];
-                            Xchg3 = pDstT2[j + 2];
-                            Xchg4 = pDstT2[j + 3];
-                            out1 = pDstT1[j];
-                            out2 = pDstT1[j + 1];
-                            out3 = pDstT1[j + 2];
-                            out4 = pDstT1[j + 3];
-                            pDstT2[j] = out1;
-                            pDstT2[j + 1] = out2;
-                            pDstT2[j + 2] = out3;
-                            pDstT2[j + 3] = out4;
-                            pDstT1[j] = Xchg1;
-                            pDstT1[j + 1] = Xchg2;
-                            pDstT1[j + 2] = Xchg3;
-                            pDstT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n) {
-                            Xchg1 = pDstT2[j];
-                            pDstT2[j] = pDstT1[j];
-                            pDstT1[j] = Xchg1;
-                            j++;
-                        }
-                        *flag = 1U;
-                        break;
-                    }
-                }
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (absolute_core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+          pSrcT2 = pSrc + shift;
+          pDstT2 = pDst + shift;
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over columns to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
             }
-            /* Update the status if the matrix is singular */
-            if ((*flag == 0U) && (in == 0U)) {
-                return 1;
+            /* Loop over columns */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
             }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
         }
-        mempool_log_partial_barrier(2, absolute_core_id, nPE);
+      }
+      /* Update the status if the matrix is singular */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
 
-        /* DIVIDE BY THE PIVOT */
-        /* Points to the pivot row of input and destination matrices */
-        shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
-        pPivotRowIn = pSrc + shift;
-        pPivotRowDst = pDst + shift;
-        /* Temporary pointers to the pivot row pointers */
-        pSrcT1 = pPivotRowIn;
-        pSrcT2 = pPivotRowDst;
-        /* Pivot element of the row */
-        in = *pPivotRowIn;
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pPivotRowIn = pSrc + shift;
+    pPivotRowDst = pDst + shift;
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
 
-        /* Loop over columns to the right of pivot */
-        core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
-        core_id = core_id > nPE ? core_id + nPE : core_id;
-        for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
-            in1 = pSrcT1[j];
-            in2 = pSrcT1[j + 1];
-            in3 = pSrcT1[j + 2];
-            in4 = pSrcT1[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT1[j] = out1;
-            pSrcT1[j + 1] = out2;
-            pSrcT1[j + 2] = out3;
-            pSrcT1[j + 3] = out4;
+    /* Loop over columns to the right of pivot */
+    core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+    }
+    if (core_id == 0) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+
+    /* Loop over columns */
+    core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
+      /* Only the columns to the right of the pivot are to be processed */
+      if (k != l) {
+        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+        pSrcT1 = pSrc + shift;
+        pSrcT2 = pDst + shift;
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
+        /* Loop over the columns */
+        core_id = absolute_core_id % (n >> 2U);
+        core_id = core_id - (l >> 2U);
+        j = core_id * 4;
+        while (j < 4 * ((n - l) >> 2U)) {
+          out1 = pPRT_in[j];
+          out2 = pPRT_in[j + 1];
+          out3 = pPRT_in[j + 2];
+          out4 = pPRT_in[j + 3];
+          in1 = pSrcT1[j];
+          in2 = pSrcT1[j + 1];
+          in3 = pSrcT1[j + 2];
+          in4 = pSrcT1[j + 3];
+          pSrcT1[j] = in1 - FIX_MUL(in, out1);
+          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
         }
         if (core_id == 0) {
-            j = 4 * ((n - l) >> 2U);
-            while (j < n - l) {
-                in1 = pSrcT1[j];
-                pSrcT1[j] = FIX_DIV(in1, in);
-                j++;
-            }
+          j = 4 * ((n - l) >> 2U);
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
         }
-
-        /* Loop over columns */
-        core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
-        core_id = core_id > nPE ? core_id + nPE : core_id;
-        for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
-            in1 = pSrcT2[j];
-            in2 = pSrcT2[j + 1];
-            in3 = pSrcT2[j + 2];
-            in4 = pSrcT2[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT2[j] = out1;
-            pSrcT2[j + 1] = out2;
-            pSrcT2[j + 2] = out3;
-            pSrcT2[j + 3] = out4;
+        core_id = absolute_core_id % (n >> 2U);
+        /* Loop over the columns */
+        j = core_id * 4;
+        while (j < 4 * (n >> 2U)) {
+          out1 = pPRT_pDst[j];
+          out2 = pPRT_pDst[j + 1];
+          out3 = pPRT_pDst[j + 2];
+          out4 = pPRT_pDst[j + 3];
+          in1 = pSrcT2[j];
+          in2 = pSrcT2[j + 1];
+          in3 = pSrcT2[j + 2];
+          in4 = pSrcT2[j + 3];
+          pSrcT2[j] = in1 - FIX_MUL(in, out1);
+          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
         }
         if (core_id == (n >> 2U) - 1) {
-            j = 4 * (n >> 2U);
-            while (j < n) {
-                in1 = pSrcT2[j];
-                pSrcT2[j] = FIX_DIV(in1, in);
-                j++;
-            }
-        }
-        mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-        /* REPLACE ROWS */
-        pSrcT1 = pSrc;
-        pSrcT2 = pDst;
-        for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
-            /* Only the columns to the right of the pivot are to be processed */
-            if (k != l) {
-                shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-                pSrcT1 = pSrc + shift;
-                pSrcT2 = pDst + shift;
-                /* Element of the reference row */
-                in = *pSrcT1;
-                /* Reference row pointers */
-                pPRT_in = pPivotRowIn;
-                pPRT_pDst = pPivotRowDst;
-                /* Loop over the columns */
-                core_id = absolute_core_id % (n >> 2U);
-                core_id = core_id - (l >> 2U);
-                j = core_id * 4;
-                while (j < 4 * ((n - l) >> 2U)) {
-                    out1 = pPRT_in[j];
-                    out2 = pPRT_in[j + 1];
-                    out3 = pPRT_in[j + 2];
-                    out4 = pPRT_in[j + 3];
-                    in1 = pSrcT1[j];
-                    in2 = pSrcT1[j + 1];
-                    in3 = pSrcT1[j + 2];
-                    in4 = pSrcT1[j + 3];
-                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * (n >> 2U);
-                }
-                if (core_id == 0) {
-                    j = 4 * ((n - l) >> 2U);
-                    while (j < n - l) {
-                        in1 = pSrcT1[j];
-                        out1 = pPRT_in[j];
-                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
-                }
-                core_id = absolute_core_id % (n >> 2U);
-                /* Loop over the columns */
-                j = core_id * 4;
-                while (j < 4 * (n >> 2U)) {
-                    out1 = pPRT_pDst[j];
-                    out2 = pPRT_pDst[j + 1];
-                    out3 = pPRT_pDst[j + 2];
-                    out4 = pPRT_pDst[j + 3];
-                    in1 = pSrcT2[j];
-                    in2 = pSrcT2[j + 1];
-                    in3 = pSrcT2[j + 2];
-                    in4 = pSrcT2[j + 3];
-                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * (n >> 2U);
-                }
-                if (core_id == (n >> 2U) - 1) {
-                    j = 4 * (n >> 2U);
-                    while (j < n) {
-                        in1 = pSrcT2[j];
-                        out1 = pPRT_pDst[j];
-                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
-                }
-            }
+          j = 4 * (n >> 2U);
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
         }
-        mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-        pSrc++;     /* Increment the input pointer */
-        l++;        /* Increment the index modifier */
+      }
     }
     mempool_log_partial_barrier(2, absolute_core_id, nPE);
 
-    return 0;
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
index 6ec20a91b..b697f9d24 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
@@ -8,616 +8,621 @@
 
 uint32_t volatile pivot_barrier __attribute__((section(".l1")));
 
-int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag);
+int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                                uint32_t *flag);
 
-int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) {
+int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                                uint32_t *flag) {
 
-    int32_t volatile *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
-    int32_t volatile *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
-    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
-    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t in = 0;
-    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-    int32_t in1, in2, in3, in4;
-    int32_t out1, out2, out3, out4;
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
 
-    uint32_t absolute_core_id = mempool_get_core_id();
-    uint32_t core_id = absolute_core_id;
-    uint32_t i, j, k, l;  /* loop counters */
-    uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  uint32_t i, j, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matirces must be square. */
 
-    /* CREATE THE IDENTITY MATRIX */
+  /* CREATE THE IDENTITY MATRIX */
 
-    pDstT1 = pDst;
-    for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
-        for (j = 0; j < n; j++) {
-            pDstT1[k * n + j] = (uint32_t) (k == j);
-            pDstT1[(k + 1) * n + j] = (uint32_t) ((k + 1) == j);
-            pDstT1[(k + 2) * n + j] = (uint32_t) ((k + 2) == j);
-            pDstT1[(k + 3) * n + j] = (uint32_t) ((k + 3) == j);
-        }
+  pDstT1 = pDst;
+  for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+    for (j = 0; j < n; j++) {
+      pDstT1[k * n + j] = (uint32_t)(k == j);
+      pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j);
+      pDstT1[(k + 2) * n + j] = (uint32_t)((k + 2) == j);
+      pDstT1[(k + 3) * n + j] = (uint32_t)((k + 3) == j);
     }
-//    pDstT1 = pDst;
-//    for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) {
-//        k = i / n;
-//        j = i % n;
-//        pDstT1[k * n + j] = (uint32_t) (k == j);
-//        pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1));
-//        pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2));
-//        pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3));
-//    }
-//    mempool_log_barrier(2, absolute_core_id);
+  }
+  //    pDstT1 = pDst;
+  //    for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) {
+  //        k = i / n;
+  //        j = i % n;
+  //        pDstT1[k * n + j] = (uint32_t) (k == j);
+  //        pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1));
+  //        pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2));
+  //        pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3));
+  //    }
+  //    mempool_log_barrier(2, absolute_core_id);
 
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-    while (l < n) {
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
 
-        pSrcT1 = pSrc + (l * n);
-        pDstT1 = pDst + (l * n);
-        in = *pSrcT1;
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    in = *pSrcT1;
 
-        /* CHECK IF PIVOT ELEMENT IS ZERO */
-        if (absolute_core_id == 0) {
-            if (in == 0U) {
-                /* Loop over the rows present below */
-                for (k = l + 1U; k < m; k++) {
-                    pSrcT2 = pSrc + (n * k);
-                    pDstT2 = pDst + (n * k);
-                    /* EXCHANGE */
-                    if (*pSrcT2 != 0) {
-                        /* Loop over colums to the right of the pivot */
-                        j = 0;
-                        while (j < 4 * ((n - l) >> 2U)) {
-                            Xchg1 = pSrcT2[j];
-                            Xchg2 = pSrcT2[j + 1];
-                            Xchg3 = pSrcT2[j + 2];
-                            Xchg4 = pSrcT2[j + 3];
-                            out1 = pSrcT1[j];
-                            out2 = pSrcT1[j + 1];
-                            out3 = pSrcT1[j + 2];
-                            out4 = pSrcT1[j + 3];
-                            pSrcT2[j] = out1;
-                            pSrcT2[j + 1] = out2;
-                            pSrcT2[j + 2] = out3;
-                            pSrcT2[j + 3] = out4;
-                            pSrcT1[j] = Xchg1;
-                            pSrcT1[j + 1] = Xchg2;
-                            pSrcT1[j + 2] = Xchg3;
-                            pSrcT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n - l) {
-                            Xchg1 = pSrcT2[j];
-                            pSrcT2[j] = pSrcT1[j];
-                            pSrcT1[j] = Xchg1;
-                            j++;
-                        }
-                        /* Loop over colums */
-                        j = 0;
-                        while (j < 4 * (n >> 2U)) {
-                            Xchg1 = pDstT2[j];
-                            Xchg2 = pDstT2[j + 1];
-                            Xchg3 = pDstT2[j + 2];
-                            Xchg4 = pDstT2[j + 3];
-                            out1 = pDstT1[j];
-                            out2 = pDstT1[j + 1];
-                            out3 = pDstT1[j + 2];
-                            out4 = pDstT1[j + 3];
-                            pDstT2[j] = out1;
-                            pDstT2[j + 1] = out2;
-                            pDstT2[j + 2] = out3;
-                            pDstT2[j + 3] = out4;
-                            pDstT1[j] = Xchg1;
-                            pDstT1[j + 1] = Xchg2;
-                            pDstT1[j + 2] = Xchg3;
-                            pDstT1[j + 3] = Xchg4;
-                            j += 4;
-                        }
-                        while (j < n) {
-                            Xchg1 = pDstT2[j];
-                            pDstT2[j] = pDstT1[j];
-                            pDstT1[j] = Xchg1;
-                            j++;
-                        }
-                        *flag = 1U;
-                        break;
-                    }
-                }
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (absolute_core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          pSrcT2 = pSrc + (n * k);
+          pDstT2 = pDst + (n * k);
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over colums to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
             }
-            /* Update the status if the matrix is singular */
-            if ((*flag == 0U) && (in == 0U)) {
-                return 1;
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
             }
-        }
-        mempool_log_barrier(2, absolute_core_id);
-
-        /* DIVIDE BY THE PIVOT */
-        /* Points to the pivot row of input and destination matrices */
-        pPivotRowIn = pSrc + (l * n);
-        pPivotRowDst = pDst + (l * n);
-        /* Temporary pointers to the pivot row pointers */
-        pSrcT1 = pPivotRowIn;
-        pSrcT2 = pPivotRowDst;
-        /* Pivot element of the row */
-        in = *pPivotRowIn;
-        /* Loop over columns to the right of pivot */
-        core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U);
-        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
-        //for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
-        //    in1 = pSrcT1[j];
-        //    in2 = pSrcT1[j + 1];
-        //    in3 = pSrcT1[j + 2];
-        //    in4 = pSrcT1[j + 3];
-        //    out1 = FIX_DIV(in1, in);
-        //    out2 = FIX_DIV(in2, in);
-        //    out3 = FIX_DIV(in3, in);
-        //    out4 = FIX_DIV(in4, in);
-        //    pSrcT1[j] = out1;
-        //    pSrcT1[j + 1] = out2;
-        //    pSrcT1[j + 2] = out3;
-        //    pSrcT1[j + 3] = out4;
-        //}
-        //if (core_id == 0) {
-        //    j = 4 * ((n - l) >> 2U);
-        //    while (j < n - l) {
-        //        in1 = pSrcT1[j];
-        //        pSrcT1[j] = FIX_DIV(in1, in);
-        //        j++;
-        //    }
-        //}
-        if(core_id == 0) {
+            /* Loop over colums */
             j = 0;
-            while (j < 4 - l % 4) {
-                in1 = pSrcT1[j];
-                pSrcT1[j] = FIX_DIV(in1, in);
-                j++;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
             }
-        } else {
-            j = core_id * 4 - l % 4;
-            if (j < (n - l)) {
-                in1 = pSrcT1[j];
-                in2 = pSrcT1[j + 1];
-                in3 = pSrcT1[j + 2];
-                in4 = pSrcT1[j + 3];
-                out1 = FIX_DIV(in1, in);
-                out2 = FIX_DIV(in2, in);
-                out3 = FIX_DIV(in3, in);
-                out4 = FIX_DIV(in4, in);
-                pSrcT1[j] = out1;
-                pSrcT1[j + 1] = out2;
-                pSrcT1[j + 2] = out3;
-                pSrcT1[j + 3] = out4;
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
             }
+            *flag = 1U;
+            break;
+          }
+        }
+      }
+      /* Update the status if the matrix is singular */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_barrier(2, absolute_core_id);
+
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+    /* Loop over columns to the right of pivot */
+    core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U);
+    core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
+    // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+    //    in1 = pSrcT1[j];
+    //    in2 = pSrcT1[j + 1];
+    //    in3 = pSrcT1[j + 2];
+    //    in4 = pSrcT1[j + 3];
+    //    out1 = FIX_DIV(in1, in);
+    //    out2 = FIX_DIV(in2, in);
+    //    out3 = FIX_DIV(in3, in);
+    //    out4 = FIX_DIV(in4, in);
+    //    pSrcT1[j] = out1;
+    //    pSrcT1[j + 1] = out2;
+    //    pSrcT1[j + 2] = out3;
+    //    pSrcT1[j + 3] = out4;
+    //}
+    // if (core_id == 0) {
+    //    j = 4 * ((n - l) >> 2U);
+    //    while (j < n - l) {
+    //        in1 = pSrcT1[j];
+    //        pSrcT1[j] = FIX_DIV(in1, in);
+    //        j++;
+    //    }
+    //}
+    if (core_id == 0) {
+      j = 0;
+      while (j < 4 - l % 4) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    } else {
+      j = core_id * 4 - l % 4;
+      if (j < (n - l)) {
+        in1 = pSrcT1[j];
+        in2 = pSrcT1[j + 1];
+        in3 = pSrcT1[j + 2];
+        in4 = pSrcT1[j + 3];
+        out1 = FIX_DIV(in1, in);
+        out2 = FIX_DIV(in2, in);
+        out3 = FIX_DIV(in3, in);
+        out4 = FIX_DIV(in4, in);
+        pSrcT1[j] = out1;
+        pSrcT1[j + 1] = out2;
+        pSrcT1[j + 2] = out3;
+        pSrcT1[j + 3] = out4;
+      }
+    }
+    /* Loop over columns */
+    core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U);
+    core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    // if (core_id == (n >> 2U) - 1) {
+    //    j = 4 * (n >> 2U);
+    //    while (j < n) {
+    //        in1 = pSrcT2[j];
+    //        pSrcT2[j] = FIX_DIV(in1, in);
+    //        j++;
+    //    }
+    //}
+    mempool_log_barrier(2, absolute_core_id);
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
+      /* Only the columns to the right of the pivot are to be processed */
+      if (k != l) {
+        pSrcT1 = pSrc + k * n;
+        pSrcT2 = pDst + k * n;
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
+        /* Loop over the columns */
+        core_id = absolute_core_id % (n >> 2U);
+        core_id = core_id - (l >> 2U);
+        j = core_id * 4;
+        while (j < 4 * ((n - l) >> 2U)) {
+          out1 = pPRT_in[j];
+          out2 = pPRT_in[j + 1];
+          out3 = pPRT_in[j + 2];
+          out4 = pPRT_in[j + 3];
+          in1 = pSrcT1[j];
+          in2 = pSrcT1[j + 1];
+          in3 = pSrcT1[j + 2];
+          in4 = pSrcT1[j + 3];
+          pSrcT1[j] = in1 - FIX_MUL(in, out1);
+          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
+        }
+        if (core_id == 0) {
+          j = 4 * ((n - l) >> 2U);
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
         }
-        /* Loop over columns */
-        core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U);
-        core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
-        for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
-            in1 = pSrcT2[j];
-            in2 = pSrcT2[j + 1];
-            in3 = pSrcT2[j + 2];
-            in4 = pSrcT2[j + 3];
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            pSrcT2[j] = out1;
-            pSrcT2[j + 1] = out2;
-            pSrcT2[j + 2] = out3;
-            pSrcT2[j + 3] = out4;
+        /* Loop over the columns */
+        core_id = absolute_core_id % (n >> 2U);
+        j = core_id * 4;
+        while (j < 4 * (n >> 2U)) {
+          out1 = pPRT_pDst[j];
+          out2 = pPRT_pDst[j + 1];
+          out3 = pPRT_pDst[j + 2];
+          out4 = pPRT_pDst[j + 3];
+          in1 = pSrcT2[j];
+          in2 = pSrcT2[j + 1];
+          in3 = pSrcT2[j + 2];
+          in4 = pSrcT2[j + 3];
+          pSrcT2[j] = in1 - FIX_MUL(in, out1);
+          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
         }
-        //if (core_id == (n >> 2U) - 1) {
+        // if (core_id == (n >> 2U) - 1) {
         //    j = 4 * (n >> 2U);
         //    while (j < n) {
         //        in1 = pSrcT2[j];
-        //        pSrcT2[j] = FIX_DIV(in1, in);
+        //        out1 = pPRT_pDst[j];
+        //        pSrcT2[j] = in1 - FIX_MUL(in, out1);
         //        j++;
         //    }
         //}
-        mempool_log_barrier(2, absolute_core_id);
-
-        /* REPLACE ROWS */
-        pSrcT1 = pSrc;
-        pSrcT2 = pDst;
-        for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
-            /* Only the columns to the right of the pivot are to be processed */
-            if (k != l) {
-                pSrcT1 = pSrc + k * n;
-                pSrcT2 = pDst + k * n;
-                /* Element of the reference row */
-                in = *pSrcT1;
-                /* Reference row pointers */
-                pPRT_in = pPivotRowIn;
-                pPRT_pDst = pPivotRowDst;
-                /* Loop over the columns */
-                core_id = absolute_core_id % (n >> 2U);
-                core_id = core_id - (l >> 2U);
-                j = core_id * 4;
-                while (j < 4 * ((n - l) >> 2U)) {
-                    out1 = pPRT_in[j];
-                    out2 = pPRT_in[j + 1];
-                    out3 = pPRT_in[j + 2];
-                    out4 = pPRT_in[j + 3];
-                    in1 = pSrcT1[j];
-                    in2 = pSrcT1[j + 1];
-                    in3 = pSrcT1[j + 2];
-                    in4 = pSrcT1[j + 3];
-                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * (n >> 2U);
-                }
-                if (core_id == 0) {
-                    j = 4 * ((n - l) >> 2U);
-                    while (j < n - l) {
-                        in1 = pSrcT1[j];
-                        out1 = pPRT_in[j];
-                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-                        j++;
-                    }
-                }
-                /* Loop over the columns */
-                core_id = absolute_core_id % (n >> 2U);
-                j = core_id * 4;
-                while (j < 4 * (n >> 2U)) {
-                    out1 = pPRT_pDst[j];
-                    out2 = pPRT_pDst[j + 1];
-                    out3 = pPRT_pDst[j + 2];
-                    out4 = pPRT_pDst[j + 3];
-                    in1 = pSrcT2[j];
-                    in2 = pSrcT2[j + 1];
-                    in3 = pSrcT2[j + 2];
-                    in4 = pSrcT2[j + 3];
-                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-                    j += 4 * (n >> 2U);
-                }
-                //if (core_id == (n >> 2U) - 1) {
-                //    j = 4 * (n >> 2U);
-                //    while (j < n) {
-                //        in1 = pSrcT2[j];
-                //        out1 = pPRT_pDst[j];
-                //        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-                //        j++;
-                //    }
-                //}
-                //uint32_t core_id_in;
-                //uint32_t core_id_Dst;
-                //int32_t p1_in, p2_in, p3_in, p4_in;
-                //int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst;
-                //core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U);
-                //core_id_Dst = absolute_core_id % (n >> 2U);
-                //j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4);
-                //i = core_id_Dst * 4;
-                //p1_in = pPRT_in[j];
-                //p2_in = pPRT_in[j + 1];
-                //p3_in = pPRT_in[j + 2];
-                //p4_in = pPRT_in[j + 3];
-                //p1_Dst = pPRT_pDst[i];
-                //p2_Dst = pPRT_pDst[i + 1];
-                //p3_Dst = pPRT_pDst[i + 2];
-                //p4_Dst = pPRT_pDst[i + 3];
-                //if(core_id_in == 0) {
-                //    switch (4 - l % 4) {
-                //        case (1):
-                //            in1 = pSrcT1[j];
-                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-                //            break;
-                //        case (2):
-                //            in1 = pSrcT1[j];
-                //            in2 = pSrcT1[j + 1];
-                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-                //            break;
-                //        case (3):
-                //            in1 = pSrcT1[j];
-                //            in2 = pSrcT1[j + 1];
-                //            in3 = pSrcT1[j + 2];
-                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-                //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-                //            break;
-                //        case (4):
-                //            in1 = pSrcT1[j];
-                //            in2 = pSrcT1[j + 1];
-                //            in3 = pSrcT1[j + 2];
-                //            in4 = pSrcT1[j + 3];
-                //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-                //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-                //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-                //            pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
-                //            break;
-                //    }
-                //} else {
-                //    in1 = pSrcT1[j];
-                //    in2 = pSrcT1[j + 1];
-                //    in3 = pSrcT1[j + 2];
-                //    in4 = pSrcT1[j + 3];
-                //    pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-                //    pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-                //    pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-                //    pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
-                //}
-                //in1 = pSrcT2[i];
-                //in2 = pSrcT2[i + 1];
-                //in3 = pSrcT2[i + 2];
-                //in4 = pSrcT2[i + 3];
-                //pSrcT2[i]     = in1 - FIX_MUL(in, p1_Dst);
-                //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst);
-                //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst);
-                //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst);
-            }
-        }
-        mempool_log_barrier(2, absolute_core_id);
+        // uint32_t core_id_in;
+        // uint32_t core_id_Dst;
+        // int32_t p1_in, p2_in, p3_in, p4_in;
+        // int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst;
+        // core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U);
+        // core_id_Dst = absolute_core_id % (n >> 2U);
+        // j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4);
+        // i = core_id_Dst * 4;
+        // p1_in = pPRT_in[j];
+        // p2_in = pPRT_in[j + 1];
+        // p3_in = pPRT_in[j + 2];
+        // p4_in = pPRT_in[j + 3];
+        // p1_Dst = pPRT_pDst[i];
+        // p2_Dst = pPRT_pDst[i + 1];
+        // p3_Dst = pPRT_pDst[i + 2];
+        // p4_Dst = pPRT_pDst[i + 3];
+        // if(core_id_in == 0) {
+        //    switch (4 - l % 4) {
+        //        case (1):
+        //            in1 = pSrcT1[j];
+        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+        //            break;
+        //        case (2):
+        //            in1 = pSrcT1[j];
+        //            in2 = pSrcT1[j + 1];
+        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+        //            break;
+        //        case (3):
+        //            in1 = pSrcT1[j];
+        //            in2 = pSrcT1[j + 1];
+        //            in3 = pSrcT1[j + 2];
+        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+        //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+        //            break;
+        //        case (4):
+        //            in1 = pSrcT1[j];
+        //            in2 = pSrcT1[j + 1];
+        //            in3 = pSrcT1[j + 2];
+        //            in4 = pSrcT1[j + 3];
+        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+        //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+        //            pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
+        //            break;
+        //    }
+        //} else {
+        //    in1 = pSrcT1[j];
+        //    in2 = pSrcT1[j + 1];
+        //    in3 = pSrcT1[j + 2];
+        //    in4 = pSrcT1[j + 3];
+        //    pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
+        //    pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
+        //    pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
+        //    pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
+        //}
+        // in1 = pSrcT2[i];
+        // in2 = pSrcT2[i + 1];
+        // in3 = pSrcT2[i + 2];
+        // in4 = pSrcT2[i + 3];
+        // pSrcT2[i]     = in1 - FIX_MUL(in, p1_Dst);
+        // pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst);
+        // pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst);
+        // pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst);
+      }
+    }
+    mempool_log_barrier(2, absolute_core_id);
 
-//        /* REPLACE ROWS */
-//        pSrcT1 = pSrc;
-//        pSrcT2 = pDst;
-//        /* Reference row pointers */
-//        pPRT_in = pSrc + (l * n);
-//        pPRT_pDst = pDst + (l * n);
-//        int32_t pivot = *pPRT_in;
-//        uint32_t nPE = (n >> 2U);
-//        uint32_t check = 0;
-//        if (absolute_core_id >= m * nPE)
-//            mempool_wfi();
-//        for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) {
-//            /* Only the columns to the right of the pivot are to be processed */
-//            if (k != l) {
-//                pSrcT1 = pSrc + k * n;
-//                pSrcT2 = pDst + k * n;
-//                /* Element of the reference row */
-//                in = *pSrcT1;
-//                /* Loop over the columns */
-//                core_id = absolute_core_id % nPE;
-//                core_id = core_id - (l >> 2U);
-//                j = core_id * 4;
-//                while (j < 4 * ((n - l) >> 2U)) {
-//                    out1 = pPRT_in[j];
-//                    out2 = pPRT_in[j + 1];
-//                    out3 = pPRT_in[j + 2];
-//                    out4 = pPRT_in[j + 3];
-//                    out1 = FIX_DIV(out1, pivot);
-//                    out2 = FIX_DIV(out2, pivot);
-//                    out3 = FIX_DIV(out3, pivot);
-//                    out4 = FIX_DIV(out4, pivot);
-//                    in1 = pSrcT1[j];
-//                    in2 = pSrcT1[j + 1];
-//                    in3 = pSrcT1[j + 2];
-//                    in4 = pSrcT1[j + 3];
-//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4 * (n >> 2U);
-//                }
-//                if (core_id == 0) {
-//                    j = 4 * ((n - l) >> 2U);
-//                    while (j < n - l) {
-//                        out1 = pPRT_in[j];
-//                        out1 = FIX_DIV(out1, pivot);
-//                        in1 = pSrcT1[j];
-//                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                        j++;
-//                    }
-//                }
-//                /* Loop over the columns */
-//                core_id = absolute_core_id % nPE;
-//                j = core_id * 4;
-//                while (j < 4 * (n >> 2U)) {
-//                    out1 = pPRT_pDst[j];
-//                    out2 = pPRT_pDst[j + 1];
-//                    out3 = pPRT_pDst[j + 2];
-//                    out4 = pPRT_pDst[j + 3];
-//                    out1 = FIX_DIV(out1, pivot);
-//                    out2 = FIX_DIV(out2, pivot);
-//                    out3 = FIX_DIV(out3, pivot);
-//                    out4 = FIX_DIV(out4, pivot);
-//                    in1 = pSrcT2[j];
-//                    in2 = pSrcT2[j + 1];
-//                    in3 = pSrcT2[j + 2];
-//                    in4 = pSrcT2[j + 3];
-//                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4 * nPE;
-//                }
-//                __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED);
-//                mempool_wfi();
-//            } else {
-//                do {
-//                    check = __atomic_fetch_add(&pivot_barrier, 0, __ATOMIC_RELAXED);
-//                    mempool_wait(20);
-//                } while (check < ((m - 1) * nPE));
-//                /* Loop over the columns */
-//                core_id = absolute_core_id % (n >> 2U);
-//                core_id = core_id - (l >> 2U);
-//                j = core_id * 4;
-//                while (j < 4 * ((n - l) >> 2U)) {
-//                    in1 = pPRT_in[j];
-//                    in2 = pPRT_in[j + 1];
-//                    in3 = pPRT_in[j + 2];
-//                    in4 = pPRT_in[j + 3];
-//                    out1 = FIX_DIV(in1, pivot);
-//                    out2 = FIX_DIV(in2, pivot);
-//                    out3 = FIX_DIV(in3, pivot);
-//                    out4 = FIX_DIV(in4, pivot);
-//                    pPRT_in[j] = out1;
-//                    pPRT_in[j + 1] = out2;
-//                    pPRT_in[j + 2] = out3;
-//                    pPRT_in[j + 3] = out4;
-//                    j += 4 * (n >> 2U);
-//                }
-//                if (core_id == 0) {
-//                    j = 4 * ((n - l) >> 2U);
-//                    while (j < n - l) {
-//                        in1 = pPRT_in[j];
-//                        pPRT_in[j] = FIX_DIV(in1, pivot);
-//                        j++;
-//                    }
-//                }
-//                /* Loop over the columns */
-//                core_id = absolute_core_id % (n >> 2U);
-//                j = core_id * 4;
-//                while (j < 4 * (n >> 2U)) {
-//                    in1 = pPRT_pDst[j];
-//                    in2 = pPRT_pDst[j + 1];
-//                    in3 = pPRT_pDst[j + 2];
-//                    in4 = pPRT_pDst[j + 3];
-//                    out1 = FIX_DIV(in1, pivot);
-//                    out2 = FIX_DIV(in2, pivot);
-//                    out3 = FIX_DIV(in3, pivot);
-//                    out4 = FIX_DIV(in4, pivot);
-//                    pPRT_pDst[j] = out1;
-//                    pPRT_pDst[j + 1] = out2;
-//                    pPRT_pDst[j + 2] = out3;
-//                    pPRT_pDst[j + 3] = out4;
-//                    j += 4 * (n >> 2U);
-//                }
-//                if (core_id == (n >> 2U) - 1) {
-//                    j = 4 * (n >> 2U);
-//                    while (j < n) {
-//                        in1 = pPRT_pDst[j];
-//                        pPRT_pDst[j] = FIX_DIV(in1, pivot);
-//                        j++;
-//                    }
-//                }
-//                if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED)) {
-//                    __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED);
-//                    __sync_synchronize();
-//                    wake_up_all();
-//                }
-//                mempool_wfi();
-//            }
-//        }
+    //        /* REPLACE ROWS */
+    //        pSrcT1 = pSrc;
+    //        pSrcT2 = pDst;
+    //        /* Reference row pointers */
+    //        pPRT_in = pSrc + (l * n);
+    //        pPRT_pDst = pDst + (l * n);
+    //        int32_t pivot = *pPRT_in;
+    //        uint32_t nPE = (n >> 2U);
+    //        uint32_t check = 0;
+    //        if (absolute_core_id >= m * nPE)
+    //            mempool_wfi();
+    //        for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) {
+    //            /* Only columns to the right of the pivot are processed */
+    //            if (k != l) {
+    //                pSrcT1 = pSrc + k * n;
+    //                pSrcT2 = pDst + k * n;
+    //                /* Element of the reference row */
+    //                in = *pSrcT1;
+    //                /* Loop over the columns */
+    //                core_id = absolute_core_id % nPE;
+    //                core_id = core_id - (l >> 2U);
+    //                j = core_id * 4;
+    //                while (j < 4 * ((n - l) >> 2U)) {
+    //                    out1 = pPRT_in[j];
+    //                    out2 = pPRT_in[j + 1];
+    //                    out3 = pPRT_in[j + 2];
+    //                    out4 = pPRT_in[j + 3];
+    //                    out1 = FIX_DIV(out1, pivot);
+    //                    out2 = FIX_DIV(out2, pivot);
+    //                    out3 = FIX_DIV(out3, pivot);
+    //                    out4 = FIX_DIV(out4, pivot);
+    //                    in1 = pSrcT1[j];
+    //                    in2 = pSrcT1[j + 1];
+    //                    in3 = pSrcT1[j + 2];
+    //                    in4 = pSrcT1[j + 3];
+    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * (n >> 2U);
+    //                }
+    //                if (core_id == 0) {
+    //                    j = 4 * ((n - l) >> 2U);
+    //                    while (j < n - l) {
+    //                        out1 = pPRT_in[j];
+    //                        out1 = FIX_DIV(out1, pivot);
+    //                        in1 = pSrcT1[j];
+    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                        j++;
+    //                    }
+    //                }
+    //                /* Loop over the columns */
+    //                core_id = absolute_core_id % nPE;
+    //                j = core_id * 4;
+    //                while (j < 4 * (n >> 2U)) {
+    //                    out1 = pPRT_pDst[j];
+    //                    out2 = pPRT_pDst[j + 1];
+    //                    out3 = pPRT_pDst[j + 2];
+    //                    out4 = pPRT_pDst[j + 3];
+    //                    out1 = FIX_DIV(out1, pivot);
+    //                    out2 = FIX_DIV(out2, pivot);
+    //                    out3 = FIX_DIV(out3, pivot);
+    //                    out4 = FIX_DIV(out4, pivot);
+    //                    in1 = pSrcT2[j];
+    //                    in2 = pSrcT2[j + 1];
+    //                    in3 = pSrcT2[j + 2];
+    //                    in4 = pSrcT2[j + 3];
+    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * nPE;
+    //                }
+    //                __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED);
+    //                mempool_wfi();
+    //            } else {
+    //                do {
+    //                    check = __atomic_fetch_add(&pivot_barrier, 0,
+    //                    __ATOMIC_RELAXED); mempool_wait(20);
+    //                } while (check < ((m - 1) * nPE));
+    //                /* Loop over the columns */
+    //                core_id = absolute_core_id % (n >> 2U);
+    //                core_id = core_id - (l >> 2U);
+    //                j = core_id * 4;
+    //                while (j < 4 * ((n - l) >> 2U)) {
+    //                    in1 = pPRT_in[j];
+    //                    in2 = pPRT_in[j + 1];
+    //                    in3 = pPRT_in[j + 2];
+    //                    in4 = pPRT_in[j + 3];
+    //                    out1 = FIX_DIV(in1, pivot);
+    //                    out2 = FIX_DIV(in2, pivot);
+    //                    out3 = FIX_DIV(in3, pivot);
+    //                    out4 = FIX_DIV(in4, pivot);
+    //                    pPRT_in[j] = out1;
+    //                    pPRT_in[j + 1] = out2;
+    //                    pPRT_in[j + 2] = out3;
+    //                    pPRT_in[j + 3] = out4;
+    //                    j += 4 * (n >> 2U);
+    //                }
+    //                if (core_id == 0) {
+    //                    j = 4 * ((n - l) >> 2U);
+    //                    while (j < n - l) {
+    //                        in1 = pPRT_in[j];
+    //                        pPRT_in[j] = FIX_DIV(in1, pivot);
+    //                        j++;
+    //                    }
+    //                }
+    //                /* Loop over the columns */
+    //                core_id = absolute_core_id % (n >> 2U);
+    //                j = core_id * 4;
+    //                while (j < 4 * (n >> 2U)) {
+    //                    in1 = pPRT_pDst[j];
+    //                    in2 = pPRT_pDst[j + 1];
+    //                    in3 = pPRT_pDst[j + 2];
+    //                    in4 = pPRT_pDst[j + 3];
+    //                    out1 = FIX_DIV(in1, pivot);
+    //                    out2 = FIX_DIV(in2, pivot);
+    //                    out3 = FIX_DIV(in3, pivot);
+    //                    out4 = FIX_DIV(in4, pivot);
+    //                    pPRT_pDst[j] = out1;
+    //                    pPRT_pDst[j + 1] = out2;
+    //                    pPRT_pDst[j + 2] = out3;
+    //                    pPRT_pDst[j + 3] = out4;
+    //                    j += 4 * (n >> 2U);
+    //                }
+    //                if (core_id == (n >> 2U) - 1) {
+    //                    j = 4 * (n >> 2U);
+    //                    while (j < n) {
+    //                        in1 = pPRT_pDst[j];
+    //                        pPRT_pDst[j] = FIX_DIV(in1, pivot);
+    //                        j++;
+    //                    }
+    //                }
+    //                if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1,
+    //                __ATOMIC_RELAXED)) {
+    //                    __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED);
+    //                    __sync_synchronize();
+    //                    wake_up_all();
+    //                }
+    //                mempool_wfi();
+    //            }
+    //        }
 
-//        /* REPLACE ROWS */
-//        pSrcT1 = pSrc;
-//        pSrcT2 = pDst;
-//        for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) {
-//            k = i / n;
-//            if (k != l) {
-//                in = *(pSrc + k * n);
-//                j = i - (k * n);
-//                if (j >= 4 * (l >> 2U)) {
-//                    if (j == 4 * (l >> 2U)) {
-//                        pSrcT1 = pSrc + k * n;
-//                        pPRT_in = pPivotRowIn;
-//                        uint32_t bound = j + 4 - l;
-//                        j = 0;
-//                        while (j < bound) {
-//                            in1 = *pSrcT1;
-//                            out1 = *pPRT_in++;
-//                            *pSrcT1++ = in1 - FIX_MUL(in, out1);
-//                            j++;
-//                        }
-//                    } else {
-//                        pSrcT1 = pSrc + (i - l);
-//                        pPRT_in = pPivotRowIn + (j - l);
-//                        in1 = *pSrcT1;
-//                        in2 = *(pSrcT1 + 1);
-//                        in3 = *(pSrcT1 + 2);
-//                        in4 = *(pSrcT1 + 3);
-//                        out1 = *pPRT_in++;
-//                        out2 = *pPRT_in++;
-//                        out3 = *pPRT_in++;
-//                        out4 = *pPRT_in++;
-//                        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-//                        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-//                        *pSrcT1++ = in3 - FIX_MUL(in, out3);
-//                        *pSrcT1++ = in4 - FIX_MUL(in, out4);
-//                    }
-//                }
-//                pSrcT2 = pDst + i;
-//                pPRT_pDst = pPivotRowDst + j;
-//                in1 = *pSrcT2;
-//                in2 = *(pSrcT2 + 1);
-//                in3 = *(pSrcT2 + 2);
-//                in4 = *(pSrcT2 + 3);
-//                out1 = *pPRT_pDst++;
-//                out2 = *pPRT_pDst++;
-//                out3 = *pPRT_pDst++;
-//                out4 = *pPRT_pDst++;
-//                *pSrcT2++ = in1 - FIX_MUL(in, out1);
-//                *pSrcT2++ = in2 - FIX_MUL(in, out2);
-//                *pSrcT2++ = in3 - FIX_MUL(in, out3);
-//                *pSrcT2++ = in4 - FIX_MUL(in, out4);
-//            }
-//        }
-//        mempool_log_barrier(2, absolute_core_id);
-//        /* REPLACE ROWS */
-//        pSrcT1 = pSrc;
-//        pSrcT2 = pDst;
-//        core_id = absolute_core_id;
-//        for (k = core_id; k < m; k += NUM_CORES) {
-//            /* Only the columns to the right of the pivot are to be processed */
-//            if (k != l) {
-//                pSrcT1 = pSrc + k * n;
-//                pSrcT2 = pDst + k * n;
-//                /* Element of the reference row */
-//                in = *pSrcT1;
-//                /* Reference row pointers */
-//                pPRT_in = pPivotRowIn;
-//                pPRT_pDst = pPivotRowDst;
-//                /* Loop over the columns */
-//                j = 0;
-//                while (j < 4 * ((n - l) >> 2U)) {
-//                    in1 = pSrcT1[j];
-//                    in2 = pSrcT1[j + 1];
-//                    in3 = pSrcT1[j + 2];
-//                    in4 = pSrcT1[j + 3];
-//                    out1 = pPRT_in[j];
-//                    out2 = pPRT_in[j + 1];
-//                    out3 = pPRT_in[j + 2];
-//                    out4 = pPRT_in[j + 3];
-//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4;
-//                }
-//                while (j < n - l) {
-//                    in1 = pSrcT1[j];
-//                    out1 = pPRT_in[j];
-//                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-//                    j++;
-//                }
-//                /* Loop over the columns */
-//                j = 0;
-//                while (j < 4 * (n >> 2U)) {
-//                    in1 = pSrcT2[j];
-//                    in2 = pSrcT2[j + 1];
-//                    in3 = pSrcT2[j + 2];
-//                    in4 = pSrcT2[j + 3];
-//                    out1 = pPRT_pDst[j];
-//                    out2 = pPRT_pDst[j + 1];
-//                    out3 = pPRT_pDst[j + 2];
-//                    out4 = pPRT_pDst[j + 3];
-//                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
-//                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-//                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-//                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-//                    j += 4;
-//                }
-//                while (j < n) {
-//                    in1 = pSrcT2[j];
-//                    out1 = pPRT_pDst[j];
-//                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
-//                    j++;
-//                }
-//            }
-//        }
-//        mempool_log_barrier(2, absolute_core_id);
+    //        /* REPLACE ROWS */
+    //        pSrcT1 = pSrc;
+    //        pSrcT2 = pDst;
+    //        for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) {
+    //            k = i / n;
+    //            if (k != l) {
+    //                in = *(pSrc + k * n);
+    //                j = i - (k * n);
+    //                if (j >= 4 * (l >> 2U)) {
+    //                    if (j == 4 * (l >> 2U)) {
+    //                        pSrcT1 = pSrc + k * n;
+    //                        pPRT_in = pPivotRowIn;
+    //                        uint32_t bound = j + 4 - l;
+    //                        j = 0;
+    //                        while (j < bound) {
+    //                            in1 = *pSrcT1;
+    //                            out1 = *pPRT_in++;
+    //                            *pSrcT1++ = in1 - FIX_MUL(in, out1);
+    //                            j++;
+    //                        }
+    //                    } else {
+    //                        pSrcT1 = pSrc + (i - l);
+    //                        pPRT_in = pPivotRowIn + (j - l);
+    //                        in1 = *pSrcT1;
+    //                        in2 = *(pSrcT1 + 1);
+    //                        in3 = *(pSrcT1 + 2);
+    //                        in4 = *(pSrcT1 + 3);
+    //                        out1 = *pPRT_in++;
+    //                        out2 = *pPRT_in++;
+    //                        out3 = *pPRT_in++;
+    //                        out4 = *pPRT_in++;
+    //                        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+    //                        *pSrcT1++ = in2 - FIX_MUL(in, out2);
+    //                        *pSrcT1++ = in3 - FIX_MUL(in, out3);
+    //                        *pSrcT1++ = in4 - FIX_MUL(in, out4);
+    //                    }
+    //                }
+    //                pSrcT2 = pDst + i;
+    //                pPRT_pDst = pPivotRowDst + j;
+    //                in1 = *pSrcT2;
+    //                in2 = *(pSrcT2 + 1);
+    //                in3 = *(pSrcT2 + 2);
+    //                in4 = *(pSrcT2 + 3);
+    //                out1 = *pPRT_pDst++;
+    //                out2 = *pPRT_pDst++;
+    //                out3 = *pPRT_pDst++;
+    //                out4 = *pPRT_pDst++;
+    //                *pSrcT2++ = in1 - FIX_MUL(in, out1);
+    //                *pSrcT2++ = in2 - FIX_MUL(in, out2);
+    //                *pSrcT2++ = in3 - FIX_MUL(in, out3);
+    //                *pSrcT2++ = in4 - FIX_MUL(in, out4);
+    //            }
+    //        }
+    //        mempool_log_barrier(2, absolute_core_id);
+    //        /* REPLACE ROWS */
+    //        pSrcT1 = pSrc;
+    //        pSrcT2 = pDst;
+    //        core_id = absolute_core_id;
+    //        for (k = core_id; k < m; k += NUM_CORES) {
+    //            /* Only columns to the right of the pivot are processed */
+    //            if (k != l) {
+    //                pSrcT1 = pSrc + k * n;
+    //                pSrcT2 = pDst + k * n;
+    //                /* Element of the reference row */
+    //                in = *pSrcT1;
+    //                /* Reference row pointers */
+    //                pPRT_in = pPivotRowIn;
+    //                pPRT_pDst = pPivotRowDst;
+    //                /* Loop over the columns */
+    //                j = 0;
+    //                while (j < 4 * ((n - l) >> 2U)) {
+    //                    in1 = pSrcT1[j];
+    //                    in2 = pSrcT1[j + 1];
+    //                    in3 = pSrcT1[j + 2];
+    //                    in4 = pSrcT1[j + 3];
+    //                    out1 = pPRT_in[j];
+    //                    out2 = pPRT_in[j + 1];
+    //                    out3 = pPRT_in[j + 2];
+    //                    out4 = pPRT_in[j + 3];
+    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4;
+    //                }
+    //                while (j < n - l) {
+    //                    in1 = pSrcT1[j];
+    //                    out1 = pPRT_in[j];
+    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                    j++;
+    //                }
+    //                /* Loop over the columns */
+    //                j = 0;
+    //                while (j < 4 * (n >> 2U)) {
+    //                    in1 = pSrcT2[j];
+    //                    in2 = pSrcT2[j + 1];
+    //                    in3 = pSrcT2[j + 2];
+    //                    in4 = pSrcT2[j + 3];
+    //                    out1 = pPRT_pDst[j];
+    //                    out2 = pPRT_pDst[j + 1];
+    //                    out3 = pPRT_pDst[j + 2];
+    //                    out4 = pPRT_pDst[j + 3];
+    //                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
+    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4;
+    //                }
+    //                while (j < n) {
+    //                    in1 = pSrcT2[j];
+    //                    out1 = pPRT_pDst[j];
+    //                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
+    //                    j++;
+    //                }
+    //            }
+    //        }
+    //        mempool_log_barrier(2, absolute_core_id);
 
-        pSrc++;     /* Increment the input pointer */
-        l++;        /* Increment the index modifier */
-    }
-    mempool_log_barrier(2, absolute_core_id);
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+  mempool_log_barrier(2, absolute_core_id);
 
-    return 0;
+  return 0;
 }
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h
index 21aadbe39..a20b918e0 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32s.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h
@@ -8,302 +8,303 @@
 
 int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
-int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) {
+int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
 
-    int32_t *pSrcT1, *pSrcT2;                    /* Temporary input data matrix pointer */
-    int32_t *pDstT1, *pDstT2;                    /* Temporary output data matrix pointer */
-    int32_t *pPivotRowIn;                        /* Temporary input and output data matrix pointer */
-    int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
 
-    int32_t in = 0;
-    int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-    int32_t in1, in2, in3, in4;
-    int32_t out1, out2, out3, out4;
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
 
-    uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */
-    uint32_t i, j, k, l; /* loop counters */
-    uint32_t flag = 0U; /* Flag to check if the matrix is singular */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matrices must be square. */
+  uint32_t i, j, k, l; /* loop counters */
+  uint32_t flag = 0U;  /* Flag to check if the matrix is singular */
 
-    pDstT1 = pDst;  /* Working pointer for destination matrix */
-    /* CREATE THE IDENTITY MATRIX */
-    for (k = 0; k < m; k += 4) {
-        for (j = 0; j < n; j++) {
-            pDstT1[k * m + j] = (uint32_t) (k == j);
-            pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j);
-            pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j);
-            pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j);
-        }
+  pDstT1 = pDst; /* Working pointer for destination matrix */
+  /* CREATE THE IDENTITY MATRIX */
+  for (k = 0; k < m; k += 4) {
+    for (j = 0; j < n; j++) {
+      pDstT1[k * m + j] = (uint32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j);
     }
+  }
 
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-    while (l < n) {
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
 
-        pSrcT1 = pSrc + (l * n);
-        pDstT1 = pDst + (l * n);
-        k = 1U;
-        in = *pSrcT1;
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    k = 1U;
+    in = *pSrcT1;
 
-        /* CHECK IF PIVOT ELEMENT IS ZERO */
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
 
-        if (in == 0) {
-            /* Loop over the rows present below */
-            for (i = (l + 1U); i < m; i++) {
-                pSrcT2 = pSrc + (n * i);
-                pDstT2 = pDstT1 + (n * k);
+    if (in == 0) {
+      /* Loop over the rows present below */
+      for (i = (l + 1U); i < m; i++) {
+        pSrcT2 = pSrc + (n * i);
+        pDstT2 = pDstT1 + (n * k);
 
-                /* EXCHANGE */
+        /* EXCHANGE */
 
-                if (*pSrcT2 != 0) {
-                    /* Loop over colums to the right of the pivot */
-                    j = 0;
-                    while (j < (n - l) - (n - l) % 4) {
-                        Xchg1 = *(pSrcT2);
-                        Xchg2 = *(pSrcT2 + 1);
-                        Xchg3 = *(pSrcT2 + 2);
-                        Xchg4 = *(pSrcT2 + 3);
-                        out1 = *(pSrcT1);
-                        out2 = *(pSrcT1 + 1);
-                        out3 = *(pSrcT1 + 2);
-                        out4 = *(pSrcT1 + 3);
-                        *pSrcT2++ = out1;
-                        *pSrcT2++ = out2;
-                        *pSrcT2++ = out3;
-                        *pSrcT2++ = out4;
-                        *pSrcT1++ = Xchg1;
-                        *pSrcT1++ = Xchg2;
-                        *pSrcT1++ = Xchg3;
-                        *pSrcT1++ = Xchg4;
-                        j += 4;
-                    }
-                    while (j < n - l) {
-                      Xchg1 = *pSrcT2;
-                      *pSrcT2++ = *pSrcT1;
-                      *pSrcT1++ = Xchg1;
-                      j++;
-                    }
-                    /* Loop over colums */
-                    j = 0;
-                    while (j < n - n % 4) {
-                        Xchg1 = *(pDstT2);
-                        Xchg2 = *(pDstT2 + 1);
-                        Xchg3 = *(pDstT2 + 2);
-                        Xchg4 = *(pDstT2 + 3);
-                        out1 = *(pDstT1);
-                        out2 = *(pDstT1 + 1);
-                        out3 = *(pDstT1 + 2);
-                        out4 = *(pDstT1 + 3);
-                        *pDstT2++ = out1;
-                        *pDstT2++ = out2;
-                        *pDstT2++ = out3;
-                        *pDstT2++ = out4;
-                        *pDstT1++ = Xchg1;
-                        *pDstT1++ = Xchg2;
-                        *pDstT1++ = Xchg3;
-                        *pDstT1++ = Xchg4;
-                        j += 4;
-                    }
-                    while (j < n) {
-                        Xchg1 = *pDstT2;
-                        *pDstT2++ = *pDstT1;
-                        *pDstT1++ = Xchg1;
-                        j++;
-                    }
-                    flag = 1U;
-                    break;
-                }
-                k++;
-            }
-        }
-        /* Return when the matrix is singular */
-        if ((flag == 0U) && (in == 0)) {
-            return 1;
+        if (*pSrcT2 != 0) {
+          /* Loop over columns to the right of the pivot */
+          j = 0;
+          while (j < (n - l) - (n - l) % 4) {
+            Xchg1 = *(pSrcT2);
+            Xchg2 = *(pSrcT2 + 1);
+            Xchg3 = *(pSrcT2 + 2);
+            Xchg4 = *(pSrcT2 + 3);
+            out1 = *(pSrcT1);
+            out2 = *(pSrcT1 + 1);
+            out3 = *(pSrcT1 + 2);
+            out4 = *(pSrcT1 + 3);
+            *pSrcT2++ = out1;
+            *pSrcT2++ = out2;
+            *pSrcT2++ = out3;
+            *pSrcT2++ = out4;
+            *pSrcT1++ = Xchg1;
+            *pSrcT1++ = Xchg2;
+            *pSrcT1++ = Xchg3;
+            *pSrcT1++ = Xchg4;
+            j += 4;
+          }
+          while (j < n - l) {
+            Xchg1 = *pSrcT2;
+            *pSrcT2++ = *pSrcT1;
+            *pSrcT1++ = Xchg1;
+            j++;
+          }
+          /* Loop over columns */
+          j = 0;
+          while (j < n - n % 4) {
+            Xchg1 = *(pDstT2);
+            Xchg2 = *(pDstT2 + 1);
+            Xchg3 = *(pDstT2 + 2);
+            Xchg4 = *(pDstT2 + 3);
+            out1 = *(pDstT1);
+            out2 = *(pDstT1 + 1);
+            out3 = *(pDstT1 + 2);
+            out4 = *(pDstT1 + 3);
+            *pDstT2++ = out1;
+            *pDstT2++ = out2;
+            *pDstT2++ = out3;
+            *pDstT2++ = out4;
+            *pDstT1++ = Xchg1;
+            *pDstT1++ = Xchg2;
+            *pDstT1++ = Xchg3;
+            *pDstT1++ = Xchg4;
+            j += 4;
+          }
+          while (j < n) {
+            Xchg1 = *pDstT2;
+            *pDstT2++ = *pDstT1;
+            *pDstT1++ = Xchg1;
+            j++;
+          }
+          flag = 1U;
+          break;
         }
+        k++;
+      }
+    }
+    /* Return when the matrix is singular */
+    if ((flag == 0U) && (in == 0)) {
+      return 1;
+    }
 
-        /* DIVIDE BY THE PIVOT */
+    /* DIVIDE BY THE PIVOT */
 
-        /* Points to the pivot row of input and destination matrices */
-        pPivotRowIn = pSrc + (l * n);
-        pPivotRowDst = pDst + (l * n);
-        /* Temporary pointers to the pivot row pointers */
-        pSrcT1 = pPivotRowIn;
-        pSrcT2 = pPivotRowDst;
-        /* Pivot element of the row */
-        in = *pPivotRowIn;
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
 
-        /* Loop over number of columns to the right of the pilot element */
+    /* Loop over number of columns to the right of the pivot element */
+    j = 0;
+    while (j < 4 * ((n - l) >> 2U)) {
+      in1 = *pSrcT1;
+      in2 = *(pSrcT1 + 1);
+      in3 = *(pSrcT1 + 2);
+      in4 = *(pSrcT1 + 3);
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      *pSrcT1++ = out1;
+      *pSrcT1++ = out2;
+      *pSrcT1++ = out3;
+      *pSrcT1++ = out4;
+      j += 4;
+    }
+    while (j < n - l) {
+      in1 = *pSrcT1;
+      *pSrcT1++ = FIX_DIV(in1, in);
+      j++;
+    }
+    // switch ((n - l) % 4) {
+    //    case 3:
+    //        in1 = *pSrcT1;
+    //        in2 = *(pSrcT1 + 1);
+    //        in3 = *(pSrcT1 + 2);
+    //        out1 = FIX_DIV(in1, in);
+    //        out2 = FIX_DIV(in2, in);
+    //        out3 = FIX_DIV(in3, in);
+    //        *pSrcT1++ = out1;
+    //        *pSrcT1++ = out2;
+    //        *pSrcT1++ = out3;
+    //        break;
+    //    case 2:
+    //        in1 = *pSrcT1;
+    //        in2 = *(pSrcT1 + 1);
+    //        out1 = FIX_DIV(in1, in);
+    //        out2 = FIX_DIV(in2, in);
+    //        *pSrcT1++ = out1;
+    //        *pSrcT1++ = out2;
+    //        break;
+    //    case 1:
+    //        in1 = *pSrcT1;
+    //        out1 = FIX_DIV(in1, in);
+    //        *pSrcT1++ = out1;
+    //        break;
+    //}
+    /* Loop over number of columns of the destination matrix */
+    j = 0;
+    while (j < 4 * (n >> 2U)) {
+      in1 = *pSrcT2;
+      in2 = *(pSrcT2 + 1);
+      in3 = *(pSrcT2 + 2);
+      in4 = *(pSrcT2 + 3);
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      *pSrcT2++ = out1;
+      *pSrcT2++ = out2;
+      *pSrcT2++ = out3;
+      *pSrcT2++ = out4;
+      j += 4;
+    }
+    while (j < n) {
+      in1 = *pSrcT2;
+      *pSrcT2++ = FIX_DIV(in1, in);
+      j++;
+    }
+
+    /* REPLACE ROWS */
+
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    i = 0U; /* pivot index */
+    k = m;  /* row index */
+    while (k > 0U) {
+      /* Only the columns to the right of the pivot are to be processed */
+      if (i == l) {
+        pSrcT1 += n - l;
+        pSrcT2 += n;
+      } else {
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
         j = 0;
         while (j < 4 * ((n - l) >> 2U)) {
-            in1 = *pSrcT1;
-            in2 = *(pSrcT1 + 1);
-            in3 = *(pSrcT1 + 2);
-            in4 = *(pSrcT1 + 3);
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            *pSrcT1++ = out1;
-            *pSrcT1++ = out2;
-            *pSrcT1++ = out3;
-            *pSrcT1++ = out4;
-            j += 4;
+          in1 = *pSrcT1;
+          in2 = *(pSrcT1 + 1);
+          in3 = *(pSrcT1 + 2);
+          in4 = *(pSrcT1 + 3);
+          out1 = *pPRT_in++;
+          out2 = *pPRT_in++;
+          out3 = *pPRT_in++;
+          out4 = *pPRT_in++;
+          *pSrcT1++ = in1 - FIX_MUL(in, out1);
+          *pSrcT1++ = in2 - FIX_MUL(in, out2);
+          *pSrcT1++ = in3 - FIX_MUL(in, out3);
+          *pSrcT1++ = in4 - FIX_MUL(in, out4);
+          j += 4;
         }
         while (j < n - l) {
-            in1 = *pSrcT1;
-            *pSrcT1++ = FIX_DIV(in1, in);
-            j++;
+          in1 = *pSrcT1;
+          out1 = *pPRT_in++;
+          *pSrcT1++ = in1 - FIX_MUL(in, out1);
+          j++;
         }
-        //switch ((n - l) % 4) {
+        // switch ((n - l) % 4) {
         //    case 3:
         //        in1 = *pSrcT1;
         //        in2 = *(pSrcT1 + 1);
         //        in3 = *(pSrcT1 + 2);
-        //        out1 = FIX_DIV(in1, in);
-        //        out2 = FIX_DIV(in2, in);
-        //        out3 = FIX_DIV(in3, in);
-        //        *pSrcT1++ = out1;
-        //        *pSrcT1++ = out2;
-        //        *pSrcT1++ = out3;
+        //        out1 = *pPRT_in++;
+        //        out2 = *pPRT_in++;
+        //        out3 = *pPRT_in++;
+        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+        //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
+        //        *pSrcT1++ = in3 - FIX_MUL(in, out3);
         //        break;
         //    case 2:
         //        in1 = *pSrcT1;
         //        in2 = *(pSrcT1 + 1);
-        //        out1 = FIX_DIV(in1, in);
-        //        out2 = FIX_DIV(in2, in);
-        //        *pSrcT1++ = out1;
-        //        *pSrcT1++ = out2;
+        //        out1 = *pPRT_in++;
+        //        out2 = *pPRT_in++;
+        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
+        //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
         //        break;
         //    case 1:
         //        in1 = *pSrcT1;
-        //        out1 = FIX_DIV(in1, in);
-        //        *pSrcT1++ = out1;
+        //        out1 = *pPRT_in++;
+        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
         //        break;
         //}
-        /* Loop over number of columns of the destination matrix */
+        /* Loop over the number of columns to
+           replace the elements in the destination matrix */
         j = 0;
         while (j < 4 * (n >> 2U)) {
-            in1 = *pSrcT2;
-            in2 = *(pSrcT2 + 1);
-            in3 = *(pSrcT2 + 2);
-            in4 = *(pSrcT2 + 3);
-            out1 = FIX_DIV(in1, in);
-            out2 = FIX_DIV(in2, in);
-            out3 = FIX_DIV(in3, in);
-            out4 = FIX_DIV(in4, in);
-            *pSrcT2++ = out1;
-            *pSrcT2++ = out2;
-            *pSrcT2++ = out3;
-            *pSrcT2++ = out4;
-            j += 4;
+          in1 = *pSrcT2;
+          in2 = *(pSrcT2 + 1);
+          in3 = *(pSrcT2 + 2);
+          in4 = *(pSrcT2 + 3);
+          out1 = *pPRT_pDst++;
+          out2 = *pPRT_pDst++;
+          out3 = *pPRT_pDst++;
+          out4 = *pPRT_pDst++;
+          *pSrcT2++ = in1 - FIX_MUL(in, out1);
+          *pSrcT2++ = in2 - FIX_MUL(in, out2);
+          *pSrcT2++ = in3 - FIX_MUL(in, out3);
+          *pSrcT2++ = in4 - FIX_MUL(in, out4);
+          j += 4;
         }
         while (j < n) {
-            in1 = *pSrcT2;
-            *pSrcT2++ = FIX_DIV(in1, in);
-            j++;
-        }
-
-        /* REPLACE ROWS */
-
-        pSrcT1 = pSrc;
-        pSrcT2 = pDst;
-        i = 0U; /* pivot index */
-        k = m; /* row index */
-        while (k > 0U) {
-            /* Only the columns to the right of the pivot are to be processed */
-            if (i == l) {
-                pSrcT1 += n - l;
-                pSrcT2 += n;
-            } else {
-                /* Element of the reference row */
-                in = *pSrcT1;
-                /* Reference row pointers */
-                pPRT_in = pPivotRowIn;
-                pPRT_pDst = pPivotRowDst;
-                j = 0;
-                while (j < 4 * ((n - l) >> 2U)) {
-                    in1 = *pSrcT1;
-                    in2 = *(pSrcT1 + 1);
-                    in3 = *(pSrcT1 + 2);
-                    in4 = *(pSrcT1 + 3);
-                    out1 = *pPRT_in++;
-                    out2 = *pPRT_in++;
-                    out3 = *pPRT_in++;
-                    out4 = *pPRT_in++;
-                    *pSrcT1++ = in1 - FIX_MUL(in, out1);
-                    *pSrcT1++ = in2 - FIX_MUL(in, out2);
-                    *pSrcT1++ = in3 - FIX_MUL(in, out3);
-                    *pSrcT1++ = in4 - FIX_MUL(in, out4);
-                    j += 4;
-                }
-                while (j < n - l) {
-                    in1 = *pSrcT1;
-                    out1 = *pPRT_in++;
-                    *pSrcT1++ = in1 - FIX_MUL(in, out1);
-                    j++;
-                }
-                //switch ((n - l) % 4) {
-                //    case 3:
-                //        in1 = *pSrcT1;
-                //        in2 = *(pSrcT1 + 1);
-                //        in3 = *(pSrcT1 + 2);
-                //        out1 = *pPRT_in++;
-                //        out2 = *pPRT_in++;
-                //        out3 = *pPRT_in++;
-                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-                //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-                //        *pSrcT1++ = in3 - FIX_MUL(in, out3);
-                //        break;
-                //    case 2:
-                //        in1 = *pSrcT1;
-                //        in2 = *(pSrcT1 + 1);
-                //        out1 = *pPRT_in++;
-                //        out2 = *pPRT_in++;
-                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-                //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-                //        break;
-                //    case 1:
-                //        in1 = *pSrcT1;
-                //        out1 = *pPRT_in++;
-                //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-                //        break;
-                //}
-                /* Loop over the number of columns to
-                   replace the elements in the destination matrix */
-                j = 0;
-                while (j < 4 * (n >> 2U)) {
-                    in1 = *pSrcT2;
-                    in2 = *(pSrcT2 + 1);
-                    in3 = *(pSrcT2 + 2);
-                    in4 = *(pSrcT2 + 3);
-                    out1 = *pPRT_pDst++;
-                    out2 = *pPRT_pDst++;
-                    out3 = *pPRT_pDst++;
-                    out4 = *pPRT_pDst++;
-                    *pSrcT2++ = in1 - FIX_MUL(in, out1);
-                    *pSrcT2++ = in2 - FIX_MUL(in, out2);
-                    *pSrcT2++ = in3 - FIX_MUL(in, out3);
-                    *pSrcT2++ = in4 - FIX_MUL(in, out4);
-                    j += 4;
-                }
-                while (j < n) {
-                    in1 = *pSrcT2;
-                    out1 = *pPRT_pDst;
-                    *pSrcT2++ = in1 - FIX_MUL(in, out1);
-                    j++;
-                }
-            }
-            /* Increment temporary input pointer */
-            pSrcT1 = pSrcT1 + l;
-            /* Decrement loop counter */
-            k--;
-            /* Increment pivot index */
-            i++;
+          in1 = *pSrcT2;
+          out1 = *pPRT_pDst;
+          *pSrcT2++ = in1 - FIX_MUL(in, out1);
+          j++;
         }
-
-        pSrc++; /* Increment the input pointer */
-        l++; /* Increment the index modifier */
+      }
+      /* Increment temporary input pointer */
+      pSrcT1 = pSrcT1 + l;
+      /* Decrement loop counter */
+      k--;
+      /* Increment pivot index */
+      i++;
     }
 
-    return 0;
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+
+  return 0;
 }
- 
diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c
index 18e35f510..8a217c0cd 100644
--- a/software/apps/svd/main.c
+++ b/software/apps/svd/main.c
@@ -1,3 +1,9 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
@@ -6,7 +12,6 @@
 #include "nrutil.h"
 #include "svd.c"
 
-
 // Define Matrix dimensions:
 #define M 4
 #define N 32
@@ -42,8 +47,8 @@ void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
   }
 }
 
-void init_vector(int32_t *vector, uint32_t num_el,
-                 int32_t a, int32_t b, uint32_t core_id) {
+void init_vector(int32_t *vector, uint32_t num_el, int32_t a, int32_t b,
+                 uint32_t core_id) {
   uint32_t const split = 8; // How many blocks to split the vector into
   uint32_t const reminder = num_el % split;
   uint32_t i, j;
diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h
index 27b55fec2..a137444ab 100644
--- a/software/apps/svd/nrutil.h
+++ b/software/apps/svd/nrutil.h
@@ -1,3 +1,9 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
 //#include <stdio.h>
 //#include <stddef.h>
 //#include <stdlib.h>
@@ -9,57 +15,69 @@
 #define FREE_ARG char *
 
 static int32_t sqrarg;
-#define SQR(a)     ((sqrarg = (a)) == 0 ? 0 : sqrarg *sqrarg)
+#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg * sqrarg)
 static int32_t dsqrarg;
-#define DSQR(a)    ((dsqrarg = (a)) == 0 ? 0 : dsqrarg *dsqrarg)
+#define DSQR(a) ((dsqrarg = (a)) == 0 ? 0 : dsqrarg * dsqrarg)
 static int32_t dmaxarg1, dmaxarg2;
-#define DMAX(a, b) (dmaxarg1 = (a), dmaxarg2 = (b), (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2))
+#define DMAX(a, b)                                                             \
+  (dmaxarg1 = (a), dmaxarg2 = (b),                                             \
+   (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2))
 static int32_t dminarg1, dminarg2;
-#define DMIN(a, b) (dminarg1 = (a), dminarg2 = (b), (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2))
+#define DMIN(a, b)                                                             \
+  (dminarg1 = (a), dminarg2 = (b),                                             \
+   (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2))
 static int32_t maxarg1, maxarg2;
-#define FMAX(a, b) (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
+#define FMAX(a, b)                                                             \
+  (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
 static int32_t minarg1, minarg2;
-#define FMIN(a, b) (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2))
+#define FMIN(a, b)                                                             \
+  (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2))
 static long lmaxarg1, lmaxarg2;
-#define LMAX(a, b) (lmaxarg1 = (a), lmaxarg2 = (b), (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2))
+#define LMAX(a, b)                                                             \
+  (lmaxarg1 = (a), lmaxarg2 = (b),                                             \
+   (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2))
 static long lminarg1, lminarg2;
-#define LMIN(a, b) (lminarg1 = (a), lminarg2 = (b), (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2))
+#define LMIN(a, b)                                                             \
+  (lminarg1 = (a), lminarg2 = (b),                                             \
+   (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2))
 static int32_t imaxarg1, imaxarg2;
-#define IMAX(a, b) (imaxarg1 = (a), imaxarg2 = (b), (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2))
+#define IMAX(a, b)                                                             \
+  (imaxarg1 = (a), imaxarg2 = (b),                                             \
+   (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2))
 static int32_t iminarg1, iminarg2;
-#define IMIN(a, b) (iminarg1 = (a), iminarg2 = (b), (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
+#define IMIN(a, b)                                                             \
+  (iminarg1 = (a), iminarg2 = (b),                                             \
+   (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
 #define ABS(a) (a < 0 ? -a : a)
 #define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a))
 
-int32_t sqrt_q32  (   const int32_t number,
-                      const uint32_t fracBits);
+int32_t sqrt_q32(const int32_t number, const uint32_t fracBits);
 
 #define sqrt2 0b1011010100000100
-int32_t sqrt_q32  (   const int32_t number,
-                      const uint32_t fracBits) {
+int32_t sqrt_q32(const int32_t number, const uint32_t fracBits) {
 
-    int32_t root = 0;
-    int32_t start = 0;
-    int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF)
-    int32_t mid;
+  int32_t root = 0;
+  int32_t start = 0;
+  int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF)
+  int32_t mid;
 
-    if (number > 0) {
-      while (start <= end) {
-          mid = (start + end) >> 1;
-          if (((mid * mid) >> fracBits) == number) {
-              root = mid;
-              break;
-          }
-          if (((mid * mid) >> fracBits) < number) {
-              start = mid + 1;
-              root = mid;
-          } else {
-              end = mid - 1;
-          }
+  if (number > 0) {
+    while (start <= end) {
+      mid = (start + end) >> 1;
+      if (((mid * mid) >> fracBits) == number) {
+        root = mid;
+        break;
+      }
+      if (((mid * mid) >> fracBits) < number) {
+        start = mid + 1;
+        root = mid;
+      } else {
+        end = mid - 1;
       }
     }
+  }
 
-    return root;
+  return root;
 }
 
 #endif
diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c
index a53c2695b..fa2fcbd0c 100644
--- a/software/apps/svd/svd.c
+++ b/software/apps/svd/svd.c
@@ -1,237 +1,242 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
 int32_t pythag(int32_t a, int32_t b);
 void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v);
 
-
 int32_t pythag(int32_t a, int32_t b) {
-    int32_t absa = ABS(a);
-    int32_t absb = ABS(b);
-    if (absa > absb) {
-        return absa * sqrt_q32(1 + SQR(absb / absa), 4);
-    } else {
-        return (absb == 0 ? 0 : absb * sqrt_q32(1 + SQR(absa / absb), 4));
-    }
+  int32_t absa = ABS(a);
+  int32_t absb = ABS(b);
+  if (absa > absb) {
+    return absa * sqrt_q32(1 + SQR(absb / absa), 4);
+  } else {
+    return (absb == 0 ? 0 : absb * sqrt_q32(1 + SQR(absa / absb), 4));
+  }
 }
 
 void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) {
-    int32_t flag, i, its, j, jj, k, l, nm;
-    int32_t anorm, c, f, g, h, s, scale, x, y, z;
-    int32_t rv1[n];
+  int32_t flag, i, its, j, jj, k, l, nm;
+  int32_t anorm, c, f, g, h, s, scale, x, y, z;
+  int32_t rv1[n];
 
-    //printf("PROVA\n");
+  // printf("PROVA\n");
 
-    g = scale = anorm = 0.0;
-    for (i = 1; i <= n; i++) {
-        l = i + 1;
-        rv1[i] = scale * g;
-        g = s = scale = 0.0;
-        if (i <= m) {
-            for (k = i; k <= m; k++) {
-                scale += ABS(a[k * m + i]);
-            }
-            if (scale) {
-                for (k = i; k <= m; k++) {
-                    a[k * m + i] /= scale;
-                    s += a[k * m + i] * a[k * m + i];
-                }
-                f = a[i * m + i];
-                g = -SIGN(sqrt_q32(s,4), f);
-                h = f * g - s;
-                a[i * m + i] = f - g;
-                for (j = l; j <= n; j++) {
-                    for (s = 0.0, k = i; k <= m; k++) {
-                        s += a[k * m + i] * a[k * m + i];
-                    }
-                    f = s / h;
-                    for (k = i; k <= m; k++) {
-                        a[k * m + i] += f * a[k * m + i];
-                    }
-                }
-                for (k = i; k <= m; k++) {
-                    a[k * m + i] *= scale;
-                }
-            }
+  g = scale = anorm = 0.0;
+  for (i = 1; i <= n; i++) {
+    l = i + 1;
+    rv1[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i <= m) {
+      for (k = i; k <= m; k++) {
+        scale += ABS(a[k * m + i]);
+      }
+      if (scale) {
+        for (k = i; k <= m; k++) {
+          a[k * m + i] /= scale;
+          s += a[k * m + i] * a[k * m + i];
+        }
+        f = a[i * m + i];
+        g = -SIGN(sqrt_q32(s, 4), f);
+        h = f * g - s;
+        a[i * m + i] = f - g;
+        for (j = l; j <= n; j++) {
+          for (s = 0.0, k = i; k <= m; k++) {
+            s += a[k * m + i] * a[k * m + i];
+          }
+          f = s / h;
+          for (k = i; k <= m; k++) {
+            a[k * m + i] += f * a[k * m + i];
+          }
         }
-        w[i] = scale * g;
-        g = s = scale = 0.0;
-        if (i <= m && i != n) {
-            for (k = l; k <= n; k++) {
-                scale += ABS(a[k * m + i]);
-            }
-            if (scale) {
-                for (k = l; k <= n; k++) {
-                    a[k * m + i] /= scale;
-                    s += a[i * m + k] * a[i * m + k];
-                }
-                f = a[i * m + l];
-                g = -SIGN(sqrt_q32(s,4), f);
-                h = f * g - s;
-                a[i * m + l] = f - g;
-                for (k = l; k <= n; k++) {
-                    rv1[k] = a[i * m + k] / h;
-                }
-                for (j = l; j <= m; j++) {
-                    for (s = 0, k = l; k <= n; k++) {
-                        s += a[j * m + k] * a[i * m + k];
-                    }
-                    for (k = l; k <= n; k++) {
-                        a[j * m + k] += s * rv1[k];
-                    }
-                }
-                for (k = l; k <= n; k++) {
-                    a[i * m + k] *= scale;
-                }
-            }
+        for (k = i; k <= m; k++) {
+          a[k * m + i] *= scale;
         }
-        anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i])));
+      }
     }
+    w[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i <= m && i != n) {
+      for (k = l; k <= n; k++) {
+        scale += ABS(a[k * m + i]);
+      }
+      if (scale) {
+        for (k = l; k <= n; k++) {
+          a[k * m + i] /= scale;
+          s += a[i * m + k] * a[i * m + k];
+        }
+        f = a[i * m + l];
+        g = -SIGN(sqrt_q32(s, 4), f);
+        h = f * g - s;
+        a[i * m + l] = f - g;
+        for (k = l; k <= n; k++) {
+          rv1[k] = a[i * m + k] / h;
+        }
+        for (j = l; j <= m; j++) {
+          for (s = 0, k = l; k <= n; k++) {
+            s += a[j * m + k] * a[i * m + k];
+          }
+          for (k = l; k <= n; k++) {
+            a[j * m + k] += s * rv1[k];
+          }
+        }
+        for (k = l; k <= n; k++) {
+          a[i * m + k] *= scale;
+        }
+      }
+    }
+    anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i])));
+  }
 
-    for (i = n; i >= 1; i--) {
-        if (i < n) {
-            if (g) {
-                for (j = l; j <= n; j++) {
-                    v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g;
-                }
-                for (j = l; j <= n; j++) {
-                    for (s = 0, k = l; k <= n; k++) {
-                        s += a[i * m + k] * v[k * m + j];
-                    }
-                    for (k = l; k <= n; k++) {
-                        v[k * m + j] += s * v[k * m + i];
-                    }
-                }
-            }
-            for (j = l; j <= n; j++) {
-                v[i * m + j] = v[j * m + i] = 0;
-            }
+  for (i = n; i >= 1; i--) {
+    if (i < n) {
+      if (g) {
+        for (j = l; j <= n; j++) {
+          v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g;
+        }
+        for (j = l; j <= n; j++) {
+          for (s = 0, k = l; k <= n; k++) {
+            s += a[i * m + k] * v[k * m + j];
+          }
+          for (k = l; k <= n; k++) {
+            v[k * m + j] += s * v[k * m + i];
+          }
         }
-        v[i * m + i] = 1;
-        g = rv1[i];
-        l = i;
+      }
+      for (j = l; j <= n; j++) {
+        v[i * m + j] = v[j * m + i] = 0;
+      }
     }
+    v[i * m + i] = 1;
+    g = rv1[i];
+    l = i;
+  }
 
-//    for (i = IMIN(m, n); i >= 1; i--) {
-//        l = i + 1;
-//        g = w[i];
-//        for (j = l; j <= n; j++) {
-//            a[i][j] = 0;
-//        }
-//        if (g) {
-//            g = 1.0 / g;
-//            for (j = l; j <= n; j++) {
-//                for (s = 0.0, k = l; k <= m; k++) {
-//                    s += a[k][i] * a[k][j];
-//                }
-//                f = (s / a[i][i]) * g;
-//                for (k = i; k <= m; k++) {
-//                    a[k][j] += f * a[k][i];
-//                }
-//            }
-//            for (j = i; j <= m; j++) {
-//                a[j][i] *= g;
-//            }
-//        } else { for (j = i; j <= m; j++) {
-//                     a[j][i] = 0.0;
-//                 }
-//        }
-//        ++a[i][i];
-//    }
-//    for (k = n; k >= 1; k--) {
-//        for (its = 1; its <= 30; its++) {
-//            flag = 1;
-//            for (l = k; l >= 1; l--) {
-//                nm = l - 1;
-//                if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) {
-//                    flag = 0;
-//                    break;
-//                }
-//                if ((int32_t) (ABS(w[nm]) + anorm) == anorm) {
-//                    break;
-//                }
-//            }
-//            if (flag) {
-//                c = 0.0;
-//                s = 1.0;
-//                for (i = l; i <= k; i++) {
-//                    f = s * rv1[i];
-//                    rv1[i] = c * rv1[i];
-//                    if ((int32_t) (ABS(f) + anorm) == anorm) {
-//                        break;
-//                    }
-//                    g = w[i];
-//                    h = pythag(f, g);
-//                    w[i] = h;
-//                    h = 1.0 / h;
-//                    c = g * h;
-//                    s = -f * h;
-//                    for (j = 1; j <= m; j++) {
-//                        y = a[j][nm];
-//                        z = a[j][i];
-//                        a[j][nm] = y * c + z * s;
-//                        a[j][i] = z * c - y * s;
-//                    }
-//                }
-//            }
-//            z = w[k];
-//            if (l == k) {
-//                if (z < 0.0) {
-//                    w[k] = -z;
-//                    for (j = 1; j <= n; j++) {
-//                        v[j][k] = -v[j][k];
-//                    }
-//                }
-//                break;
-//            }
-//            if (its == 30) {
-//                exit(1);
-//            }
-//            x = w[l];
-//            nm = k - 1;
-//            y = w[nm];
-//            g = rv1[nm];
-//            h = rv1[k];
-//            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
-//            g = pythag(f, 1.0);
-//            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
-//            c = s = 1.0;
-//            for (j = l; j <= nm; j++) {
-//                i = j + 1;
-//                g = rv1[i];
-//                y = w[i];
-//                h = s * g;
-//                g = c * g;
-//                z = pythag(f, h);
-//                rv1[j] = z;
-//                c = f / z;
-//                s = h / z;
-//                f = x * c + g * s;
-//                g = g * c - x * s;
-//                h = y * s;
-//                y *= c;
-//                for (jj = 1; jj <= n; jj++) {
-//                    x = v[jj][j];
-//                    z = v[jj][i];
-//                    v[jj][j] = x * c + z * s;
-//                    v[jj][i] = z * c - x * s;
-//                }
-//                z = pythag(f, h);
-//                w[j] = z;
-//                if (z) {
-//                    z = 1.0 / z;
-//                    c = f * z;
-//                    s = h * z;
-//                }
-//                f = c * g + s * y;
-//                x = c * y - s * g;
-//                for (jj = 1; jj <= m; jj++) {
-//                    y = a[jj][j];
-//                    z = a[jj][i];
-//                    a[jj][j] = y * c + z * s;
-//                    a[jj][i] = z * c - y * s;
-//                }
-//            }
-//            rv1[l] = 0.0;
-//            rv1[k] = f;
-//            w[k] = x;
-//        }
-//    }
+  //    for (i = IMIN(m, n); i >= 1; i--) {
+  //        l = i + 1;
+  //        g = w[i];
+  //        for (j = l; j <= n; j++) {
+  //            a[i][j] = 0;
+  //        }
+  //        if (g) {
+  //            g = 1.0 / g;
+  //            for (j = l; j <= n; j++) {
+  //                for (s = 0.0, k = l; k <= m; k++) {
+  //                    s += a[k][i] * a[k][j];
+  //                }
+  //                f = (s / a[i][i]) * g;
+  //                for (k = i; k <= m; k++) {
+  //                    a[k][j] += f * a[k][i];
+  //                }
+  //            }
+  //            for (j = i; j <= m; j++) {
+  //                a[j][i] *= g;
+  //            }
+  //        } else { for (j = i; j <= m; j++) {
+  //                     a[j][i] = 0.0;
+  //                 }
+  //        }
+  //        ++a[i][i];
+  //    }
+  //    for (k = n; k >= 1; k--) {
+  //        for (its = 1; its <= 30; its++) {
+  //            flag = 1;
+  //            for (l = k; l >= 1; l--) {
+  //                nm = l - 1;
+  //                if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) {
+  //                    flag = 0;
+  //                    break;
+  //                }
+  //                if ((int32_t) (ABS(w[nm]) + anorm) == anorm) {
+  //                    break;
+  //                }
+  //            }
+  //            if (flag) {
+  //                c = 0.0;
+  //                s = 1.0;
+  //                for (i = l; i <= k; i++) {
+  //                    f = s * rv1[i];
+  //                    rv1[i] = c * rv1[i];
+  //                    if ((int32_t) (ABS(f) + anorm) == anorm) {
+  //                        break;
+  //                    }
+  //                    g = w[i];
+  //                    h = pythag(f, g);
+  //                    w[i] = h;
+  //                    h = 1.0 / h;
+  //                    c = g * h;
+  //                    s = -f * h;
+  //                    for (j = 1; j <= m; j++) {
+  //                        y = a[j][nm];
+  //                        z = a[j][i];
+  //                        a[j][nm] = y * c + z * s;
+  //                        a[j][i] = z * c - y * s;
+  //                    }
+  //                }
+  //            }
+  //            z = w[k];
+  //            if (l == k) {
+  //                if (z < 0.0) {
+  //                    w[k] = -z;
+  //                    for (j = 1; j <= n; j++) {
+  //                        v[j][k] = -v[j][k];
+  //                    }
+  //                }
+  //                break;
+  //            }
+  //            if (its == 30) {
+  //                exit(1);
+  //            }
+  //            x = w[l];
+  //            nm = k - 1;
+  //            y = w[nm];
+  //            g = rv1[nm];
+  //            h = rv1[k];
+  //            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+  //            g = pythag(f, 1.0);
+  //            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
+  //            c = s = 1.0;
+  //            for (j = l; j <= nm; j++) {
+  //                i = j + 1;
+  //                g = rv1[i];
+  //                y = w[i];
+  //                h = s * g;
+  //                g = c * g;
+  //                z = pythag(f, h);
+  //                rv1[j] = z;
+  //                c = f / z;
+  //                s = h / z;
+  //                f = x * c + g * s;
+  //                g = g * c - x * s;
+  //                h = y * s;
+  //                y *= c;
+  //                for (jj = 1; jj <= n; jj++) {
+  //                    x = v[jj][j];
+  //                    z = v[jj][i];
+  //                    v[jj][j] = x * c + z * s;
+  //                    v[jj][i] = z * c - x * s;
+  //                }
+  //                z = pythag(f, h);
+  //                w[j] = z;
+  //                if (z) {
+  //                    z = 1.0 / z;
+  //                    c = f * z;
+  //                    s = h * z;
+  //                }
+  //                f = c * g + s * y;
+  //                x = c * y - s * g;
+  //                for (jj = 1; jj <= m; jj++) {
+  //                    y = a[jj][j];
+  //                    z = a[jj][i];
+  //                    a[jj][j] = y * c + z * s;
+  //                    a[jj][i] = z * c - y * s;
+  //                }
+  //            }
+  //            rv1[l] = 0.0;
+  //            rv1[k] = f;
+  //            w[k] = x;
+  //        }
+  //    }
 }

From ae56dc47eac231f0541d7b3e26c78c0de935aad5 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Mon, 31 Oct 2022 08:56:09 +0100
Subject: [PATCH 18/22] [software] Erase SVD folder

---
 software/apps/svd/SVD_Householder.txt | 781 --------------------------
 software/apps/svd/main.c              |  98 ----
 software/apps/svd/nrutil.h            |  83 ---
 software/apps/svd/svd.c               | 242 --------
 4 files changed, 1204 deletions(-)
 delete mode 100644 software/apps/svd/SVD_Householder.txt
 delete mode 100644 software/apps/svd/main.c
 delete mode 100644 software/apps/svd/nrutil.h
 delete mode 100644 software/apps/svd/svd.c

diff --git a/software/apps/svd/SVD_Householder.txt b/software/apps/svd/SVD_Householder.txt
deleted file mode 100644
index 1631212de..000000000
--- a/software/apps/svd/SVD_Householder.txt
+++ /dev/null
@@ -1,781 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// File: singular_value_decomposition.c                                       //
-// Contents:                                                                  //
-//    Singular_Value_Decomposition                                            //
-//    Singular_Value_Decomposition_Solve                                      //
-//    Singular_Value_Decomposition_Inverse                                    //
-////////////////////////////////////////////////////////////////////////////////
-
-#include <string.h>              // required for memcpy()
-#include <float.h>               // required for DBL_EPSILON
-#include <math.h>                // required for fabs(), sqrt();
-
-#define MAX_ITERATION_COUNT 30   // Maximum number of iterations
-
-//                        Internally Defined Routines 
-static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
-    int ncols, double* U, double* V, double* diagonal, double* superdiagonal );
-static int  Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
-           double* U, double* V, double* diagonal, double* superdiagonal );
-static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
-                                double* singular_value, double* U, double* V);
-
-////////////////////////////////////////////////////////////////////////////////
-//  int Singular_Value_Decomposition(double* A, int nrows, int ncols,         //
-//        double* U, double* singular_values, double* V, double* dummy_array) //
-//                                                                            //
-//  Description:                                                              //
-//     This routine decomposes an m x n matrix A, with m >= n, into a product //
-//     of the three matrices U, D, and V', i.e. A = UDV', where U is an m x n //
-//     matrix whose columns are orthogonal, D is a n x n diagonal matrix, and //
-//     V is an n x n orthogonal matrix.  V' denotes the transpose of V.  If   //
-//     m < n, then the procedure may be used for the matrix A'.  The singular //
-//     values of A are the diagonal elements of the diagonal matrix D and     //
-//     correspond to the positive square roots of the eigenvalues of the      //
-//     matrix A'A.                                                            //
-//                                                                            //
-//     This procedure programmed here is based on the method of Golub and     //
-//     Reinsch as given on pages 134 - 151 of the "Handbook for Automatic     //
-//     Computation vol II - Linear Algebra" edited by Wilkinson and Reinsch   //
-//     and published by Springer-Verlag, 1971.                                //
-//                                                                            //
-//     The Golub and Reinsch's method for decomposing the matrix A into the   //
-//     product U, D, and V' is performed in three stages:                     //
-//       Stage 1:  Decompose A into the product of three matrices U1, B, V1'  //
-//         A = U1 B V1' where B is a bidiagonal matrix, and U1, and V1 are a  //
-//         product of Householder transformations.                            //
-//       Stage 2:  Use Given' transformations to reduce the bidiagonal matrix //
-//         B into the product of the three matrices U2, D, V2'.  The singular //
-//         value decomposition is then UDV'where U = U2 U1 and V' = V1' V2'.  //
-//       Stage 3:  Sort the matrix D in decreasing order of the singular      //
-//         values and interchange the columns of both U and V to reflect any  //
-//         change in the order of the singular values.                        //
-//                                                                            //
-//     After performing the singular value decomposition for A, call          //
-//     Singular_Value_Decomposition to solve the equation Ax = B or call      //
-//     Singular_Value_Decomposition_Inverse to calculate the pseudo-inverse   //
-//     of A.                                                                  //
-//                                                                            //
-//  Arguments:                                                                //
-//     double* A                                                              //
-//        On input, the pointer to the first element of the matrix            //
-//        A[nrows][ncols].  The matrix A is unchanged.                        //
-//     int nrows                                                              //
-//        The number of rows of the matrix A.                                 //
-//     int ncols                                                              //
-//        The number of columns of the matrix A.                              //
-//     double* U                                                              //
-//        On input, a pointer to a matrix with the same number of rows and    //
-//        columns as the matrix A.  On output, the matrix with mutually       //
-//        orthogonal columns which is the left-most factor in the singular    //
-//        value decomposition of A.                                           //
-//     double* singular_values                                                //
-//        On input, a pointer to an array dimensioned to same as the number   //
-//        of columns of the matrix A, ncols.  On output, the singular values  //
-//        of the matrix A sorted in decreasing order.  This array corresponds //
-//        to the diagonal matrix in the singular value decomposition of A.    //
-//     double* V                                                              //
-//        On input, a pointer to a square matrix with the same number of rows //
-//        and columns as the columns of the matrix A, i.e. V[ncols][ncols].   //
-//        On output, the orthogonal matrix whose transpose is the right-most  //
-//        factor in the singular value decomposition of A.                    //
-//     double* dummy_array                                                    //
-//        On input, a pointer to an array dimensioned to same as the number   //
-//        of columns of the matrix A, ncols.  This array is used to store     //
-//        the super-diagonal elements resulting from the Householder reduction//
-//        of the matrix A to bidiagonal form.  And as an input to the Given's //
-//        procedure to reduce the bidiagonal form to diagonal form.           //
-//                                                                            //
-//  Return Values:                                                            //
-//     0  Success                                                             //
-//    -1  Failure - During the Given's reduction of the bidiagonal form to    //
-//                  diagonal form the procedure failed to terminate within    //
-//                  MAX_ITERATION_COUNT iterations.                           //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     double A[M][N];                                                        //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double singular_values[N];                                             //
-//     double* dummy_array;                                                   //
-//                                                                            //
-//     (your code to initialize the matrix A)                                 //
-//     dummy_array = (double*) malloc(N * sizeof(double));                    //
-//     if (dummy_array == NULL) {printf(" No memory available\n"); exit(0); } //
-//                                                                            //
-//     err = Singular_Value_Decomposition((double*) A, M, N, (double*) U,     //
-//                              singular_values, (double*) V, dummy_array);   //
-//                                                                            //
-//     free(dummy_array);                                                     //
-//     if (err < 0) printf(" Failed to converge\n");                          //
-//     else { printf(" The singular value decomposition of A is \n");         //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-int Singular_Value_Decomposition(double* A, int nrows, int ncols, double* U, 
-                      double* singular_values, double* V, double* dummy_array)
-{
-   Householders_Reduction_to_Bidiagonal_Form( A, nrows, ncols, U, V,
-                                                singular_values, dummy_array);
-
-   if (Givens_Reduction_to_Diagonal_Form( nrows, ncols, U, V,
-                                singular_values, dummy_array ) < 0) return -1;
-
-   Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, U, V);
-  
-   return 0;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,//
-//  int ncols, double* U, double* V, double* diagonal, double* superdiagonal )//
-//                                                                            //
-//  Description:                                                              //
-//     This routine decomposes an m x n matrix A, with m >= n, into a product //
-//     of the three matrices U, B, and V', i.e. A = UBV', where U is an m x n //
-//     matrix whose columns are orthogonal, B is a n x n bidiagonal matrix,   //
-//     and V is an n x n orthogonal matrix.  V' denotes the transpose of V.   //
-//     If m < n, then the procedure may be used for the matrix A'.  The       //
-//                                                                            //
-//     The matrix U is the product of Householder transformations which       //
-//     annihilate the subdiagonal components of A while the matrix V is       //
-//     the product of Householder transformations which annihilate the        //
-//     components of A to the right of the superdiagonal.                     //
-//                                                                            //
-//     The Householder transformation which leaves invariant the first k-1    //
-//     elements of the k-th column and annihilates the all the elements below //
-//     the diagonal element is P = I - (2/u'u)uu', u is an nrows-dimensional  //
-//     vector the first k-1 components of which are zero and the last         //
-//     components agree with the current transformed matrix below the diagonal//
-//     diagonal, the remaining k-th element is the diagonal element - s, where//
-//     s = (+/-)sqrt(sum of squares of the elements below the diagonal), the  //
-//     sign is chosen opposite that of the diagonal element.                  //
-//                                                                            //
-//  Arguments:                                                                //
-//     double* A                                                              //
-//        On input, the pointer to the first element of the matrix            //
-//        A[nrows][ncols].  The matrix A is unchanged.                        //
-//     int nrows                                                              //
-//        The number of rows of the matrix A.                                 //
-//     int ncols                                                              //
-//        The number of columns of the matrix A.                              //
-//     double* U                                                              //
-//        On input, a pointer to a matrix with the same number of rows and    //
-//        columns as the matrix A.  On output, the matrix with mutually       //
-//        orthogonal columns which is the left-most factor in the bidiagonal  //
-//        decomposition of A.                                                 //
-//     double* V                                                              //
-//        On input, a pointer to a square matrix with the same number of rows //
-//        and columns as the columns of the matrix A, i.e. V[ncols][ncols].   //
-//        On output, the orthogonal matrix whose transpose is the right-most  //
-//        factor in the bidiagonal decomposition of A.                        //
-//     double* diagonal                                                       //
-//        On input, a pointer to an array dimensioned to same as the number   //
-//        of columns of the matrix A, ncols.  On output, the diagonal of the  //
-//        bidiagonal matrix.                                                  //
-//     double* superdiagonal                                                  //
-//        On input, a pointer to an array dimensioned to same as the number   //
-//        of columns of the matrix A, ncols.  On output, the superdiagonal    //
-//        of the bidiagonal matrix.                                           //
-//                                                                            //
-//  Return Values:                                                            //
-//     The function is of type void and therefore does not return a value.    //
-//     The matrices U, V, and the diagonal and superdiagonal are calculated   //
-//     using the addresses passed in the argument list.                       //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     double A[M][N];                                                        //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double diagonal[N];                                                    //
-//     double superdiagonal[N];                                               //
-//                                                                            //
-//     (your code to initialize the matrix A - Note this routine is not       //
-//     (accessible from outside i.e. it is declared static)                   //
-//                                                                            //
-//     Householders_Reduction_to_Bidiagonal_Form((double*) A, nrows, ncols,   //
-//                   (double*) U, (double*) V, diagonal, superdiagonal )      //
-//                                                                            //
-//     free(dummy_array);                                                     //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
-    int ncols, double* U, double* V, double* diagonal, double* superdiagonal )
-{
-   int i,j,k,ip1;
-   double s, s2, si, scale;
-   double dum;
-   double *pu, *pui, *pv, *pvi;
-   double half_norm_squared;
-
-// Copy A to U
-
-   memcpy(U,A, sizeof(double) * nrows * ncols);
-
-//
- 
-   diagonal[0] = 0.0;
-   s = 0.0;
-   scale = 0.0;
-   for ( i = 0, pui = U, ip1 = 1; i < ncols; pui += ncols, i++, ip1++ ) {
-      superdiagonal[i] = scale * s;
-//       
-//                  Perform Householder transform on columns.
-//
-//       Calculate the normed squared of the i-th column vector starting at 
-//       row i.
-//
-      for (j = i, pu = pui, scale = 0.0; j < nrows; j++, pu += ncols)
-         scale += fabs( *(pu + i) );
-       
-      if (scale > 0.0) {
-         for (j = i, pu = pui, s2 = 0.0; j < nrows; j++, pu += ncols) {
-            *(pu + i) /= scale;
-            s2 += *(pu + i) * *(pu + i);
-         }
-//
-//    
-//       Chose sign of s which maximizes the norm
-//  
-         s = ( *(pui + i) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
-//
-//       Calculate -2/u'u
-//
-         half_norm_squared = *(pui + i) * s - s2;
-//
-//       Transform remaining columns by the Householder transform.
-//
-         *(pui + i) -= s;
-         
-         for (j = ip1; j < ncols; j++) {
-            for (k = i, si = 0.0, pu = pui; k < nrows; k++, pu += ncols)
-               si += *(pu + i) * *(pu + j);
-            si /= half_norm_squared;
-            for (k = i, pu = pui; k < nrows; k++, pu += ncols) {
-               *(pu + j) += si * *(pu + i);
-            }
-         }
-      }
-      for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) *= scale;
-      diagonal[i] = s * scale;
-//       
-//                  Perform Householder transform on rows.
-//
-//       Calculate the normed squared of the i-th row vector starting at 
-//       column i.
-//
-      s = 0.0;
-      scale = 0.0;
-      if (i >= nrows || i == (ncols - 1) ) continue;
-      for (j = ip1; j < ncols; j++) scale += fabs ( *(pui + j) );
-      if ( scale > 0.0 ) {
-         for (j = ip1, s2 = 0.0; j < ncols; j++) {
-            *(pui + j) /= scale;
-            s2 += *(pui + j) * *(pui + j);
-         }
-         s = ( *(pui + ip1) < 0.0 ) ? sqrt(s2) : -sqrt(s2);
-//
-//       Calculate -2/u'u
-//
-         half_norm_squared = *(pui + ip1) * s - s2;
-//
-//       Transform the rows by the Householder transform.
-//
-         *(pui + ip1) -= s;
-         for (k = ip1; k < ncols; k++)
-            superdiagonal[k] = *(pui + k) / half_norm_squared;
-         if ( i < (nrows - 1) ) {
-            for (j = ip1, pu = pui + ncols; j < nrows; j++, pu += ncols) {
-               for (k = ip1, si = 0.0; k < ncols; k++) 
-                  si += *(pui + k) * *(pu + k);
-               for (k = ip1; k < ncols; k++) { 
-                  *(pu + k) += si * superdiagonal[k];
-               }
-            }
-         }
-         for (k = ip1; k < ncols; k++) *(pui + k) *= scale;
-      }
-   }
-
-// Update V
-   pui = U + ncols * (ncols - 2);
-   pvi = V + ncols * (ncols - 1);
-   *(pvi + ncols - 1) = 1.0;
-   s = superdiagonal[ncols - 1];
-   pvi -= ncols;
-   for (i = ncols - 2, ip1 = ncols - 1; i >= 0; i--, pui -= ncols,
-                                                      pvi -= ncols, ip1-- ) {
-      if ( s != 0.0 ) {
-         pv = pvi + ncols;
-         for (j = ip1; j < ncols; j++, pv += ncols)
-            *(pv + i) = ( *(pui + j) / *(pui + ip1) ) / s;
-         for (j = ip1; j < ncols; j++) { 
-            si = 0.0;
-            for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
-               si += *(pui + k) * *(pv + j);
-            for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols)
-               *(pv + j) += si * *(pv + i);                  
-         }
-      }
-      pv = pvi + ncols;
-      for ( j = ip1; j < ncols; j++, pv += ncols ) {
-         *(pvi + j) = 0.0;
-         *(pv + i) = 0.0;
-      }
-      *(pvi + i) = 1.0;
-      s = superdiagonal[i];
-   }
-
-// Update U
-
-   pui = U + ncols * (ncols - 1);
-   for (i = ncols - 1, ip1 = ncols; i >= 0; ip1 = i, i--, pui -= ncols ) {
-      s = diagonal[i];
-      for ( j = ip1; j < ncols; j++) *(pui + j) = 0.0;
-      if ( s != 0.0 ) {
-         for (j = ip1; j < ncols; j++) { 
-            si = 0.0;
-            pu = pui + ncols;
-            for (k = ip1; k < nrows; k++, pu += ncols)
-               si += *(pu + i) * *(pu + j);
-            si = (si / *(pui + i) ) / s;
-            for (k = i, pu = pui; k < nrows; k++, pu += ncols)
-               *(pu + j) += si * *(pu + i);                  
-         }
-         for (j = i, pu = pui; j < nrows; j++, pu += ncols){
-            *(pu + i) /= s;
-         }
-      }
-      else 
-         for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) = 0.0;
-      *(pui + i) += 1.0;
-   }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,        //
-//         double* U, double* V, double* diagonal, double* superdiagonal )    //
-//                                                                            //
-//  Description:                                                              //
-//     This routine decomposes a bidiagonal matrix given by the arrays        //
-//     diagonal and superdiagonal into a product of three matrices U1, D and  //
-//     V1', the matrix U1 premultiplies U and is returned in U, the matrix    //
-//     V1 premultiplies V and is returned in V.  The matrix D is a diagonal   //
-//     matrix and replaces the array diagonal.                                //
-//                                                                            //
-//     The method used to annihilate the offdiagonal elements is a variant    //
-//     of the QR transformation.  The method consists of applying Givens      //
-//     rotations to the right and the left of the current matrix until        //
-//     the new off-diagonal elements are chased out of the matrix.            //
-//                                                                            //
-//     The process is an iterative process which due to roundoff errors may   //
-//     not converge within a predefined number of iterations.  (This should   //
-//     be unusual.)                                                           //
-//                                                                            //
-//  Arguments:                                                                //
-//     int nrows                                                              //
-//        The number of rows of the matrix U.                                 //
-//     int ncols                                                              //
-//        The number of columns of the matrix U.                              //
-//     double* U                                                              //
-//        On input, a pointer to a matrix already initialized to a matrix     //
-//        with mutually orthogonal columns.   On output, the matrix with      //
-//        mutually orthogonal columns.                                        //
-//     double* V                                                              //
-//        On input, a pointer to a square matrix with the same number of rows //
-//        and columns as the columns of the matrix U, i.e. V[ncols][ncols].   //
-//        The matrix V is assumed to be initialized to an orthogonal matrix.  //
-//        On output, V is an orthogonal matrix.                               //
-//     double* diagonal                                                       //
-//        On input, a pointer to an array of dimension ncols which initially  //
-//        contains the diagonal of the bidiagonal matrix.  On output, the     //
-//        it contains the diagonal of the diagonal matrix.                    //
-//     double* superdiagonal                                                  //
-//        On input, a pointer to an array of dimension ncols which initially  //
-//        the first component is zero and the successive components form the  //
-//        superdiagonal of the bidiagonal matrix.                             //
-//                                                                            //
-//  Return Values:                                                            //
-//     0  Success                                                             //
-//    -1  Failure - The procedure failed to terminate within                  //
-//                  MAX_ITERATION_COUNT iterations.                           //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double diagonal[N];                                                    //
-//     double superdiagonal[N];                                               //
-//     int err;                                                               //
-//                                                                            //
-//     (your code to initialize the matrices U, V, diagonal, and )            //
-//     ( superdiagonal.  - Note this routine is not accessible from outside)  //
-//     ( i.e. it is declared static.)                                         //
-//                                                                            //
-//     err = Givens_Reduction_to_Diagonal_Form( M,N,(double*)U,(double*)V,    //
-//                                                 diagonal, superdiagonal ); //
-//     if ( err < 0 ) printf("Failed to converge\n");                         //
-//     else { ... }                                                           //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
-           double* U, double* V, double* diagonal, double* superdiagonal )
-{
-
-   double epsilon;
-   double c, s;
-   double f,g,h;
-   double x,y,z;
-   double *pu, *pv;
-   int i,j,k,m;
-   int rotation_test;
-   int iteration_count;
-  
-   for (i = 0, x = 0.0; i < ncols; i++) {
-      y = fabs(diagonal[i]) + fabs(superdiagonal[i]);
-      if ( x < y ) x = y;
-   }
-   epsilon = x * DBL_EPSILON;
-   for (k = ncols - 1; k >= 0; k--) {
-      iteration_count = 0;
-      while(1) {
-         rotation_test = 1;
-         for (m = k; m >= 0; m--) { 
-            if (fabs(superdiagonal[m]) <= epsilon) {rotation_test = 0; break;}
-            if (fabs(diagonal[m-1]) <= epsilon) break;
-         }
-         if (rotation_test) {
-            c = 0.0;
-            s = 1.0;
-            for (i = m; i <= k; i++) {  
-               f = s * superdiagonal[i];
-               superdiagonal[i] *= c;
-               if (fabs(f) <= epsilon) break;
-               g = diagonal[i];
-               h = sqrt(f*f + g*g);
-               diagonal[i] = h;
-               c = g / h;
-               s = -f / h; 
-               for (j = 0, pu = U; j < nrows; j++, pu += ncols) { 
-                  y = *(pu + m - 1);
-                  z = *(pu + i);
-                  *(pu + m - 1 ) = y * c + z * s;
-                  *(pu + i) = -y * s + z * c;
-               }
-            }
-         }
-         z = diagonal[k];
-         if (m == k ) {
-            if ( z < 0.0 ) {
-               diagonal[k] = -z;
-               for ( j = 0, pv = V; j < ncols; j++, pv += ncols) 
-                  *(pv + k) = - *(pv + k);
-            }
-            break;
-         }
-         else {
-            if ( iteration_count >= MAX_ITERATION_COUNT ) return -1;
-            iteration_count++;
-            x = diagonal[m];
-            y = diagonal[k-1];
-            g = superdiagonal[k-1];
-            h = superdiagonal[k];
-            f = ( (y - z) * ( y + z ) + (g - h) * (g + h) )/(2.0 * h * y);
-            g = sqrt( f * f + 1.0 );
-            if ( f < 0.0 ) g = -g;
-            f = ( (x - z) * (x + z) + h * (y / (f + g) - h) ) / x;
-// Next QR Transformtion
-            c = 1.0;
-            s = 1.0;
-            for (i = m + 1; i <= k; i++) {
-               g = superdiagonal[i];
-               y = diagonal[i];
-               h = s * g;
-               g *= c;
-               z = sqrt( f * f + h * h );
-               superdiagonal[i-1] = z;
-               c = f / z;
-               s = h / z;
-               f =  x * c + g * s;
-               g = -x * s + g * c;
-               h = y * s;
-               y *= c;
-               for (j = 0, pv = V; j < ncols; j++, pv += ncols) {
-                  x = *(pv + i - 1);
-                  z = *(pv + i);
-                  *(pv + i - 1) = x * c + z * s;
-                  *(pv + i) = -x * s + z * c;
-               }
-               z = sqrt( f * f + h * h );
-               diagonal[i - 1] = z;
-               if (z != 0.0) {
-                  c = f / z;
-                  s = h / z;
-               } 
-               f = c * g + s * y;
-               x = -s * g + c * y;
-               for (j = 0, pu = U; j < nrows; j++, pu += ncols) {
-                  y = *(pu + i - 1);
-                  z = *(pu + i);
-                  *(pu + i - 1) = c * y + s * z;
-                  *(pu + i) = -s * y + c * z;
-               }
-            }
-            superdiagonal[m] = 0.0;
-            superdiagonal[k] = f;
-            diagonal[k] = x;
-         }
-      } 
-   }
-   return 0;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,       //
-//                            double* singular_values, double* U, double* V)  //
-//                                                                            //
-//  Description:                                                              //
-//     This routine sorts the singular values from largest to smallest        //
-//     singular value and interchanges the columns of U and the columns of V  //
-//     whenever a swap is made.  I.e. if the i-th singular value is swapped   //
-//     with the j-th singular value, then the i-th and j-th columns of U are  //
-//     interchanged and the i-th and j-th columns of V are interchanged.      //
-//                                                                            //
-//  Arguments:                                                                //
-//     int nrows                                                              //
-//        The number of rows of the matrix U.                                 //
-//     int ncols                                                              //
-//        The number of columns of the matrix U.                              //
-//     double* singular_values                                                //
-//        On input, a pointer to the array of singular values.  On output, the//
-//        sorted array of singular values.                                    //
-//     double* U                                                              //
-//        On input, a pointer to a matrix already initialized to a matrix     //
-//        with mutually orthogonal columns.  On output, the matrix with       //
-//        mutually orthogonal possibly permuted columns.                      //
-//     double* V                                                              //
-//        On input, a pointer to a square matrix with the same number of rows //
-//        and columns as the columns of the matrix U, i.e. V[ncols][ncols].   //
-//        The matrix V is assumed to be initialized to an orthogonal matrix.  //
-//        On output, V is an orthogonal matrix with possibly permuted columns.//
-//                                                                            //
-//  Return Values:                                                            //
-//        The function is of type void.                                       //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double diagonal[N];                                                    //
-//                                                                            //
-//     (your code to initialize the matrices U, V, and diagonal. )            //
-//     ( - Note this routine is not accessible from outside)                  //
-//     ( i.e. it is declared static.)                                         //
-//                                                                            //
-//     Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values,      //
-//                                                 (double*) U, (double*) V); //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
-                                double* singular_values, double* U, double* V)
-{
-   int i,j,max_index;
-   double temp;
-   double *p1, *p2;
-
-   for (i = 0; i < ncols - 1; i++) {
-      max_index = i;
-      for (j = i + 1; j < ncols; j++)
-         if (singular_values[j] > singular_values[max_index] ) 
-            max_index = j;
-      if (max_index == i) continue;
-      temp = singular_values[i];
-      singular_values[i] = singular_values[max_index];
-      singular_values[max_index] = temp;
-      p1 = U + max_index;
-      p2 = U + i;
-      for (j = 0; j < nrows; j++, p1 += ncols, p2 += ncols) {
-         temp = *p1;
-         *p1 = *p2;
-         *p2 = temp;
-      } 
-      p1 = V + max_index;
-      p2 = V + i;
-      for (j = 0; j < ncols; j++, p1 += ncols, p2 += ncols) {
-         temp = *p1;
-         *p1 = *p2;
-         *p2 = temp;
-      }
-   } 
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-//  void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,  //
-//              double tolerance, int nrows, int ncols, double *B, double* x) //
-//                                                                            //
-//  Description:                                                              //
-//     This routine solves the system of linear equations Ax=B where A =UDV', //
-//     is the singular value decomposition of A.  Given UDV'x=B, then         //
-//     x = V(1/D)U'B, where 1/D is the pseudo-inverse of D, i.e. if D[i] > 0  //
-//     then (1/D)[i] = 1/D[i] and if D[i] = 0, then (1/D)[i] = 0.  Since      //
-//     the singular values are subject to round-off error.  A tolerance is    //
-//     given so that if D[i] < tolerance, D[i] is treated as if it is 0.      //
-//     The default tolerance is D[0] * DBL_EPSILON * ncols, if the user       //
-//     specified tolerance is less than the default tolerance, the default    //
-//     tolerance is used.                                                     //
-//                                                                            //
-//  Arguments:                                                                //
-//     double* U                                                              //
-//        A matrix with mutually orthonormal columns.                         //
-//     double* D                                                              //
-//        A diagonal matrix with decreasing non-negative diagonal elements.   //
-//        i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i.                  //
-//     double* V                                                              //
-//        An orthogonal matrix.                                               //
-//     double tolerance                                                       //
-//        An lower bound for non-zero singular values (provided tolerance >   //
-//        ncols * DBL_EPSILON * D[0]).                                        //
-//     int nrows                                                              //
-//        The number of rows of the matrix U and B.                           //
-//     int ncols                                                              //
-//        The number of columns of the matrix U.  Also the number of rows and //
-//        columns of the matrices D and V.                                    //
-//     double* B                                                              //
-//        A pointer to a vector dimensioned as nrows which is the  right-hand //
-//        side of the equation Ax = B where A = UDV'.                         //
-//     double* x                                                              //
-//        A pointer to a vector dimensioned as ncols, which is the least      //
-//        squares solution of the equation Ax = B where A = UDV'.             //
-//                                                                            //
-//  Return Values:                                                            //
-//        The function is of type void.                                       //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     #define NB                                                             //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double D[N];                                                           //
-//     double B[M];                                                           //
-//     double x[N];                                                           //
-//     double tolerance;                                                      //
-//                                                                            //
-//     (your code to initialize the matrices U,D,V,B)                         //
-//                                                                            //
-//     Singular_Value_Decomposition_Solve((double*) U, D, (double*) V,        //
-//                                              tolerance, M, N, B, x, bcols) //
-//                                                                            //
-//     printf(" The solution of Ax=B is \n");                                 //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-
-void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,  
-                double tolerance, int nrows, int ncols, double *B, double* x) 
-{
-   int i,j,k;
-   double *pu, *pv;
-   double dum;
-
-   dum = DBL_EPSILON * D[0] * (double) ncols;
-   if (tolerance < dum) tolerance = dum;
-
-   for ( i = 0, pv = V; i < ncols; i++, pv += ncols) {
-      x[i] = 0.0;
-      for (j = 0; j < ncols; j++)
-         if (D[j] > tolerance ) {
-            for (k = 0, dum = 0.0, pu = U; k < nrows; k++, pu += ncols)
-               dum += *(pu + j) * B[k];
-            x[i] += dum * *(pv + j) / D[j];
-         }
-   } 
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-//  void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,//
-//                     double tolerance, int nrows, int ncols, double *Astar) //
-//                                                                            //
-//  Description:                                                              //
-//     This routine calculates the pseudo-inverse of the matrix A = UDV'.     //
-//     where U, D, V constitute the singular value decomposition of A.        //
-//     Let Astar be the pseudo-inverse then Astar = V(1/D)U', where 1/D is    //
-//     the pseudo-inverse of D, i.e. if D[i] > 0 then (1/D)[i] = 1/D[i] and   //
-//     if D[i] = 0, then (1/D)[i] = 0.  Because the singular values are       //
-//     subject to round-off error.  A tolerance is given so that if           //
-//     D[i] < tolerance, D[i] is treated as if it were 0.                     //
-//     The default tolerance is D[0] * DBL_EPSILON * ncols, assuming that the //
-//     diagonal matrix of singular values is sorted from largest to smallest, //
-//     if the user specified tolerance is less than the default tolerance,    //
-//     then the default tolerance is used.                                    //
-//                                                                            //
-//  Arguments:                                                                //
-//     double* U                                                              //
-//        A matrix with mutually orthonormal columns.                         //
-//     double* D                                                              //
-//        A diagonal matrix with decreasing non-negative diagonal elements.   //
-//        i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i.                  //
-//     double* V                                                              //
-//        An orthogonal matrix.                                               //
-//     double tolerance                                                       //
-//        An lower bound for non-zero singular values (provided tolerance >   //
-//        ncols * DBL_EPSILON * D[0]).                                        //
-//     int nrows                                                              //
-//        The number of rows of the matrix U and B.                           //
-//     int ncols                                                              //
-//        The number of columns of the matrix U.  Also the number of rows and //
-//        columns of the matrices D and V.                                    //
-//     double* Astar                                                          //
-//        On input, a pointer to the first element of an ncols x nrows matrix.//
-//        On output, the pseudo-inverse of UDV'.                              //
-//                                                                            //
-//  Return Values:                                                            //
-//        The function is of type void.                                       //
-//                                                                            //
-//  Example:                                                                  //
-//     #define M                                                              //
-//     #define N                                                              //
-//     double U[M][N];                                                        //
-//     double V[N][N];                                                        //
-//     double D[N];                                                           //
-//     double Astar[N][M];                                                    //
-//     double tolerance;                                                      //
-//                                                                            //
-//     (your code to initialize the matrices U,D,V)                           //
-//                                                                            //
-//     Singular_Value_Decomposition_Inverse((double*) U, D, (double*) V,      //
-//                                        tolerance, M, N, (double*) Astar);  //
-//                                                                            //
-//     printf(" The pseudo-inverse of A = UDV' is \n");                       //
-//           ...                                                              //
-////////////////////////////////////////////////////////////////////////////////
-//                                                                            //
-
-void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,  
-                        double tolerance, int nrows, int ncols, double *Astar) 
-{
-   int i,j,k;
-   double *pu, *pv, *pa;
-   double dum;
-
-   dum = DBL_EPSILON * D[0] * (double) ncols;
-   if (tolerance < dum) tolerance = dum;
-   for ( i = 0, pv = V, pa = Astar; i < ncols; i++, pv += ncols) 
-      for ( j = 0, pu = U; j < nrows; j++, pa++) 
-        for (k = 0, *pa = 0.0; k < ncols; k++, pu++)
-           if (D[k] > tolerance) *pa += *(pv + k) * *pu / D[k];
-}
diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c
deleted file mode 100644
index 8a217c0cd..000000000
--- a/software/apps/svd/main.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#include "encoding.h"
-#include "printf.h"
-#include "runtime.h"
-#include "synchronization.h"
-
-#include "nrutil.h"
-#include "svd.c"
-
-// Define Matrix dimensions:
-#define M 4
-#define N 32
-
-int32_t matrix_U[M * N] __attribute__((section(".l1_prio")));
-int32_t matrix_V[M * N] __attribute__((section(".l1_prio")));
-int32_t matrix_W[N] __attribute__((section(".l1_prio")));
-
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
-
-void init_vector(int32_t *vector, uint32_t num_el, int32_t a, int32_t b,
-                 uint32_t core_id) {
-  uint32_t const split = 8; // How many blocks to split the vector into
-  uint32_t const reminder = num_el % split;
-  uint32_t i, j;
-  for (i = core_id * split; i < core_id * split + split; i++) {
-    j = i % split;
-    vector[i] = a * (int32_t)j + b;
-  }
-  while (i < reminder) {
-    j = i % split;
-    vector[i] = a * (int32_t)j + b;
-  }
-}
-
-int volatile error __attribute__((section(".l1")));
-
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  int32_t const U_a = 1;
-  int32_t const U_b = 1;
-  int32_t const U_c = -32;
-  int32_t const V_a = 2;
-  int32_t const V_b = 1;
-  int32_t const V_c = 16;
-  // Init matrix
-  init_matrix(matrix_U, M, N, U_a, U_b, U_c, core_id, num_cores);
-  init_matrix(matrix_V, M, N, V_a, V_b, V_c, core_id, num_cores);
-  init_vector(matrix_W, N, V_a, V_b, core_id);
-  mempool_barrier(num_cores);
-
-  if (core_id == 0) {
-    // Test the Matri x SVD
-    svdcmp(matrix_U, M, N, matrix_W, matrix_V);
-  }
-
-  // Wait until all cores have finished
-  mempool_barrier(num_cores);
-
-  return error;
-}
diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h
deleted file mode 100644
index a137444ab..000000000
--- a/software/apps/svd/nrutil.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-//#include <stdio.h>
-//#include <stddef.h>
-//#include <stdlib.h>
-
-#ifndef NR_UTILS_H
-#define NR_UTILS_H
-
-#define NR_END 1
-#define FREE_ARG char *
-
-static int32_t sqrarg;
-#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg * sqrarg)
-static int32_t dsqrarg;
-#define DSQR(a) ((dsqrarg = (a)) == 0 ? 0 : dsqrarg * dsqrarg)
-static int32_t dmaxarg1, dmaxarg2;
-#define DMAX(a, b)                                                             \
-  (dmaxarg1 = (a), dmaxarg2 = (b),                                             \
-   (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2))
-static int32_t dminarg1, dminarg2;
-#define DMIN(a, b)                                                             \
-  (dminarg1 = (a), dminarg2 = (b),                                             \
-   (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2))
-static int32_t maxarg1, maxarg2;
-#define FMAX(a, b)                                                             \
-  (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
-static int32_t minarg1, minarg2;
-#define FMIN(a, b)                                                             \
-  (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2))
-static long lmaxarg1, lmaxarg2;
-#define LMAX(a, b)                                                             \
-  (lmaxarg1 = (a), lmaxarg2 = (b),                                             \
-   (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2))
-static long lminarg1, lminarg2;
-#define LMIN(a, b)                                                             \
-  (lminarg1 = (a), lminarg2 = (b),                                             \
-   (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2))
-static int32_t imaxarg1, imaxarg2;
-#define IMAX(a, b)                                                             \
-  (imaxarg1 = (a), imaxarg2 = (b),                                             \
-   (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2))
-static int32_t iminarg1, iminarg2;
-#define IMIN(a, b)                                                             \
-  (iminarg1 = (a), iminarg2 = (b),                                             \
-   (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
-#define ABS(a) (a < 0 ? -a : a)
-#define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a))
-
-int32_t sqrt_q32(const int32_t number, const uint32_t fracBits);
-
-#define sqrt2 0b1011010100000100
-int32_t sqrt_q32(const int32_t number, const uint32_t fracBits) {
-
-  int32_t root = 0;
-  int32_t start = 0;
-  int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF)
-  int32_t mid;
-
-  if (number > 0) {
-    while (start <= end) {
-      mid = (start + end) >> 1;
-      if (((mid * mid) >> fracBits) == number) {
-        root = mid;
-        break;
-      }
-      if (((mid * mid) >> fracBits) < number) {
-        start = mid + 1;
-        root = mid;
-      } else {
-        end = mid - 1;
-      }
-    }
-  }
-
-  return root;
-}
-
-#endif
diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c
deleted file mode 100644
index fa2fcbd0c..000000000
--- a/software/apps/svd/svd.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-int32_t pythag(int32_t a, int32_t b);
-void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v);
-
-int32_t pythag(int32_t a, int32_t b) {
-  int32_t absa = ABS(a);
-  int32_t absb = ABS(b);
-  if (absa > absb) {
-    return absa * sqrt_q32(1 + SQR(absb / absa), 4);
-  } else {
-    return (absb == 0 ? 0 : absb * sqrt_q32(1 + SQR(absa / absb), 4));
-  }
-}
-
-void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) {
-  int32_t flag, i, its, j, jj, k, l, nm;
-  int32_t anorm, c, f, g, h, s, scale, x, y, z;
-  int32_t rv1[n];
-
-  // printf("PROVA\n");
-
-  g = scale = anorm = 0.0;
-  for (i = 1; i <= n; i++) {
-    l = i + 1;
-    rv1[i] = scale * g;
-    g = s = scale = 0.0;
-    if (i <= m) {
-      for (k = i; k <= m; k++) {
-        scale += ABS(a[k * m + i]);
-      }
-      if (scale) {
-        for (k = i; k <= m; k++) {
-          a[k * m + i] /= scale;
-          s += a[k * m + i] * a[k * m + i];
-        }
-        f = a[i * m + i];
-        g = -SIGN(sqrt_q32(s, 4), f);
-        h = f * g - s;
-        a[i * m + i] = f - g;
-        for (j = l; j <= n; j++) {
-          for (s = 0.0, k = i; k <= m; k++) {
-            s += a[k * m + i] * a[k * m + i];
-          }
-          f = s / h;
-          for (k = i; k <= m; k++) {
-            a[k * m + i] += f * a[k * m + i];
-          }
-        }
-        for (k = i; k <= m; k++) {
-          a[k * m + i] *= scale;
-        }
-      }
-    }
-    w[i] = scale * g;
-    g = s = scale = 0.0;
-    if (i <= m && i != n) {
-      for (k = l; k <= n; k++) {
-        scale += ABS(a[k * m + i]);
-      }
-      if (scale) {
-        for (k = l; k <= n; k++) {
-          a[k * m + i] /= scale;
-          s += a[i * m + k] * a[i * m + k];
-        }
-        f = a[i * m + l];
-        g = -SIGN(sqrt_q32(s, 4), f);
-        h = f * g - s;
-        a[i * m + l] = f - g;
-        for (k = l; k <= n; k++) {
-          rv1[k] = a[i * m + k] / h;
-        }
-        for (j = l; j <= m; j++) {
-          for (s = 0, k = l; k <= n; k++) {
-            s += a[j * m + k] * a[i * m + k];
-          }
-          for (k = l; k <= n; k++) {
-            a[j * m + k] += s * rv1[k];
-          }
-        }
-        for (k = l; k <= n; k++) {
-          a[i * m + k] *= scale;
-        }
-      }
-    }
-    anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i])));
-  }
-
-  for (i = n; i >= 1; i--) {
-    if (i < n) {
-      if (g) {
-        for (j = l; j <= n; j++) {
-          v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g;
-        }
-        for (j = l; j <= n; j++) {
-          for (s = 0, k = l; k <= n; k++) {
-            s += a[i * m + k] * v[k * m + j];
-          }
-          for (k = l; k <= n; k++) {
-            v[k * m + j] += s * v[k * m + i];
-          }
-        }
-      }
-      for (j = l; j <= n; j++) {
-        v[i * m + j] = v[j * m + i] = 0;
-      }
-    }
-    v[i * m + i] = 1;
-    g = rv1[i];
-    l = i;
-  }
-
-  //    for (i = IMIN(m, n); i >= 1; i--) {
-  //        l = i + 1;
-  //        g = w[i];
-  //        for (j = l; j <= n; j++) {
-  //            a[i][j] = 0;
-  //        }
-  //        if (g) {
-  //            g = 1.0 / g;
-  //            for (j = l; j <= n; j++) {
-  //                for (s = 0.0, k = l; k <= m; k++) {
-  //                    s += a[k][i] * a[k][j];
-  //                }
-  //                f = (s / a[i][i]) * g;
-  //                for (k = i; k <= m; k++) {
-  //                    a[k][j] += f * a[k][i];
-  //                }
-  //            }
-  //            for (j = i; j <= m; j++) {
-  //                a[j][i] *= g;
-  //            }
-  //        } else { for (j = i; j <= m; j++) {
-  //                     a[j][i] = 0.0;
-  //                 }
-  //        }
-  //        ++a[i][i];
-  //    }
-  //    for (k = n; k >= 1; k--) {
-  //        for (its = 1; its <= 30; its++) {
-  //            flag = 1;
-  //            for (l = k; l >= 1; l--) {
-  //                nm = l - 1;
-  //                if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) {
-  //                    flag = 0;
-  //                    break;
-  //                }
-  //                if ((int32_t) (ABS(w[nm]) + anorm) == anorm) {
-  //                    break;
-  //                }
-  //            }
-  //            if (flag) {
-  //                c = 0.0;
-  //                s = 1.0;
-  //                for (i = l; i <= k; i++) {
-  //                    f = s * rv1[i];
-  //                    rv1[i] = c * rv1[i];
-  //                    if ((int32_t) (ABS(f) + anorm) == anorm) {
-  //                        break;
-  //                    }
-  //                    g = w[i];
-  //                    h = pythag(f, g);
-  //                    w[i] = h;
-  //                    h = 1.0 / h;
-  //                    c = g * h;
-  //                    s = -f * h;
-  //                    for (j = 1; j <= m; j++) {
-  //                        y = a[j][nm];
-  //                        z = a[j][i];
-  //                        a[j][nm] = y * c + z * s;
-  //                        a[j][i] = z * c - y * s;
-  //                    }
-  //                }
-  //            }
-  //            z = w[k];
-  //            if (l == k) {
-  //                if (z < 0.0) {
-  //                    w[k] = -z;
-  //                    for (j = 1; j <= n; j++) {
-  //                        v[j][k] = -v[j][k];
-  //                    }
-  //                }
-  //                break;
-  //            }
-  //            if (its == 30) {
-  //                exit(1);
-  //            }
-  //            x = w[l];
-  //            nm = k - 1;
-  //            y = w[nm];
-  //            g = rv1[nm];
-  //            h = rv1[k];
-  //            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
-  //            g = pythag(f, 1.0);
-  //            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
-  //            c = s = 1.0;
-  //            for (j = l; j <= nm; j++) {
-  //                i = j + 1;
-  //                g = rv1[i];
-  //                y = w[i];
-  //                h = s * g;
-  //                g = c * g;
-  //                z = pythag(f, h);
-  //                rv1[j] = z;
-  //                c = f / z;
-  //                s = h / z;
-  //                f = x * c + g * s;
-  //                g = g * c - x * s;
-  //                h = y * s;
-  //                y *= c;
-  //                for (jj = 1; jj <= n; jj++) {
-  //                    x = v[jj][j];
-  //                    z = v[jj][i];
-  //                    v[jj][j] = x * c + z * s;
-  //                    v[jj][i] = z * c - x * s;
-  //                }
-  //                z = pythag(f, h);
-  //                w[j] = z;
-  //                if (z) {
-  //                    z = 1.0 / z;
-  //                    c = f * z;
-  //                    s = h * z;
-  //                }
-  //                f = c * g + s * y;
-  //                x = c * y - s * g;
-  //                for (jj = 1; jj <= m; jj++) {
-  //                    y = a[jj][j];
-  //                    z = a[jj][i];
-  //                    a[jj][j] = y * c + z * s;
-  //                    a[jj][i] = z * c - y * s;
-  //                }
-  //            }
-  //            rv1[l] = 0.0;
-  //            rv1[k] = f;
-  //            w[k] = x;
-  //        }
-  //    }
-}

From 0fbf978d877baa4f718ab459032660507acd5b48 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Wed, 14 Dec 2022 15:10:13 +0100
Subject: [PATCH 19/22] [software] Fix reading of the number of cores

---
 software/apps/mat_inv/mempool_mat_inv_q32p.h  | 27 ++++++++++---------
 .../mat_inv/mempool_mat_inv_q32p_folded.h     |  3 ++-
 .../mat_inv/mempool_mat_inv_q32p_memsized.h   | 21 ++++++++-------
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
index 09e2b449f..c79548185 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h
@@ -24,6 +24,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
   int32_t out1, out2, out3, out4;
 
   uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
   uint32_t i, j, loopCnt, k, l; /* loop counters */
   uint32_t m =
       n; /* M is the number of rows. However, the matirces must be square. */
@@ -31,7 +32,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
   /* CREATE THE IDENTITY MATRIX */
 
   pDstT1 = pDst;
-  for (k = core_id * 4; k < m; k += 4 * NUM_CORES) {
+  for (k = core_id * 4; k < m; k += 4 * num_cores) {
     for (j = 0; j < m; j++) {
       pDstT1[k * m + j] = (uint32_t)(k == j);
       pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j);
@@ -39,7 +40,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
       pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j);
     }
   }
-  mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+  mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
   /* Loop over the number of columns of the input matrix. */
   loopCnt = n;
@@ -125,7 +126,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
         return 1;
       }
     }
-    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
     /* DIVIDE BY THE PIVOT */
     /* Points to the pivot row of input and destination matrices */
@@ -138,7 +139,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     in = *pPivotRowIn;
 
     ///* Loop over columns to the right of pivot */
-    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
       in1 = pSrcT1[j];
       in2 = pSrcT1[j + 1];
       in3 = pSrcT1[j + 2];
@@ -151,7 +152,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
       pSrcT1[j + 1] = out2;
       pSrcT1[j + 2] = out3;
       pSrcT1[j + 3] = out4;
-      // j += NUM_CORES * 4;
+      // j += num_cores * 4;
     }
     if (core_id == (n >> 2U) - 1) {
       j = 4 * ((n - l) >> 2U);
@@ -162,7 +163,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
       }
     }
     /* Loop over columns */
-    for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
       in1 = pSrcT2[j];
       in2 = pSrcT2[j + 1];
       in3 = pSrcT2[j + 2];
@@ -184,13 +185,13 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
         j++;
       }
     }
-    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
     /* REPLACE ROWS */
     pSrcT1 = pSrc;
     pSrcT2 = pDst;
     /* Loop over rows */
-    for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+    for (k = core_id * 4; k < m; k += num_cores * 4) {
       i = 0U;
       while (i < 4) {
         if ((i + k) != l) {
@@ -250,7 +251,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
         i++;
       }
     }
-    mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
     //        /* REPLACE ROWS */
     //        pSrcT1 = pSrc;
@@ -280,7 +281,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
     //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
     //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * NUM_CORES;
+    //                    j += 4 * num_cores;
     //                }
     //                if (core_id == (n >> 2U) - 1) {
     //                    j = 4 * ((n - l) >> 2U);
@@ -306,7 +307,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
     //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
     //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * NUM_CORES;
+    //                    j += 4 * num_cores;
     //                }
     //                if (core_id == (n >> 2U) - 1) {
     //                    j = 4 * (n >> 2U);
@@ -317,11 +318,11 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //                        j++;
     //                    }
     //                }
-    //                mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n /
+    //                mempool_log_partial_barrier(2, core_id, MIN(num_cores, n /
     //                4));
     //            }
     //        }
-    //        mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4));
+    //        mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
     pSrc++;    /* Increment the input pointer */
     loopCnt--; /* Decrement the loop counter */
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
index 6064a1faf..5015039ff 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
@@ -12,8 +12,9 @@ void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
 void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
   uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
   uint32_t i, j, k, shift;
-  for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) {
+  for (i = core_id * 4; i < n * n; i += num_cores * 4) {
     k = i / n;
     j = i % n;
     shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
index b697f9d24..3a5bfe5c0 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
+++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
@@ -26,6 +26,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
   int32_t out1, out2, out3, out4;
 
   uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = absolute_core_id;
   uint32_t i, j, k, l; /* loop counters */
   uint32_t m =
@@ -34,7 +35,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
   /* CREATE THE IDENTITY MATRIX */
 
   pDstT1 = pDst;
-  for (k = core_id * 4; k < m; k += NUM_CORES * 4) {
+  for (k = core_id * 4; k < m; k += num_cores * 4) {
     for (j = 0; j < n; j++) {
       pDstT1[k * n + j] = (uint32_t)(k == j);
       pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j);
@@ -43,7 +44,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     }
   }
   //    pDstT1 = pDst;
-  //    for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) {
+  //    for (i = absolute_core_id * 4; i < n * m; i += num_cores * 4) {
   //        k = i / n;
   //        j = i % n;
   //        pDstT1[k * n + j] = (uint32_t) (k == j);
@@ -147,8 +148,8 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     in = *pPivotRowIn;
     /* Loop over columns to the right of pivot */
     core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U);
-    core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
-    // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) {
+    core_id = core_id > num_cores ? core_id + num_cores : core_id;
+    // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
     //    in1 = pSrcT1[j];
     //    in2 = pSrcT1[j + 1];
     //    in3 = pSrcT1[j + 2];
@@ -196,8 +197,8 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     }
     /* Loop over columns */
     core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U);
-    core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id;
-    for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) {
+    core_id = core_id > num_cores ? core_id + num_cores : core_id;
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
       in1 = pSrcT2[j];
       in2 = pSrcT2[j + 1];
       in3 = pSrcT2[j + 2];
@@ -224,7 +225,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     /* REPLACE ROWS */
     pSrcT1 = pSrc;
     pSrcT2 = pDst;
-    for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) {
+    for (k = absolute_core_id / (n >> 2U); k < m; k += num_cores / (n >> 2U)) {
       /* Only the columns to the right of the pivot are to be processed */
       if (k != l) {
         pSrcT1 = pSrc + k * n;
@@ -369,7 +370,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //        uint32_t check = 0;
     //        if (absolute_core_id >= m * nPE)
     //            mempool_wfi();
-    //        for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) {
+    //        for (k = absolute_core_id / nPE; k < m; k += num_cores / nPE) {
     //            /* Only the columns to the right of the pivot are to be
     //            processed */ if (k != l) {
     //                pSrcT1 = pSrc + k * n;
@@ -504,7 +505,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //        /* REPLACE ROWS */
     //        pSrcT1 = pSrc;
     //        pSrcT2 = pDst;
-    //        for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) {
+    //        for (i = absolute_core_id * 4; i < (n * m); i += num_cores * 4) {
     //            k = i / n;
     //            if (k != l) {
     //                in = *(pSrc + k * n);
@@ -559,7 +560,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
     //        pSrcT1 = pSrc;
     //        pSrcT2 = pDst;
     //        core_id = absolute_core_id;
-    //        for (k = core_id; k < m; k += NUM_CORES) {
+    //        for (k = core_id; k < m; k += num_cores) {
     //            /* Only the columns to the right of the pivot are to be
     //            processed */ if (k != l) {
     //                pSrcT1 = pSrc + k * n;

From 4c42194546d86d3d70831551fc73e2820b3055a5 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 13 Apr 2023 09:35:42 +0200
Subject: [PATCH 20/22] [software] Move the kernels to runtime/kernel folder

[software] Add comment on algorithm
---
 software/apps/mat_inv/initialization.h        |   7 -
 software/apps/mat_inv/main.c                  |  17 +-
 software/apps/mat_inv/mempool_mat_inv_q32p.h  | 341 ----------
 .../mat_inv/mempool_mat_inv_q32p_folded.h     | 291 --------
 .../kernel/mempool_mat_inv_q32p.h}            | 626 +++++++++++++++++-
 .../kernel}/mempool_mat_inv_q32s.h            |  35 +-
 6 files changed, 653 insertions(+), 664 deletions(-)
 delete mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p.h
 delete mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
 rename software/{apps/mat_inv/mempool_mat_inv_q32p_memsized.h => runtime/kernel/mempool_mat_inv_q32p.h} (54%)
 rename software/{apps/mat_inv => runtime/kernel}/mempool_mat_inv_q32s.h (87%)

diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h
index 6e48e7951..a37d5f38c 100644
--- a/software/apps/mat_inv/initialization.h
+++ b/software/apps/mat_inv/initialization.h
@@ -31,13 +31,6 @@ void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
                        uint32_t core_id);
 
 void display(int32_t *A, int32_t n, int32_t m) {
-  // int32_t i, j;
-  // for (i = 0; i < n; i++) {
-  //  for (j = 0; j < m; j++) {
-  //    printf("%8d ", A[i * m + j]);
-  //  }
-  //  printf("\n");
-  //}
   int32_t i;
   for (i = 0; i < n * m; i++) {
     printf("Output[%d] = %8d\n", i, A[i]);
diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index ebe4eca06..c89edb079 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -11,21 +11,18 @@
 
 #define N 16
 #define M 16
-#define O 16
 #define N_BANKS (1024)
-#define N_USED_BANKS (64)
+#define N_USED_BANKS (16)
 
 #define VERBOSE
-// #define SINGLE
+#define SINGLE
 // #define PARALLEL
-#define MEMSIZED
+// #define MEMSIZED
 // #define FOLDED
 
 #include "initialization.h"
-#include "mempool_mat_inv_q32p.h"
-#include "mempool_mat_inv_q32p_folded.h"
-#include "mempool_mat_inv_q32p_memsized.h"
-#include "mempool_mat_inv_q32s.h"
+#include "kernel/mempool_mat_inv_q32p.h"
+#include "kernel/mempool_mat_inv_q32s.h"
 
 #ifdef FOLDED
 int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1")));
@@ -107,7 +104,7 @@ void multi_core_memsized() {
   mempool_barrier(num_cores);
 
   mempool_start_benchmark();
-  mempool_GJinv_q32p_memsized(matrix, inv, M, &flag);
+  mempool_GJinv_memsized_q32p(matrix, inv, M, &flag);
   mempool_stop_benchmark();
 
   mempool_barrier(num_cores);
@@ -141,7 +138,7 @@ void multi_core_folded() {
   mempool_stop_benchmark();
   if (core_id < nPE) {
     mempool_start_benchmark();
-    mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE);
+    mempool_GJinv_folded_q32p(folded_matrix, inv, M, &flag, nPE);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h
deleted file mode 100644
index c79548185..000000000
--- a/software/apps/mat_inv/mempool_mat_inv_q32p.h
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/* GAUSS JORDAN INVERSION */
-
-int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                       uint32_t *flag);
-
-int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                       uint32_t *flag) {
-
-  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
-  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
-  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
-  int32_t *pPRT_in, *pPivotRowDst,
-      *pPRT_pDst; /* Temporary input and output data matrix pointer */
-
-  int32_t in = 0;
-  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-  int32_t in1, in2, in3, in4;
-  int32_t out1, out2, out3, out4;
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t i, j, loopCnt, k, l; /* loop counters */
-  uint32_t m =
-      n; /* M is the number of rows. However, the matirces must be square. */
-
-  /* CREATE THE IDENTITY MATRIX */
-
-  pDstT1 = pDst;
-  for (k = core_id * 4; k < m; k += 4 * num_cores) {
-    for (j = 0; j < m; j++) {
-      pDstT1[k * m + j] = (uint32_t)(k == j);
-      pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j);
-      pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j);
-      pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j);
-    }
-  }
-  mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
-  /* Loop over the number of columns of the input matrix. */
-  loopCnt = n;
-  /* Index modifier to navigate through the columns */
-  l = 0U;
-
-  while (loopCnt > 0U) {
-
-    pSrcT1 = pSrc + (l * n);
-    pDstT1 = pDst + (l * n);
-    in = *pSrcT1;
-
-    /* CHECK IF PIVOT ELEMENT IS ZERO */
-    if (core_id == 0) {
-      if (in == 0U) {
-        /* Loop over the rows present below */
-        for (k = l + 1U; k < m; k++) {
-          pSrcT2 = pSrc + (n * k);
-          pDstT2 = pDst + (n * k);
-          /* EXCHANGE */
-          if (*pSrcT2 != 0) {
-            /* Loop over colums to the right of the pivot */
-            j = 0;
-            while (j < 4 * ((n - l) >> 2U)) {
-              Xchg1 = pSrcT2[j];
-              Xchg2 = pSrcT2[j + 1];
-              Xchg3 = pSrcT2[j + 2];
-              Xchg4 = pSrcT2[j + 3];
-              out1 = pSrcT1[j];
-              out2 = pSrcT1[j + 1];
-              out3 = pSrcT1[j + 2];
-              out4 = pSrcT1[j + 3];
-              pSrcT2[j] = out1;
-              pSrcT2[j + 1] = out2;
-              pSrcT2[j + 2] = out3;
-              pSrcT2[j + 3] = out4;
-              pSrcT1[j] = Xchg1;
-              pSrcT1[j + 1] = Xchg2;
-              pSrcT1[j + 2] = Xchg3;
-              pSrcT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n - l) {
-              Xchg1 = pSrcT2[j];
-              pSrcT2[j] = pSrcT1[j];
-              pSrcT1[j] = Xchg1;
-              j++;
-            }
-            /* Loop over colums */
-            j = 0;
-            while (j < 4 * (n >> 2U)) {
-              Xchg1 = pDstT2[j];
-              Xchg2 = pDstT2[j + 1];
-              Xchg3 = pDstT2[j + 2];
-              Xchg4 = pDstT2[j + 3];
-              out1 = pDstT1[j];
-              out2 = pDstT1[j + 1];
-              out3 = pDstT1[j + 2];
-              out4 = pDstT1[j + 3];
-              pDstT2[j] = out1;
-              pDstT2[j + 1] = out2;
-              pDstT2[j + 2] = out3;
-              pDstT2[j + 3] = out4;
-              pDstT1[j] = Xchg1;
-              pDstT1[j + 1] = Xchg2;
-              pDstT1[j + 2] = Xchg3;
-              pDstT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n) {
-              Xchg1 = pDstT2[j];
-              pDstT2[j] = pDstT1[j];
-              pDstT1[j] = Xchg1;
-              j++;
-            }
-            *flag = 1U;
-            break;
-          }
-        }
-      }
-      /* Update the status if the matrix is singular */
-      if ((*flag == 0U) && (in == 0U)) {
-        return 1;
-      }
-    }
-    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
-    /* DIVIDE BY THE PIVOT */
-    /* Points to the pivot row of input and destination matrices */
-    pPivotRowIn = pSrc + (l * n);
-    pPivotRowDst = pDst + (l * n);
-    /* Temporary pointers to the pivot row pointers */
-    pSrcT1 = pPivotRowIn;
-    pSrcT2 = pPivotRowDst;
-    /* Pivot element of the row */
-    in = *pPivotRowIn;
-
-    ///* Loop over columns to the right of pivot */
-    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
-      in1 = pSrcT1[j];
-      in2 = pSrcT1[j + 1];
-      in3 = pSrcT1[j + 2];
-      in4 = pSrcT1[j + 3];
-      out1 = FIX_DIV(in1, in);
-      out2 = FIX_DIV(in2, in);
-      out3 = FIX_DIV(in3, in);
-      out4 = FIX_DIV(in4, in);
-      pSrcT1[j] = out1;
-      pSrcT1[j + 1] = out2;
-      pSrcT1[j + 2] = out3;
-      pSrcT1[j + 3] = out4;
-      // j += num_cores * 4;
-    }
-    if (core_id == (n >> 2U) - 1) {
-      j = 4 * ((n - l) >> 2U);
-      while (j < n - l) {
-        in1 = pSrcT1[j];
-        pSrcT1[j] = FIX_DIV(in1, in);
-        j++;
-      }
-    }
-    /* Loop over columns */
-    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
-      in1 = pSrcT2[j];
-      in2 = pSrcT2[j + 1];
-      in3 = pSrcT2[j + 2];
-      in4 = pSrcT2[j + 3];
-      out1 = FIX_DIV(in1, in);
-      out2 = FIX_DIV(in2, in);
-      out3 = FIX_DIV(in3, in);
-      out4 = FIX_DIV(in4, in);
-      pSrcT2[j] = out1;
-      pSrcT2[j + 1] = out2;
-      pSrcT2[j + 2] = out3;
-      pSrcT2[j + 3] = out4;
-    }
-    if (core_id == (n >> 2U) - 1) {
-      j = 4 * (n >> 2U);
-      while (j < n) {
-        in1 = pSrcT2[j];
-        pSrcT2[j] = FIX_DIV(in1, in);
-        j++;
-      }
-    }
-    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
-    /* REPLACE ROWS */
-    pSrcT1 = pSrc;
-    pSrcT2 = pDst;
-    /* Loop over rows */
-    for (k = core_id * 4; k < m; k += num_cores * 4) {
-      i = 0U;
-      while (i < 4) {
-        if ((i + k) != l) {
-          pSrcT1 = pSrc + (i + k) * n;
-          pSrcT2 = pDst + (i + k) * n;
-          /* Element of the reference row */
-          in = *pSrcT1;
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-          /* Loop over columns to the right of pivot */
-          j = 0;
-          while (j < 4 * ((n - l) >> 2U)) {
-            in1 = pSrcT1[j];
-            in2 = pSrcT1[j + 1];
-            in3 = pSrcT1[j + 2];
-            in4 = pSrcT1[j + 3];
-            out1 = pPRT_in[j];
-            out2 = pPRT_in[j + 1];
-            out3 = pPRT_in[j + 2];
-            out4 = pPRT_in[j + 3];
-            pSrcT1[j] = in1 - FIX_MUL(in, out1);
-            pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-            pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-            pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-            j += 4;
-          }
-          while (j < n - l) {
-            in1 = pSrcT1[j];
-            out1 = pPRT_in[j];
-            pSrcT1[j] = in1 - FIX_MUL(in, out1);
-            j++;
-          }
-          /* Loop over columns */
-          j = 0;
-          while (j < 4 * (n >> 2U)) {
-            in1 = pSrcT2[j];
-            in2 = pSrcT2[j + 1];
-            in3 = pSrcT2[j + 2];
-            in4 = pSrcT2[j + 3];
-            out1 = pPRT_pDst[j];
-            out2 = pPRT_pDst[j + 1];
-            out3 = pPRT_pDst[j + 2];
-            out4 = pPRT_pDst[j + 3];
-            pSrcT2[j] = in1 - FIX_MUL(in, out1);
-            pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-            pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-            pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-            j += 4;
-          }
-          while (j < n) {
-            in1 = pSrcT2[j];
-            out1 = pPRT_pDst[j];
-            pSrcT2[j] = in1 - FIX_MUL(in, out1);
-            j++;
-          }
-        }
-        i++;
-      }
-    }
-    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
-    //        /* REPLACE ROWS */
-    //        pSrcT1 = pSrc;
-    //        pSrcT2 = pDst;
-    //        /* Loop over rows */
-    //        for (k = 0; k < m; k++) {
-    //            if (k != l) {
-    //                pSrcT1 = pSrc + k * n;
-    //                pSrcT2 = pDst + k * n;
-    //                /* Element of the reference row */
-    //                in = *pSrcT1;
-    //                pPRT_in = pPivotRowIn;
-    //                pPRT_pDst = pPivotRowDst;
-    //                /* Loop over columns to the right of pivot */
-    //                j = core_id * 4;
-    //                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n
-    //                - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) {
-    //                    in1 = pSrcT1[j];
-    //                    in2 = pSrcT1[j + 1];
-    //                    in3 = pSrcT1[j + 2];
-    //                    in4 = pSrcT1[j + 3];
-    //                    out1 = pPRT_in[j];
-    //                    out2 = pPRT_in[j + 1];
-    //                    out3 = pPRT_in[j + 2];
-    //                    out4 = pPRT_in[j + 3];
-    //                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * num_cores;
-    //                }
-    //                if (core_id == (n >> 2U) - 1) {
-    //                    j = 4 * ((n - l) >> 2U);
-    //                    while (j < n - l) {
-    //                        in1 = pSrcT1[j];
-    //                        out1 = pPRT_in[j];
-    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                        j++;
-    //                    }
-    //                }
-    //                /* Loop over columns */
-    //                j = core_id * 4;
-    //                while (j < 4 * (n >> 2U)) {
-    //                    in1 = pSrcT2[j];
-    //                    in2 = pSrcT2[j + 1];
-    //                    in3 = pSrcT2[j + 2];
-    //                    in4 = pSrcT2[j + 3];
-    //                    out1 = pPRT_pDst[j];
-    //                    out2 = pPRT_pDst[j + 1];
-    //                    out3 = pPRT_pDst[j + 2];
-    //                    out4 = pPRT_pDst[j + 3];
-    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * num_cores;
-    //                }
-    //                if (core_id == (n >> 2U) - 1) {
-    //                    j = 4 * (n >> 2U);
-    //                    while (j < n) {
-    //                        in1 = pSrcT2[j];
-    //                        out1 = pPRT_pDst[j];
-    //                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-    //                        j++;
-    //                    }
-    //                }
-    //                mempool_log_partial_barrier(2, core_id, MIN(num_cores, n /
-    //                4));
-    //            }
-    //        }
-    //        mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
-    pSrc++;    /* Increment the input pointer */
-    loopCnt--; /* Decrement the loop counter */
-    l++;       /* Increment the index modifier */
-  }
-
-  //    if ((flag != 1U) && (x == 0)) {
-  //        for (i = 0; i < m * n; i++) {
-  //            if (pSrc[i] != 0)
-  //                break;
-  //        }
-  //        if (i == m * n)
-  //            return 1;
-  //    }
-  return 0;
-}
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
deleted file mode 100644
index 5015039ff..000000000
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/* GAUSS JORDAN INVERSION */
-
-int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                              uint32_t *flag, uint32_t nPE);
-void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
-
-void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t i, j, k, shift;
-  for (i = core_id * 4; i < n * n; i += num_cores * 4) {
-    k = i / n;
-    j = i % n;
-    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-    pDst[shift + j] = pSrc[i];
-    pDst[shift + j + 1] = pSrc[i + 1];
-    pDst[shift + j + 2] = pSrc[i + 2];
-    pDst[shift + j + 3] = pSrc[i + 3];
-  }
-  mempool_log_barrier(2, core_id);
-}
-
-int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                              uint32_t *flag, uint32_t nPE) {
-
-  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
-  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
-  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
-  int32_t *pPRT_in, *pPivotRowDst,
-      *pPRT_pDst; /* Temporary input and output data matrix pointer */
-
-  int32_t in = 0;
-  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-  int32_t in1, in2, in3, in4;
-  int32_t out1, out2, out3, out4;
-
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t core_id = absolute_core_id;
-  uint32_t shift = 0;
-  uint32_t i, j, k, l; /* loop counters */
-  uint32_t m =
-      n; /* M is the number of rows. However, the matrices must be square. */
-
-  /* CREATE THE IDENTITY MATRIX */
-  pDstT1 = pDst;
-  for (i = core_id * 4; i < n * m; i += nPE * 4) {
-    k = i / n;
-    j = i % n;
-    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-    pDstT1[shift + j] = (uint32_t)(k == j);
-    pDstT1[shift + j + 1] = (uint32_t)(k == (j + 1));
-    pDstT1[shift + j + 2] = (uint32_t)(k == (j + 2));
-    pDstT1[shift + j + 3] = (uint32_t)(k == (j + 3));
-  }
-  mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-  /* Index modifier to navigate through the columns */
-  l = 0U;
-  while (l < n) {
-
-    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
-    pSrcT1 = pSrc + shift;
-    pDstT1 = pDst + shift;
-    in = *pSrcT1;
-
-    /* CHECK IF PIVOT ELEMENT IS ZERO */
-    if (absolute_core_id == 0) {
-      if (in == 0U) {
-        /* Loop over the rows present below */
-        for (k = l + 1U; k < m; k++) {
-          shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-          pSrcT2 = pSrc + shift;
-          pDstT2 = pDst + shift;
-          /* EXCHANGE */
-          if (*pSrcT2 != 0) {
-            /* Loop over colums to the right of the pivot */
-            j = 0;
-            while (j < 4 * ((n - l) >> 2U)) {
-              Xchg1 = pSrcT2[j];
-              Xchg2 = pSrcT2[j + 1];
-              Xchg3 = pSrcT2[j + 2];
-              Xchg4 = pSrcT2[j + 3];
-              out1 = pSrcT1[j];
-              out2 = pSrcT1[j + 1];
-              out3 = pSrcT1[j + 2];
-              out4 = pSrcT1[j + 3];
-              pSrcT2[j] = out1;
-              pSrcT2[j + 1] = out2;
-              pSrcT2[j + 2] = out3;
-              pSrcT2[j + 3] = out4;
-              pSrcT1[j] = Xchg1;
-              pSrcT1[j + 1] = Xchg2;
-              pSrcT1[j + 2] = Xchg3;
-              pSrcT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n - l) {
-              Xchg1 = pSrcT2[j];
-              pSrcT2[j] = pSrcT1[j];
-              pSrcT1[j] = Xchg1;
-              j++;
-            }
-            /* Loop over colums */
-            j = 0;
-            while (j < 4 * (n >> 2U)) {
-              Xchg1 = pDstT2[j];
-              Xchg2 = pDstT2[j + 1];
-              Xchg3 = pDstT2[j + 2];
-              Xchg4 = pDstT2[j + 3];
-              out1 = pDstT1[j];
-              out2 = pDstT1[j + 1];
-              out3 = pDstT1[j + 2];
-              out4 = pDstT1[j + 3];
-              pDstT2[j] = out1;
-              pDstT2[j + 1] = out2;
-              pDstT2[j + 2] = out3;
-              pDstT2[j + 3] = out4;
-              pDstT1[j] = Xchg1;
-              pDstT1[j + 1] = Xchg2;
-              pDstT1[j + 2] = Xchg3;
-              pDstT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n) {
-              Xchg1 = pDstT2[j];
-              pDstT2[j] = pDstT1[j];
-              pDstT1[j] = Xchg1;
-              j++;
-            }
-            *flag = 1U;
-            break;
-          }
-        }
-      }
-      /* Update the status if the matrix is singular */
-      if ((*flag == 0U) && (in == 0U)) {
-        return 1;
-      }
-    }
-    mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-    /* DIVIDE BY THE PIVOT */
-    /* Points to the pivot row of input and destination matrices */
-    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
-    pPivotRowIn = pSrc + shift;
-    pPivotRowDst = pDst + shift;
-    /* Temporary pointers to the pivot row pointers */
-    pSrcT1 = pPivotRowIn;
-    pSrcT2 = pPivotRowDst;
-    /* Pivot element of the row */
-    in = *pPivotRowIn;
-
-    /* Loop over columns to the right of pivot */
-    core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
-    core_id = core_id > nPE ? core_id + nPE : core_id;
-    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
-      in1 = pSrcT1[j];
-      in2 = pSrcT1[j + 1];
-      in3 = pSrcT1[j + 2];
-      in4 = pSrcT1[j + 3];
-      out1 = FIX_DIV(in1, in);
-      out2 = FIX_DIV(in2, in);
-      out3 = FIX_DIV(in3, in);
-      out4 = FIX_DIV(in4, in);
-      pSrcT1[j] = out1;
-      pSrcT1[j + 1] = out2;
-      pSrcT1[j + 2] = out3;
-      pSrcT1[j + 3] = out4;
-    }
-    if (core_id == 0) {
-      j = 4 * ((n - l) >> 2U);
-      while (j < n - l) {
-        in1 = pSrcT1[j];
-        pSrcT1[j] = FIX_DIV(in1, in);
-        j++;
-      }
-    }
-
-    /* Loop over columns */
-    core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
-    core_id = core_id > nPE ? core_id + nPE : core_id;
-    for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
-      in1 = pSrcT2[j];
-      in2 = pSrcT2[j + 1];
-      in3 = pSrcT2[j + 2];
-      in4 = pSrcT2[j + 3];
-      out1 = FIX_DIV(in1, in);
-      out2 = FIX_DIV(in2, in);
-      out3 = FIX_DIV(in3, in);
-      out4 = FIX_DIV(in4, in);
-      pSrcT2[j] = out1;
-      pSrcT2[j + 1] = out2;
-      pSrcT2[j + 2] = out3;
-      pSrcT2[j + 3] = out4;
-    }
-    if (core_id == (n >> 2U) - 1) {
-      j = 4 * (n >> 2U);
-      while (j < n) {
-        in1 = pSrcT2[j];
-        pSrcT2[j] = FIX_DIV(in1, in);
-        j++;
-      }
-    }
-    mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-    /* REPLACE ROWS */
-    pSrcT1 = pSrc;
-    pSrcT2 = pDst;
-    for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
-      /* Only the columns to the right of the pivot are to be processed */
-      if (k != l) {
-        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
-        pSrcT1 = pSrc + shift;
-        pSrcT2 = pDst + shift;
-        /* Element of the reference row */
-        in = *pSrcT1;
-        /* Reference row pointers */
-        pPRT_in = pPivotRowIn;
-        pPRT_pDst = pPivotRowDst;
-        /* Loop over the columns */
-        core_id = absolute_core_id % (n >> 2U);
-        core_id = core_id - (l >> 2U);
-        j = core_id * 4;
-        while (j < 4 * ((n - l) >> 2U)) {
-          out1 = pPRT_in[j];
-          out2 = pPRT_in[j + 1];
-          out3 = pPRT_in[j + 2];
-          out4 = pPRT_in[j + 3];
-          in1 = pSrcT1[j];
-          in2 = pSrcT1[j + 1];
-          in3 = pSrcT1[j + 2];
-          in4 = pSrcT1[j + 3];
-          pSrcT1[j] = in1 - FIX_MUL(in, out1);
-          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-          j += 4 * (n >> 2U);
-        }
-        if (core_id == 0) {
-          j = 4 * ((n - l) >> 2U);
-          while (j < n - l) {
-            in1 = pSrcT1[j];
-            out1 = pPRT_in[j];
-            pSrcT1[j] = in1 - FIX_MUL(in, out1);
-            j++;
-          }
-        }
-        core_id = absolute_core_id % (n >> 2U);
-        /* Loop over the columns */
-        j = core_id * 4;
-        while (j < 4 * (n >> 2U)) {
-          out1 = pPRT_pDst[j];
-          out2 = pPRT_pDst[j + 1];
-          out3 = pPRT_pDst[j + 2];
-          out4 = pPRT_pDst[j + 3];
-          in1 = pSrcT2[j];
-          in2 = pSrcT2[j + 1];
-          in3 = pSrcT2[j + 2];
-          in4 = pSrcT2[j + 3];
-          pSrcT2[j] = in1 - FIX_MUL(in, out1);
-          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-          j += 4 * (n >> 2U);
-        }
-        if (core_id == (n >> 2U) - 1) {
-          j = 4 * (n >> 2U);
-          while (j < n) {
-            in1 = pSrcT2[j];
-            out1 = pPRT_pDst[j];
-            pSrcT2[j] = in1 - FIX_MUL(in, out1);
-            j++;
-          }
-        }
-      }
-    }
-    mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-    pSrc++; /* Increment the input pointer */
-    l++;    /* Increment the index modifier */
-  }
-  mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-  return 0;
-}
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/runtime/kernel/mempool_mat_inv_q32p.h
similarity index 54%
rename from software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
rename to software/runtime/kernel/mempool_mat_inv_q32p.h
index 3a5bfe5c0..42b26eb21 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h
+++ b/software/runtime/kernel/mempool_mat_inv_q32p.h
@@ -8,10 +8,356 @@
 
 uint32_t volatile pivot_barrier __attribute__((section(".l1")));
 
-int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                                uint32_t *flag);
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
-int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
+void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t i, j, k, shift;
+  for (i = core_id * 4; i < n * n; i += num_cores * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDst[shift + j] = pSrc[i];
+    pDst[shift + j + 1] = pSrc[i + 1];
+    pDst[shift + j + 2] = pSrc[i + 2];
+    pDst[shift + j + 3] = pSrc[i + 3];
+  }
+  mempool_log_barrier(2, core_id);
+}
+
+int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                       uint32_t *flag) {
+
+  int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn;     /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t i, j, loopCnt, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matrices must be square. */
+
+  /* CREATE THE IDENTITY MATRIX */
+
+  pDstT1 = pDst;
+  for (k = core_id * 4; k < m; k += 4 * num_cores) {
+    for (j = 0; j < m; j++) {
+      pDstT1[k * m + j] = (int32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j);
+    }
+  }
+  mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+  /* Loop over the number of columns of the input matrix. */
+  loopCnt = n;
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+
+  while (loopCnt > 0U) {
+
+    pSrcT1 = pSrc + (l * n);
+    pDstT1 = pDst + (l * n);
+    in = *pSrcT1;
+
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          pSrcT2 = pSrc + (n * k);
+          pDstT2 = pDst + (n * k);
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over columns to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
+            }
+            /* Loop over columns */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
+        }
+      }
+      /* Singular matrix: core 0 returns 1 here, before the barrier */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    pPivotRowIn = pSrc + (l * n);
+    pPivotRowDst = pDst + (l * n);
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+
+    /* Loop over columns to the right of pivot */
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+      // j += num_cores * 4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    /* Loop over columns */
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    /* Loop over rows */
+    for (k = core_id * 4; k < m; k += num_cores * 4) {
+      i = 0U;
+      while (i < 4) {
+        if ((i + k) != l) {
+          pSrcT1 = pSrc + (i + k) * n;
+          pSrcT2 = pDst + (i + k) * n;
+          /* Element of the reference row */
+          in = *pSrcT1;
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+          /* Loop over columns to the right of pivot */
+          j = 0;
+          while (j < 4 * ((n - l) >> 2U)) {
+            in1 = pSrcT1[j];
+            in2 = pSrcT1[j + 1];
+            in3 = pSrcT1[j + 2];
+            in4 = pSrcT1[j + 3];
+            out1 = pPRT_in[j];
+            out2 = pPRT_in[j + 1];
+            out3 = pPRT_in[j + 2];
+            out4 = pPRT_in[j + 3];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+          /* Loop over columns */
+          j = 0;
+          while (j < 4 * (n >> 2U)) {
+            in1 = pSrcT2[j];
+            in2 = pSrcT2[j + 1];
+            in3 = pSrcT2[j + 2];
+            in4 = pSrcT2[j + 3];
+            out1 = pPRT_pDst[j];
+            out2 = pPRT_pDst[j + 1];
+            out3 = pPRT_pDst[j + 2];
+            out4 = pPRT_pDst[j + 3];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+            pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+            pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+            j += 4;
+          }
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+        i++;
+      }
+    }
+    mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    //        /* REPLACE ROWS */
+    //        pSrcT1 = pSrc;
+    //        pSrcT2 = pDst;
+    //        /* Loop over rows */
+    //        for (k = 0; k < m; k++) {
+    //            if (k != l) {
+    //                pSrcT1 = pSrc + k * n;
+    //                pSrcT2 = pDst + k * n;
+    //                /* Element of the reference row */
+    //                in = *pSrcT1;
+    //                pPRT_in = pPivotRowIn;
+    //                pPRT_pDst = pPivotRowDst;
+    //                /* Loop over columns to the right of pivot */
+    //                j = core_id * 4;
+    //                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n
+    //                - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) {
+    //                    in1 = pSrcT1[j];
+    //                    in2 = pSrcT1[j + 1];
+    //                    in3 = pSrcT1[j + 2];
+    //                    in4 = pSrcT1[j + 3];
+    //                    out1 = pPRT_in[j];
+    //                    out2 = pPRT_in[j + 1];
+    //                    out3 = pPRT_in[j + 2];
+    //                    out4 = pPRT_in[j + 3];
+    //                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
+    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * num_cores;
+    //                }
+    //                if (core_id == (n >> 2U) - 1) {
+    //                    j = 4 * ((n - l) >> 2U);
+    //                    while (j < n - l) {
+    //                        in1 = pSrcT1[j];
+    //                        out1 = pPRT_in[j];
+    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
+    //                        j++;
+    //                    }
+    //                }
+    //                /* Loop over columns */
+    //                j = core_id * 4;
+    //                while (j < 4 * (n >> 2U)) {
+    //                    in1 = pSrcT2[j];
+    //                    in2 = pSrcT2[j + 1];
+    //                    in3 = pSrcT2[j + 2];
+    //                    in4 = pSrcT2[j + 3];
+    //                    out1 = pPRT_pDst[j];
+    //                    out2 = pPRT_pDst[j + 1];
+    //                    out3 = pPRT_pDst[j + 2];
+    //                    out4 = pPRT_pDst[j + 3];
+    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
+    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+    //                    j += 4 * num_cores;
+    //                }
+    //                if (core_id == (n >> 2U) - 1) {
+    //                    j = 4 * (n >> 2U);
+    //                    while (j < n) {
+    //                        in1 = pSrcT2[j];
+    //                        out1 = pPRT_pDst[j];
+    //                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
+    //                        j++;
+    //                    }
+    //                }
+    //                mempool_log_partial_barrier(2, core_id, MIN(num_cores, n /
+    //                4));
+    //            }
+    //        }
+    //        mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
+
+    pSrc++;    /* Increment the input pointer */
+    loopCnt--; /* Decrement the loop counter */
+    l++;       /* Increment the index modifier */
+  }
+
+  //    if ((flag != 1U) && (x == 0)) {
+  //        for (i = 0; i < m * n; i++) {
+  //            if (pSrc[i] != 0)
+  //                break;
+  //        }
+  //        if (i == m * n)
+  //            return 1;
+  //    }
+  return 0;
+}
+
+int mempool_GJinv_memsized_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
                                 uint32_t *flag) {
 
   int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
@@ -28,7 +374,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
   uint32_t absolute_core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = absolute_core_id;
-  uint32_t i, j, k, l; /* loop counters */
+  uint32_t j, k, l; /* loop counters */
   uint32_t m =
       n; /* M is the number of rows. However, the matirces must be square. */
 
@@ -37,10 +383,10 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
   pDstT1 = pDst;
   for (k = core_id * 4; k < m; k += num_cores * 4) {
     for (j = 0; j < n; j++) {
-      pDstT1[k * n + j] = (uint32_t)(k == j);
-      pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j);
-      pDstT1[(k + 2) * n + j] = (uint32_t)((k + 2) == j);
-      pDstT1[(k + 3) * n + j] = (uint32_t)((k + 3) == j);
+      pDstT1[k * n + j] = (int32_t)(k == j);
+      pDstT1[(k + 1) * n + j] = (int32_t)((k + 1) == j);
+      pDstT1[(k + 2) * n + j] = (int32_t)((k + 2) == j);
+      pDstT1[(k + 3) * n + j] = (int32_t)((k + 3) == j);
     }
   }
   //    pDstT1 = pDst;
@@ -627,3 +973,267 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n,
 
   return 0;
 }
+
+int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
+                              uint32_t *flag, uint32_t nPE) {
+
+  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
+  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
+  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
+  int32_t *pPRT_in, *pPivotRowDst,
+      *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+  int32_t in = 0;
+  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
+  int32_t in1, in2, in3, in4;
+  int32_t out1, out2, out3, out4;
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  uint32_t shift = 0;
+  uint32_t i, j, k, l; /* loop counters */
+  uint32_t m =
+      n; /* M is the number of rows. However, the matrices must be square. */
+
+  /* CREATE THE IDENTITY MATRIX */
+  pDstT1 = pDst;
+  for (i = core_id * 4; i < n * m; i += nPE * 4) {
+    k = i / n;
+    j = i % n;
+    shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+    pDstT1[shift + j] = (int32_t)(k == j);
+    pDstT1[shift + j + 1] = (int32_t)(k == (j + 1));
+    pDstT1[shift + j + 2] = (int32_t)(k == (j + 2));
+    pDstT1[shift + j + 3] = (int32_t)(k == (j + 3));
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  /* Index modifier to navigate through the columns */
+  l = 0U;
+  while (l < n) {
+
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pSrcT1 = pSrc + shift;
+    pDstT1 = pDst + shift;
+    in = *pSrcT1;
+
+    /* CHECK IF PIVOT ELEMENT IS ZERO */
+    if (absolute_core_id == 0) {
+      if (in == 0U) {
+        /* Loop over the rows present below */
+        for (k = l + 1U; k < m; k++) {
+          shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+          pSrcT2 = pSrc + shift;
+          pDstT2 = pDst + shift;
+          /* EXCHANGE */
+          if (*pSrcT2 != 0) {
+            /* Loop over columns to the right of the pivot */
+            j = 0;
+            while (j < 4 * ((n - l) >> 2U)) {
+              Xchg1 = pSrcT2[j];
+              Xchg2 = pSrcT2[j + 1];
+              Xchg3 = pSrcT2[j + 2];
+              Xchg4 = pSrcT2[j + 3];
+              out1 = pSrcT1[j];
+              out2 = pSrcT1[j + 1];
+              out3 = pSrcT1[j + 2];
+              out4 = pSrcT1[j + 3];
+              pSrcT2[j] = out1;
+              pSrcT2[j + 1] = out2;
+              pSrcT2[j + 2] = out3;
+              pSrcT2[j + 3] = out4;
+              pSrcT1[j] = Xchg1;
+              pSrcT1[j + 1] = Xchg2;
+              pSrcT1[j + 2] = Xchg3;
+              pSrcT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n - l) {
+              Xchg1 = pSrcT2[j];
+              pSrcT2[j] = pSrcT1[j];
+              pSrcT1[j] = Xchg1;
+              j++;
+            }
+            /* Loop over columns */
+            j = 0;
+            while (j < 4 * (n >> 2U)) {
+              Xchg1 = pDstT2[j];
+              Xchg2 = pDstT2[j + 1];
+              Xchg3 = pDstT2[j + 2];
+              Xchg4 = pDstT2[j + 3];
+              out1 = pDstT1[j];
+              out2 = pDstT1[j + 1];
+              out3 = pDstT1[j + 2];
+              out4 = pDstT1[j + 3];
+              pDstT2[j] = out1;
+              pDstT2[j + 1] = out2;
+              pDstT2[j + 2] = out3;
+              pDstT2[j + 3] = out4;
+              pDstT1[j] = Xchg1;
+              pDstT1[j + 1] = Xchg2;
+              pDstT1[j + 2] = Xchg3;
+              pDstT1[j + 3] = Xchg4;
+              j += 4;
+            }
+            while (j < n) {
+              Xchg1 = pDstT2[j];
+              pDstT2[j] = pDstT1[j];
+              pDstT1[j] = Xchg1;
+              j++;
+            }
+            *flag = 1U;
+            break;
+          }
+        }
+      }
+      /* Singular matrix: core 0 returns 1 here, before the barrier */
+      if ((*flag == 0U) && (in == 0U)) {
+        return 1;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* DIVIDE BY THE PIVOT */
+    /* Points to the pivot row of input and destination matrices */
+    shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS;
+    pPivotRowIn = pSrc + shift;
+    pPivotRowDst = pDst + shift;
+    /* Temporary pointers to the pivot row pointers */
+    pSrcT1 = pPivotRowIn;
+    pSrcT2 = pPivotRowDst;
+    /* Pivot element of the row */
+    in = *pPivotRowIn;
+
+    /* Loop over columns to the right of pivot */
+    core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) {
+      in1 = pSrcT1[j];
+      in2 = pSrcT1[j + 1];
+      in3 = pSrcT1[j + 2];
+      in4 = pSrcT1[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT1[j] = out1;
+      pSrcT1[j + 1] = out2;
+      pSrcT1[j + 2] = out3;
+      pSrcT1[j + 3] = out4;
+    }
+    if (core_id == 0) {
+      j = 4 * ((n - l) >> 2U);
+      while (j < n - l) {
+        in1 = pSrcT1[j];
+        pSrcT1[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+
+    /* Loop over columns */
+    core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U);
+    core_id = core_id > nPE ? core_id + nPE : core_id;
+    for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) {
+      in1 = pSrcT2[j];
+      in2 = pSrcT2[j + 1];
+      in3 = pSrcT2[j + 2];
+      in4 = pSrcT2[j + 3];
+      out1 = FIX_DIV(in1, in);
+      out2 = FIX_DIV(in2, in);
+      out3 = FIX_DIV(in3, in);
+      out4 = FIX_DIV(in4, in);
+      pSrcT2[j] = out1;
+      pSrcT2[j + 1] = out2;
+      pSrcT2[j + 2] = out3;
+      pSrcT2[j + 3] = out4;
+    }
+    if (core_id == (n >> 2U) - 1) {
+      j = 4 * (n >> 2U);
+      while (j < n) {
+        in1 = pSrcT2[j];
+        pSrcT2[j] = FIX_DIV(in1, in);
+        j++;
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    /* REPLACE ROWS */
+    pSrcT1 = pSrc;
+    pSrcT2 = pDst;
+    for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) {
+      /* The pivot row itself is skipped; all other rows are updated */
+      if (k != l) {
+        shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS;
+        pSrcT1 = pSrc + shift;
+        pSrcT2 = pDst + shift;
+        /* Element of the reference row */
+        in = *pSrcT1;
+        /* Reference row pointers */
+        pPRT_in = pPivotRowIn;
+        pPRT_pDst = pPivotRowDst;
+        /* Loop over the columns */
+        core_id = absolute_core_id % (n >> 2U);
+        core_id = core_id - (l >> 2U);
+        j = core_id * 4;
+        while (j < 4 * ((n - l) >> 2U)) {
+          out1 = pPRT_in[j];
+          out2 = pPRT_in[j + 1];
+          out3 = pPRT_in[j + 2];
+          out4 = pPRT_in[j + 3];
+          in1 = pSrcT1[j];
+          in2 = pSrcT1[j + 1];
+          in3 = pSrcT1[j + 2];
+          in4 = pSrcT1[j + 3];
+          pSrcT1[j] = in1 - FIX_MUL(in, out1);
+          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
+        }
+        if (core_id == 0) {
+          j = 4 * ((n - l) >> 2U);
+          while (j < n - l) {
+            in1 = pSrcT1[j];
+            out1 = pPRT_in[j];
+            pSrcT1[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+        core_id = absolute_core_id % (n >> 2U);
+        /* Loop over the columns */
+        j = core_id * 4;
+        while (j < 4 * (n >> 2U)) {
+          out1 = pPRT_pDst[j];
+          out2 = pPRT_pDst[j + 1];
+          out3 = pPRT_pDst[j + 2];
+          out4 = pPRT_pDst[j + 3];
+          in1 = pSrcT2[j];
+          in2 = pSrcT2[j + 1];
+          in3 = pSrcT2[j + 2];
+          in4 = pSrcT2[j + 3];
+          pSrcT2[j] = in1 - FIX_MUL(in, out1);
+          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
+          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
+          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
+          j += 4 * (n >> 2U);
+        }
+        if (core_id == (n >> 2U) - 1) {
+          j = 4 * (n >> 2U);
+          while (j < n) {
+            in1 = pSrcT2[j];
+            out1 = pPRT_pDst[j];
+            pSrcT2[j] = in1 - FIX_MUL(in, out1);
+            j++;
+          }
+        }
+      }
+    }
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+    pSrc++; /* Increment the input pointer */
+    l++;    /* Increment the index modifier */
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  return 0;
+}
diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h
similarity index 87%
rename from software/apps/mat_inv/mempool_mat_inv_q32s.h
rename to software/runtime/kernel/mempool_mat_inv_q32s.h
index a20b918e0..0d4c77c7a 100644
--- a/software/apps/mat_inv/mempool_mat_inv_q32s.h
+++ b/software/runtime/kernel/mempool_mat_inv_q32s.h
@@ -8,6 +8,21 @@
 
 int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
+/* GAUSS JORDAN ALGORITHM
+  - Form the augmented matrix by appending the identity matrix
+  - LOOP OVER ROWS ...
+  - Check if the element on the diagonal of the input matrix is zero
+    > If it is zero, look for a nonzero element in the same column in one of
+      the rows below
+    > Exchange the current row with the row containing that nonzero element
+    > If there is no such element, the matrix is singular and the algorithm
+      fails
+  - Divide the current row by the element on the diagonal
+  - Replace all the rows below with the sum of that row and a multiple of the
+    current row (row i), so that each new element in column i, below row i, is
+    zero.
+*/
+
 int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
 
   int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
@@ -30,10 +45,10 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
   /* CREATE THE IDENTITY MATRIX */
   for (k = 0; k < m; k += 4) {
     for (j = 0; j < n; j++) {
-      pDstT1[k * m + j] = (uint32_t)(k == j);
-      pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j);
-      pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j);
-      pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j);
+      pDstT1[k * m + j] = (int32_t)(k == j);
+      pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j);
+      pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j);
+      pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j);
     }
   }
 
@@ -133,7 +148,7 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
     /* Pivot element of the row */
     in = *pPivotRowIn;
 
-    /* Loop over number of columns to the right of the pilot element */
+    /* Loop over columns to the right of the pivot element */
     j = 0;
     while (j < 4 * ((n - l) >> 2U)) {
       in1 = *pSrcT1;
@@ -155,6 +170,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
       *pSrcT1++ = FIX_DIV(in1, in);
       j++;
     }
+
+    /* Alternative = remainder of loop unrolling using switch-case */
     // switch ((n - l) % 4) {
     //    case 3:
     //        in1 = *pSrcT1;
@@ -181,7 +198,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
     //        *pSrcT1++ = out1;
     //        break;
     //}
-    /* Loop over number of columns of the destination matrix */
+
+    /* Loop over columns of the destination matrix */
     j = 0;
     while (j < 4 * (n >> 2U)) {
       in1 = *pSrcT2;
@@ -243,6 +261,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
           *pSrcT1++ = in1 - FIX_MUL(in, out1);
           j++;
         }
+
+        /* Alternative = remainder of loop unrolling using switch-case */
         // switch ((n - l) % 4) {
         //    case 3:
         //        in1 = *pSrcT1;
@@ -269,7 +289,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
         //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
         //        break;
         //}
-        /* Loop over the number of columns to
+
+        /* Loop over the columns to
            replace the elements in the destination matrix */
         j = 0;
         while (j < 4 * (n >> 2U)) {

From cc31b71293d323975a4c5d740c506a965b6ebb2d Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Fri, 26 May 2023 13:44:25 +0200
Subject: [PATCH 21/22] [software] Clean up

---
 software/apps/mat_inv/main.c                  | 107 +--
 .../runtime/kernel/mempool_mat_inv_q32p.h     | 699 +-----------------
 .../runtime/kernel/mempool_mat_inv_q32s.h     |  56 --
 3 files changed, 30 insertions(+), 832 deletions(-)

diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c
index c89edb079..7ada71ebb 100644
--- a/software/apps/mat_inv/main.c
+++ b/software/apps/mat_inv/main.c
@@ -17,7 +17,6 @@
 #define VERBOSE
 #define SINGLE
 // #define PARALLEL
-// #define MEMSIZED
 // #define FOLDED
 
 #include "initialization.h"
@@ -37,102 +36,52 @@ int32_t inv[M * M] __attribute__((aligned(N), section(".l1")));
 uint32_t flag __attribute__((section(".l1")));
 #endif
 
-// Driver program
-void single_core() {
+int main() {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   // Initialize barrier and synchronize
   mempool_barrier_init(core_id);
 
+/* initialize the data */
+#if defined(SINGLE) || defined(PARALLEL)
   init_matrix(matrix, N, M, -156, 427, -219, core_id);
   init_matrix_zeros(inv, M, M, core_id);
-  mempool_barrier(num_cores);
-
   if (core_id == 0) {
-    mempool_start_benchmark();
-    mempool_GJinv_q32s(matrix, inv, M);
-    mempool_stop_benchmark();
+    flag = 0U;
   }
   mempool_barrier(num_cores);
-#ifdef VERBOSE
-  if (core_id == 0)
-    display(inv, N, M);
-#endif
-  mempool_barrier(num_cores);
-}
-
-void multi_core() {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
 
+#elif defined(FOLDED)
+  uint32_t nPE = N_USED_BANKS >> 2U;
   init_matrix(matrix, N, M, -156, 427, -219, core_id);
-  init_matrix_zeros(inv, M, M, core_id);
+  init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
   if (core_id == 0) {
     flag = 0U;
   }
   mempool_barrier(num_cores);
 
-  if (core_id < MIN(NUM_CORES, N / 4)) {
-    mempool_start_benchmark();
-    mempool_GJinv_q32p(matrix, inv, M, &flag);
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-#ifdef VERBOSE
-  if (core_id == 0)
-    display(inv, M, N);
 #endif
-  mempool_barrier(num_cores);
-}
-
-void multi_core_memsized() {
 
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  init_matrix(matrix, N, M, -156, 427, -219, core_id);
-  init_matrix_zeros(inv, N, M, core_id);
+/* Execute the kernel */
+#if defined(SINGLE)
   if (core_id == 0) {
-    flag = 0U;
+    mempool_start_benchmark();
+    mempool_GJinv_q32s(matrix, inv, M);
+    mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
 
-  mempool_start_benchmark();
-  mempool_GJinv_memsized_q32p(matrix, inv, M, &flag);
-  mempool_stop_benchmark();
-
-  mempool_barrier(num_cores);
-#ifdef VERBOSE
-  if (core_id == 0)
-    display(inv, M, N);
-#endif
-  mempool_barrier(num_cores);
-}
-
-#ifdef FOLDED
-void multi_core_folded() {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t nPE = N_USED_BANKS >> 2U;
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  init_matrix(matrix, N, M, -156, 427, -219, core_id);
-  init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
-  init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
-  if (core_id == 0) {
-    flag = 0U;
-    __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED);
+#elif defined(PARALLEL)
+  if (core_id < MIN(NUM_CORES, N / 4)) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32p(matrix, inv, M, &flag);
+    mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
 
+#elif defined(FOLDED)
   mempool_start_benchmark();
   fold_matrix(matrix, folded_matrix, N);
   mempool_stop_benchmark();
@@ -142,23 +91,15 @@ void multi_core_folded() {
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
+
+#endif
+
+/* Display the result of computation */
 #ifdef VERBOSE
   if (core_id == 0)
-    display_folded(inv, M, N);
-#endif
+    display(inv, M, N);
   mempool_barrier(num_cores);
-}
 #endif
 
-int main() {
-#if defined(SINGLE)
-  single_core();
-#elif defined(PARALLEL)
-  multi_core();
-#elif defined(MEMSIZED)
-  multi_core_memsized();
-#elif defined(FOLDED)
-  multi_core_folded();
-#endif
   return 0;
 }
diff --git a/software/runtime/kernel/mempool_mat_inv_q32p.h b/software/runtime/kernel/mempool_mat_inv_q32p.h
index 42b26eb21..a937ae33e 100644
--- a/software/runtime/kernel/mempool_mat_inv_q32p.h
+++ b/software/runtime/kernel/mempool_mat_inv_q32p.h
@@ -6,8 +6,6 @@
 
 /* GAUSS JORDAN INVERSION */
 
-uint32_t volatile pivot_barrier __attribute__((section(".l1")));
-
 void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n);
 
 void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) {
@@ -270,77 +268,6 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     }
     mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
 
-    //        /* REPLACE ROWS */
-    //        pSrcT1 = pSrc;
-    //        pSrcT2 = pDst;
-    //        /* Loop over rows */
-    //        for (k = 0; k < m; k++) {
-    //            if (k != l) {
-    //                pSrcT1 = pSrc + k * n;
-    //                pSrcT2 = pDst + k * n;
-    //                /* Element of the reference row */
-    //                in = *pSrcT1;
-    //                pPRT_in = pPivotRowIn;
-    //                pPRT_pDst = pPivotRowDst;
-    //                /* Loop over columns to the right of pivot */
-    //                j = core_id * 4;
-    //                // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n
-    //                - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) {
-    //                    in1 = pSrcT1[j];
-    //                    in2 = pSrcT1[j + 1];
-    //                    in3 = pSrcT1[j + 2];
-    //                    in4 = pSrcT1[j + 3];
-    //                    out1 = pPRT_in[j];
-    //                    out2 = pPRT_in[j + 1];
-    //                    out3 = pPRT_in[j + 2];
-    //                    out4 = pPRT_in[j + 3];
-    //                    pSrcT1[j]     = in1 - FIX_MUL(in, out1);
-    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * num_cores;
-    //                }
-    //                if (core_id == (n >> 2U) - 1) {
-    //                    j = 4 * ((n - l) >> 2U);
-    //                    while (j < n - l) {
-    //                        in1 = pSrcT1[j];
-    //                        out1 = pPRT_in[j];
-    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                        j++;
-    //                    }
-    //                }
-    //                /* Loop over columns */
-    //                j = core_id * 4;
-    //                while (j < 4 * (n >> 2U)) {
-    //                    in1 = pSrcT2[j];
-    //                    in2 = pSrcT2[j + 1];
-    //                    in3 = pSrcT2[j + 2];
-    //                    in4 = pSrcT2[j + 3];
-    //                    out1 = pPRT_pDst[j];
-    //                    out2 = pPRT_pDst[j + 1];
-    //                    out3 = pPRT_pDst[j + 2];
-    //                    out4 = pPRT_pDst[j + 3];
-    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * num_cores;
-    //                }
-    //                if (core_id == (n >> 2U) - 1) {
-    //                    j = 4 * (n >> 2U);
-    //                    while (j < n) {
-    //                        in1 = pSrcT2[j];
-    //                        out1 = pPRT_pDst[j];
-    //                        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-    //                        j++;
-    //                    }
-    //                }
-    //                mempool_log_partial_barrier(2, core_id, MIN(num_cores, n /
-    //                4));
-    //            }
-    //        }
-    //        mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4));
-
     pSrc++;    /* Increment the input pointer */
     loopCnt--; /* Decrement the loop counter */
     l++;       /* Increment the index modifier */
@@ -357,623 +284,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
   return 0;
 }
 
-int mempool_GJinv_memsized_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
-                                uint32_t *flag) {
-
-  int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */
-  int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */
-  int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */
-  int32_t *pPRT_in, *pPivotRowDst,
-      *pPRT_pDst; /* Temporary input and output data matrix pointer */
-
-  int32_t in = 0;
-  int32_t Xchg1, Xchg2, Xchg3, Xchg4;
-  int32_t in1, in2, in3, in4;
-  int32_t out1, out2, out3, out4;
-
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t core_id = absolute_core_id;
-  uint32_t j, k, l; /* loop counters */
-  uint32_t m =
-      n; /* M is the number of rows. However, the matirces must be square. */
-
-  /* CREATE THE IDENTITY MATRIX */
-
-  pDstT1 = pDst;
-  for (k = core_id * 4; k < m; k += num_cores * 4) {
-    for (j = 0; j < n; j++) {
-      pDstT1[k * n + j] = (int32_t)(k == j);
-      pDstT1[(k + 1) * n + j] = (int32_t)((k + 1) == j);
-      pDstT1[(k + 2) * n + j] = (int32_t)((k + 2) == j);
-      pDstT1[(k + 3) * n + j] = (int32_t)((k + 3) == j);
-    }
-  }
-  //    pDstT1 = pDst;
-  //    for (i = absolute_core_id * 4; i < n * m; i += num_cores * 4) {
-  //        k = i / n;
-  //        j = i % n;
-  //        pDstT1[k * n + j] = (uint32_t) (k == j);
-  //        pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1));
-  //        pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2));
-  //        pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3));
-  //    }
-  //    mempool_log_barrier(2, absolute_core_id);
-
-  /* Index modifier to navigate through the columns */
-  l = 0U;
-  while (l < n) {
-
-    pSrcT1 = pSrc + (l * n);
-    pDstT1 = pDst + (l * n);
-    in = *pSrcT1;
-
-    /* CHECK IF PIVOT ELEMENT IS ZERO */
-    if (absolute_core_id == 0) {
-      if (in == 0U) {
-        /* Loop over the rows present below */
-        for (k = l + 1U; k < m; k++) {
-          pSrcT2 = pSrc + (n * k);
-          pDstT2 = pDst + (n * k);
-          /* EXCHANGE */
-          if (*pSrcT2 != 0) {
-            /* Loop over colums to the right of the pivot */
-            j = 0;
-            while (j < 4 * ((n - l) >> 2U)) {
-              Xchg1 = pSrcT2[j];
-              Xchg2 = pSrcT2[j + 1];
-              Xchg3 = pSrcT2[j + 2];
-              Xchg4 = pSrcT2[j + 3];
-              out1 = pSrcT1[j];
-              out2 = pSrcT1[j + 1];
-              out3 = pSrcT1[j + 2];
-              out4 = pSrcT1[j + 3];
-              pSrcT2[j] = out1;
-              pSrcT2[j + 1] = out2;
-              pSrcT2[j + 2] = out3;
-              pSrcT2[j + 3] = out4;
-              pSrcT1[j] = Xchg1;
-              pSrcT1[j + 1] = Xchg2;
-              pSrcT1[j + 2] = Xchg3;
-              pSrcT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n - l) {
-              Xchg1 = pSrcT2[j];
-              pSrcT2[j] = pSrcT1[j];
-              pSrcT1[j] = Xchg1;
-              j++;
-            }
-            /* Loop over colums */
-            j = 0;
-            while (j < 4 * (n >> 2U)) {
-              Xchg1 = pDstT2[j];
-              Xchg2 = pDstT2[j + 1];
-              Xchg3 = pDstT2[j + 2];
-              Xchg4 = pDstT2[j + 3];
-              out1 = pDstT1[j];
-              out2 = pDstT1[j + 1];
-              out3 = pDstT1[j + 2];
-              out4 = pDstT1[j + 3];
-              pDstT2[j] = out1;
-              pDstT2[j + 1] = out2;
-              pDstT2[j + 2] = out3;
-              pDstT2[j + 3] = out4;
-              pDstT1[j] = Xchg1;
-              pDstT1[j + 1] = Xchg2;
-              pDstT1[j + 2] = Xchg3;
-              pDstT1[j + 3] = Xchg4;
-              j += 4;
-            }
-            while (j < n) {
-              Xchg1 = pDstT2[j];
-              pDstT2[j] = pDstT1[j];
-              pDstT1[j] = Xchg1;
-              j++;
-            }
-            *flag = 1U;
-            break;
-          }
-        }
-      }
-      /* Update the status if the matrix is singular */
-      if ((*flag == 0U) && (in == 0U)) {
-        return 1;
-      }
-    }
-    mempool_log_barrier(2, absolute_core_id);
-
-    /* DIVIDE BY THE PIVOT */
-    /* Points to the pivot row of input and destination matrices */
-    pPivotRowIn = pSrc + (l * n);
-    pPivotRowDst = pDst + (l * n);
-    /* Temporary pointers to the pivot row pointers */
-    pSrcT1 = pPivotRowIn;
-    pSrcT2 = pPivotRowDst;
-    /* Pivot element of the row */
-    in = *pPivotRowIn;
-    /* Loop over columns to the right of pivot */
-    core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U);
-    core_id = core_id > num_cores ? core_id + num_cores : core_id;
-    // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) {
-    //    in1 = pSrcT1[j];
-    //    in2 = pSrcT1[j + 1];
-    //    in3 = pSrcT1[j + 2];
-    //    in4 = pSrcT1[j + 3];
-    //    out1 = FIX_DIV(in1, in);
-    //    out2 = FIX_DIV(in2, in);
-    //    out3 = FIX_DIV(in3, in);
-    //    out4 = FIX_DIV(in4, in);
-    //    pSrcT1[j] = out1;
-    //    pSrcT1[j + 1] = out2;
-    //    pSrcT1[j + 2] = out3;
-    //    pSrcT1[j + 3] = out4;
-    //}
-    // if (core_id == 0) {
-    //    j = 4 * ((n - l) >> 2U);
-    //    while (j < n - l) {
-    //        in1 = pSrcT1[j];
-    //        pSrcT1[j] = FIX_DIV(in1, in);
-    //        j++;
-    //    }
-    //}
-    if (core_id == 0) {
-      j = 0;
-      while (j < 4 - l % 4) {
-        in1 = pSrcT1[j];
-        pSrcT1[j] = FIX_DIV(in1, in);
-        j++;
-      }
-    } else {
-      j = core_id * 4 - l % 4;
-      if (j < (n - l)) {
-        in1 = pSrcT1[j];
-        in2 = pSrcT1[j + 1];
-        in3 = pSrcT1[j + 2];
-        in4 = pSrcT1[j + 3];
-        out1 = FIX_DIV(in1, in);
-        out2 = FIX_DIV(in2, in);
-        out3 = FIX_DIV(in3, in);
-        out4 = FIX_DIV(in4, in);
-        pSrcT1[j] = out1;
-        pSrcT1[j + 1] = out2;
-        pSrcT1[j + 2] = out3;
-        pSrcT1[j + 3] = out4;
-      }
-    }
-    /* Loop over columns */
-    core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U);
-    core_id = core_id > num_cores ? core_id + num_cores : core_id;
-    for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) {
-      in1 = pSrcT2[j];
-      in2 = pSrcT2[j + 1];
-      in3 = pSrcT2[j + 2];
-      in4 = pSrcT2[j + 3];
-      out1 = FIX_DIV(in1, in);
-      out2 = FIX_DIV(in2, in);
-      out3 = FIX_DIV(in3, in);
-      out4 = FIX_DIV(in4, in);
-      pSrcT2[j] = out1;
-      pSrcT2[j + 1] = out2;
-      pSrcT2[j + 2] = out3;
-      pSrcT2[j + 3] = out4;
-    }
-    // if (core_id == (n >> 2U) - 1) {
-    //    j = 4 * (n >> 2U);
-    //    while (j < n) {
-    //        in1 = pSrcT2[j];
-    //        pSrcT2[j] = FIX_DIV(in1, in);
-    //        j++;
-    //    }
-    //}
-    mempool_log_barrier(2, absolute_core_id);
-
-    /* REPLACE ROWS */
-    pSrcT1 = pSrc;
-    pSrcT2 = pDst;
-    for (k = absolute_core_id / (n >> 2U); k < m; k += num_cores / (n >> 2U)) {
-      /* Only the columns to the right of the pivot are to be processed */
-      if (k != l) {
-        pSrcT1 = pSrc + k * n;
-        pSrcT2 = pDst + k * n;
-        /* Element of the reference row */
-        in = *pSrcT1;
-        /* Reference row pointers */
-        pPRT_in = pPivotRowIn;
-        pPRT_pDst = pPivotRowDst;
-        /* Loop over the columns */
-        core_id = absolute_core_id % (n >> 2U);
-        core_id = core_id - (l >> 2U);
-        j = core_id * 4;
-        while (j < 4 * ((n - l) >> 2U)) {
-          out1 = pPRT_in[j];
-          out2 = pPRT_in[j + 1];
-          out3 = pPRT_in[j + 2];
-          out4 = pPRT_in[j + 3];
-          in1 = pSrcT1[j];
-          in2 = pSrcT1[j + 1];
-          in3 = pSrcT1[j + 2];
-          in4 = pSrcT1[j + 3];
-          pSrcT1[j] = in1 - FIX_MUL(in, out1);
-          pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-          pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-          pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-          j += 4 * (n >> 2U);
-        }
-        if (core_id == 0) {
-          j = 4 * ((n - l) >> 2U);
-          while (j < n - l) {
-            in1 = pSrcT1[j];
-            out1 = pPRT_in[j];
-            pSrcT1[j] = in1 - FIX_MUL(in, out1);
-            j++;
-          }
-        }
-        /* Loop over the columns */
-        core_id = absolute_core_id % (n >> 2U);
-        j = core_id * 4;
-        while (j < 4 * (n >> 2U)) {
-          out1 = pPRT_pDst[j];
-          out2 = pPRT_pDst[j + 1];
-          out3 = pPRT_pDst[j + 2];
-          out4 = pPRT_pDst[j + 3];
-          in1 = pSrcT2[j];
-          in2 = pSrcT2[j + 1];
-          in3 = pSrcT2[j + 2];
-          in4 = pSrcT2[j + 3];
-          pSrcT2[j] = in1 - FIX_MUL(in, out1);
-          pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-          pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-          pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-          j += 4 * (n >> 2U);
-        }
-        // if (core_id == (n >> 2U) - 1) {
-        //    j = 4 * (n >> 2U);
-        //    while (j < n) {
-        //        in1 = pSrcT2[j];
-        //        out1 = pPRT_pDst[j];
-        //        pSrcT2[j] = in1 - FIX_MUL(in, out1);
-        //        j++;
-        //    }
-        //}
-        // uint32_t core_id_in;
-        // uint32_t core_id_Dst;
-        // int32_t p1_in, p2_in, p3_in, p4_in;
-        // int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst;
-        // core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U);
-        // core_id_Dst = absolute_core_id % (n >> 2U);
-        // j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4);
-        // i = core_id_Dst * 4;
-        // p1_in = pPRT_in[j];
-        // p2_in = pPRT_in[j + 1];
-        // p3_in = pPRT_in[j + 2];
-        // p4_in = pPRT_in[j + 3];
-        // p1_Dst = pPRT_pDst[i];
-        // p2_Dst = pPRT_pDst[i + 1];
-        // p3_Dst = pPRT_pDst[i + 2];
-        // p4_Dst = pPRT_pDst[i + 3];
-        // if(core_id_in == 0) {
-        //    switch (4 - l % 4) {
-        //        case (1):
-        //            in1 = pSrcT1[j];
-        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-        //            break;
-        //        case (2):
-        //            in1 = pSrcT1[j];
-        //            in2 = pSrcT1[j + 1];
-        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-        //            break;
-        //        case (3):
-        //            in1 = pSrcT1[j];
-        //            in2 = pSrcT1[j + 1];
-        //            in3 = pSrcT1[j + 2];
-        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-        //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-        //            break;
-        //        case (4):
-        //            in1 = pSrcT1[j];
-        //            in2 = pSrcT1[j + 1];
-        //            in3 = pSrcT1[j + 2];
-        //            in4 = pSrcT1[j + 3];
-        //            pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-        //            pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-        //            pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-        //            pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
-        //            break;
-        //    }
-        //} else {
-        //    in1 = pSrcT1[j];
-        //    in2 = pSrcT1[j + 1];
-        //    in3 = pSrcT1[j + 2];
-        //    in4 = pSrcT1[j + 3];
-        //    pSrcT1[j] = in1 - FIX_MUL(in, p1_in);
-        //    pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in);
-        //    pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in);
-        //    pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in);
-        //}
-        // in1 = pSrcT2[i];
-        // in2 = pSrcT2[i + 1];
-        // in3 = pSrcT2[i + 2];
-        // in4 = pSrcT2[i + 3];
-        // pSrcT2[i]     = in1 - FIX_MUL(in, p1_Dst);
-        // pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst);
-        // pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst);
-        // pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst);
-      }
-    }
-    mempool_log_barrier(2, absolute_core_id);
-
-    //        /* REPLACE ROWS */
-    //        pSrcT1 = pSrc;
-    //        pSrcT2 = pDst;
-    //        /* Reference row pointers */
-    //        pPRT_in = pSrc + (l * n);
-    //        pPRT_pDst = pDst + (l * n);
-    //        int32_t pivot = *pPRT_in;
-    //        uint32_t nPE = (n >> 2U);
-    //        uint32_t check = 0;
-    //        if (absolute_core_id >= m * nPE)
-    //            mempool_wfi();
-    //        for (k = absolute_core_id / nPE; k < m; k += num_cores / nPE) {
-    //            /* Only the columns to the right of the pivot are to be
-    //            processed */ if (k != l) {
-    //                pSrcT1 = pSrc + k * n;
-    //                pSrcT2 = pDst + k * n;
-    //                /* Element of the reference row */
-    //                in = *pSrcT1;
-    //                /* Loop over the columns */
-    //                core_id = absolute_core_id % nPE;
-    //                core_id = core_id - (l >> 2U);
-    //                j = core_id * 4;
-    //                while (j < 4 * ((n - l) >> 2U)) {
-    //                    out1 = pPRT_in[j];
-    //                    out2 = pPRT_in[j + 1];
-    //                    out3 = pPRT_in[j + 2];
-    //                    out4 = pPRT_in[j + 3];
-    //                    out1 = FIX_DIV(out1, pivot);
-    //                    out2 = FIX_DIV(out2, pivot);
-    //                    out3 = FIX_DIV(out3, pivot);
-    //                    out4 = FIX_DIV(out4, pivot);
-    //                    in1 = pSrcT1[j];
-    //                    in2 = pSrcT1[j + 1];
-    //                    in3 = pSrcT1[j + 2];
-    //                    in4 = pSrcT1[j + 3];
-    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * (n >> 2U);
-    //                }
-    //                if (core_id == 0) {
-    //                    j = 4 * ((n - l) >> 2U);
-    //                    while (j < n - l) {
-    //                        out1 = pPRT_in[j];
-    //                        out1 = FIX_DIV(out1, pivot);
-    //                        in1 = pSrcT1[j];
-    //                        pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                        j++;
-    //                    }
-    //                }
-    //                /* Loop over the columns */
-    //                core_id = absolute_core_id % nPE;
-    //                j = core_id * 4;
-    //                while (j < 4 * (n >> 2U)) {
-    //                    out1 = pPRT_pDst[j];
-    //                    out2 = pPRT_pDst[j + 1];
-    //                    out3 = pPRT_pDst[j + 2];
-    //                    out4 = pPRT_pDst[j + 3];
-    //                    out1 = FIX_DIV(out1, pivot);
-    //                    out2 = FIX_DIV(out2, pivot);
-    //                    out3 = FIX_DIV(out3, pivot);
-    //                    out4 = FIX_DIV(out4, pivot);
-    //                    in1 = pSrcT2[j];
-    //                    in2 = pSrcT2[j + 1];
-    //                    in3 = pSrcT2[j + 2];
-    //                    in4 = pSrcT2[j + 3];
-    //                    pSrcT2[j]     = in1 - FIX_MUL(in, out1);
-    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4 * nPE;
-    //                }
-    //                __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED);
-    //                mempool_wfi();
-    //            } else {
-    //                do {
-    //                    check = __atomic_fetch_add(&pivot_barrier, 0,
-    //                    __ATOMIC_RELAXED); mempool_wait(20);
-    //                } while (check < ((m - 1) * nPE));
-    //                /* Loop over the columns */
-    //                core_id = absolute_core_id % (n >> 2U);
-    //                core_id = core_id - (l >> 2U);
-    //                j = core_id * 4;
-    //                while (j < 4 * ((n - l) >> 2U)) {
-    //                    in1 = pPRT_in[j];
-    //                    in2 = pPRT_in[j + 1];
-    //                    in3 = pPRT_in[j + 2];
-    //                    in4 = pPRT_in[j + 3];
-    //                    out1 = FIX_DIV(in1, pivot);
-    //                    out2 = FIX_DIV(in2, pivot);
-    //                    out3 = FIX_DIV(in3, pivot);
-    //                    out4 = FIX_DIV(in4, pivot);
-    //                    pPRT_in[j] = out1;
-    //                    pPRT_in[j + 1] = out2;
-    //                    pPRT_in[j + 2] = out3;
-    //                    pPRT_in[j + 3] = out4;
-    //                    j += 4 * (n >> 2U);
-    //                }
-    //                if (core_id == 0) {
-    //                    j = 4 * ((n - l) >> 2U);
-    //                    while (j < n - l) {
-    //                        in1 = pPRT_in[j];
-    //                        pPRT_in[j] = FIX_DIV(in1, pivot);
-    //                        j++;
-    //                    }
-    //                }
-    //                /* Loop over the columns */
-    //                core_id = absolute_core_id % (n >> 2U);
-    //                j = core_id * 4;
-    //                while (j < 4 * (n >> 2U)) {
-    //                    in1 = pPRT_pDst[j];
-    //                    in2 = pPRT_pDst[j + 1];
-    //                    in3 = pPRT_pDst[j + 2];
-    //                    in4 = pPRT_pDst[j + 3];
-    //                    out1 = FIX_DIV(in1, pivot);
-    //                    out2 = FIX_DIV(in2, pivot);
-    //                    out3 = FIX_DIV(in3, pivot);
-    //                    out4 = FIX_DIV(in4, pivot);
-    //                    pPRT_pDst[j] = out1;
-    //                    pPRT_pDst[j + 1] = out2;
-    //                    pPRT_pDst[j + 2] = out3;
-    //                    pPRT_pDst[j + 3] = out4;
-    //                    j += 4 * (n >> 2U);
-    //                }
-    //                if (core_id == (n >> 2U) - 1) {
-    //                    j = 4 * (n >> 2U);
-    //                    while (j < n) {
-    //                        in1 = pPRT_pDst[j];
-    //                        pPRT_pDst[j] = FIX_DIV(in1, pivot);
-    //                        j++;
-    //                    }
-    //                }
-    //                if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1,
-    //                __ATOMIC_RELAXED)) {
-    //                    __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED);
-    //                    __sync_synchronize();
-    //                    wake_up_all();
-    //                }
-    //                mempool_wfi();
-    //            }
-    //        }
-
-    //        /* REPLACE ROWS */
-    //        pSrcT1 = pSrc;
-    //        pSrcT2 = pDst;
-    //        for (i = absolute_core_id * 4; i < (n * m); i += num_cores * 4) {
-    //            k = i / n;
-    //            if (k != l) {
-    //                in = *(pSrc + k * n);
-    //                j = i - (k * n);
-    //                if (j >= 4 * (l >> 2U)) {
-    //                    if (j == 4 * (l >> 2U)) {
-    //                        pSrcT1 = pSrc + k * n;
-    //                        pPRT_in = pPivotRowIn;
-    //                        uint32_t bound = j + 4 - l;
-    //                        j = 0;
-    //                        while (j < bound) {
-    //                            in1 = *pSrcT1;
-    //                            out1 = *pPRT_in++;
-    //                            *pSrcT1++ = in1 - FIX_MUL(in, out1);
-    //                            j++;
-    //                        }
-    //                    } else {
-    //                        pSrcT1 = pSrc + (i - l);
-    //                        pPRT_in = pPivotRowIn + (j - l);
-    //                        in1 = *pSrcT1;
-    //                        in2 = *(pSrcT1 + 1);
-    //                        in3 = *(pSrcT1 + 2);
-    //                        in4 = *(pSrcT1 + 3);
-    //                        out1 = *pPRT_in++;
-    //                        out2 = *pPRT_in++;
-    //                        out3 = *pPRT_in++;
-    //                        out4 = *pPRT_in++;
-    //                        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-    //                        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-    //                        *pSrcT1++ = in3 - FIX_MUL(in, out3);
-    //                        *pSrcT1++ = in4 - FIX_MUL(in, out4);
-    //                    }
-    //                }
-    //                pSrcT2 = pDst + i;
-    //                pPRT_pDst = pPivotRowDst + j;
-    //                in1 = *pSrcT2;
-    //                in2 = *(pSrcT2 + 1);
-    //                in3 = *(pSrcT2 + 2);
-    //                in4 = *(pSrcT2 + 3);
-    //                out1 = *pPRT_pDst++;
-    //                out2 = *pPRT_pDst++;
-    //                out3 = *pPRT_pDst++;
-    //                out4 = *pPRT_pDst++;
-    //                *pSrcT2++ = in1 - FIX_MUL(in, out1);
-    //                *pSrcT2++ = in2 - FIX_MUL(in, out2);
-    //                *pSrcT2++ = in3 - FIX_MUL(in, out3);
-    //                *pSrcT2++ = in4 - FIX_MUL(in, out4);
-    //            }
-    //        }
-    //        mempool_log_barrier(2, absolute_core_id);
-    //        /* REPLACE ROWS */
-    //        pSrcT1 = pSrc;
-    //        pSrcT2 = pDst;
-    //        core_id = absolute_core_id;
-    //        for (k = core_id; k < m; k += num_cores) {
-    //            /* Only the columns to the right of the pivot are to be
-    //            processed */ if (k != l) {
-    //                pSrcT1 = pSrc + k * n;
-    //                pSrcT2 = pDst + k * n;
-    //                /* Element of the reference row */
-    //                in = *pSrcT1;
-    //                /* Reference row pointers */
-    //                pPRT_in = pPivotRowIn;
-    //                pPRT_pDst = pPivotRowDst;
-    //                /* Loop over the columns */
-    //                j = 0;
-    //                while (j < 4 * ((n - l) >> 2U)) {
-    //                    in1 = pSrcT1[j];
-    //                    in2 = pSrcT1[j + 1];
-    //                    in3 = pSrcT1[j + 2];
-    //                    in4 = pSrcT1[j + 3];
-    //                    out1 = pPRT_in[j];
-    //                    out2 = pPRT_in[j + 1];
-    //                    out3 = pPRT_in[j + 2];
-    //                    out4 = pPRT_in[j + 3];
-    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                    pSrcT1[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT1[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT1[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4;
-    //                }
-    //                while (j < n - l) {
-    //                    in1 = pSrcT1[j];
-    //                    out1 = pPRT_in[j];
-    //                    pSrcT1[j] = in1 - FIX_MUL(in, out1);
-    //                    j++;
-    //                }
-    //                /* Loop over the columns */
-    //                j = 0;
-    //                while (j < 4 * (n >> 2U)) {
-    //                    in1 = pSrcT2[j];
-    //                    in2 = pSrcT2[j + 1];
-    //                    in3 = pSrcT2[j + 2];
-    //                    in4 = pSrcT2[j + 3];
-    //                    out1 = pPRT_pDst[j];
-    //                    out2 = pPRT_pDst[j + 1];
-    //                    out3 = pPRT_pDst[j + 2];
-    //                    out4 = pPRT_pDst[j + 3];
-    //                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
-    //                    pSrcT2[j + 1] = in2 - FIX_MUL(in, out2);
-    //                    pSrcT2[j + 2] = in3 - FIX_MUL(in, out3);
-    //                    pSrcT2[j + 3] = in4 - FIX_MUL(in, out4);
-    //                    j += 4;
-    //                }
-    //                while (j < n) {
-    //                    in1 = pSrcT2[j];
-    //                    out1 = pPRT_pDst[j];
-    //                    pSrcT2[j] = in1 - FIX_MUL(in, out1);
-    //                    j++;
-    //                }
-    //            }
-    //        }
-    //        mempool_log_barrier(2, absolute_core_id);
-
-    pSrc++; /* Increment the input pointer */
-    l++;    /* Increment the index modifier */
-  }
-  mempool_log_barrier(2, absolute_core_id);
-
-  return 0;
-}
-
+/* The input matrix is folded in memory, to have only local accesses */
 int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
                               uint32_t *flag, uint32_t nPE) {
 
@@ -991,9 +302,10 @@ int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
   uint32_t absolute_core_id = mempool_get_core_id();
   uint32_t core_id = absolute_core_id;
   uint32_t shift = 0;
-  uint32_t i, j, k, l; /* loop counters */
-  uint32_t m =
-      n; /* M is the number of rows. However, the matrices must be square. */
+  /* loop counters */
+  uint32_t i, j, k, l;
+  /* M is the number of rows. However, the matrices must be square. */
+  uint32_t m = n;
 
   /* CREATE THE IDENTITY MATRIX */
   pDstT1 = pDst;
@@ -1018,6 +330,7 @@ int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n,
     in = *pSrcT1;
 
     /* CHECK IF PIVOT ELEMENT IS ZERO */
+    // This is done by a single core
     if (absolute_core_id == 0) {
       if (in == 0U) {
         /* Loop over the rows present below */
diff --git a/software/runtime/kernel/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h
index 0d4c77c7a..ce84de24e 100644
--- a/software/runtime/kernel/mempool_mat_inv_q32s.h
+++ b/software/runtime/kernel/mempool_mat_inv_q32s.h
@@ -171,34 +171,6 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
       j++;
     }
 
-    /* Alternative = remainder of loop unrolling using switch-case */
-    // switch ((n - l) % 4) {
-    //    case 3:
-    //        in1 = *pSrcT1;
-    //        in2 = *(pSrcT1 + 1);
-    //        in3 = *(pSrcT1 + 2);
-    //        out1 = FIX_DIV(in1, in);
-    //        out2 = FIX_DIV(in2, in);
-    //        out3 = FIX_DIV(in3, in);
-    //        *pSrcT1++ = out1;
-    //        *pSrcT1++ = out2;
-    //        *pSrcT1++ = out3;
-    //        break;
-    //    case 2:
-    //        in1 = *pSrcT1;
-    //        in2 = *(pSrcT1 + 1);
-    //        out1 = FIX_DIV(in1, in);
-    //        out2 = FIX_DIV(in2, in);
-    //        *pSrcT1++ = out1;
-    //        *pSrcT1++ = out2;
-    //        break;
-    //    case 1:
-    //        in1 = *pSrcT1;
-    //        out1 = FIX_DIV(in1, in);
-    //        *pSrcT1++ = out1;
-    //        break;
-    //}
-
     /* Loop over columns of the destination matrix */
     j = 0;
     while (j < 4 * (n >> 2U)) {
@@ -262,34 +234,6 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) {
           j++;
         }
 
-        /* Alternative = remainder of loop unrolling using switch-case */
-        // switch ((n - l) % 4) {
-        //    case 3:
-        //        in1 = *pSrcT1;
-        //        in2 = *(pSrcT1 + 1);
-        //        in3 = *(pSrcT1 + 2);
-        //        out1 = *pPRT_in++;
-        //        out2 = *pPRT_in++;
-        //        out3 = *pPRT_in++;
-        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-        //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-        //        *pSrcT1++ = in3 - FIX_MUL(in, out3);
-        //        break;
-        //    case 2:
-        //        in1 = *pSrcT1;
-        //        in2 = *(pSrcT1 + 1);
-        //        out1 = *pPRT_in++;
-        //        out2 = *pPRT_in++;
-        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-        //        *pSrcT1++ = in2 - FIX_MUL(in, out2);
-        //        break;
-        //    case 1:
-        //        in1 = *pSrcT1;
-        //        out1 = *pPRT_in++;
-        //        *pSrcT1++ = in1 - FIX_MUL(in, out1);
-        //        break;
-        //}
-
         /* Loop over the columns to
            replace the elements in the destination matrix */
         j = 0;

From c04dea31421a5c210f037540901c26d0000714d6 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Fri, 26 May 2023 15:10:37 +0200
Subject: [PATCH 22/22] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f9c9660fd..c78894eab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Use custom compiler for VCS specified with `CC` and `CCX` environment variable
 - Implement operand gating for SIMD and MAC Units in Snitch IPU's DSP Unit
 - Add Channel Estimation application and kernels
+- Add Gauss-Jordan matrix inversion kernel
 
 ### Fixed
 - Fix type issue in `snitch_addr_demux`