From 03a8625e36329e539ef807143d130f06f3195eca Mon Sep 17 00:00:00 2001
From: Marco Bertuletti
Date: Wed, 13 Jul 2022 18:39:21 +0200
Subject: [PATCH 01/22] [software] Add Moore Penrose inversion kernel

---
 .../apps/MP_matrix_inverse/initialization.h   |  65 +++
 software/apps/MP_matrix_inverse/inverse.h     | 390 ++++++++++++++++++
 software/apps/MP_matrix_inverse/main.c        |  82 ++++
 3 files changed, 537 insertions(+)
 create mode 100644 software/apps/MP_matrix_inverse/initialization.h
 create mode 100644 software/apps/MP_matrix_inverse/inverse.h
 create mode 100644 software/apps/MP_matrix_inverse/main.c

diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h
new file mode 100644
index 000000000..e7e834de9
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/initialization.h
@@ -0,0 +1,65 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores);
+
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows,
+                       uint32_t num_columns, uint32_t core_id,
+                       uint32_t num_cores);
+
+// Fill the row-major num_rows x num_columns matrix with
+// matrix[i][j] = a * i + b * j + c, splitting the work across cores.
+// The smaller dimension (columns on a tie) is cut into `split` equal bands,
+// selected by core_id % split; along the other dimension each core starts at
+// core_id / split and advances by num_cores / split.
+// NOTE(review): elements past a multiple of `split` in the banded dimension
+// are never written, and num_cores < split makes the stride zero -- confirm
+// that callers only use sizes and core counts that are multiples of 8.
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
+                 uint32_t num_cores) {
+  uint32_t const split = 8; // How many rows/columns to split the matrix into
+  if (num_columns > num_rows) {
+    // Parallelize over columns
+    uint32_t const c_start = (num_rows / split) * (core_id % split);
+    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
+    for (uint32_t j = (core_id / split); j < num_columns;
+         j += (num_cores / split)) {
+      for (uint32_t i = c_start; i < c_end; ++i) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  } else {
+    // Parallelize over rows
+    uint32_t const c_start = (num_columns / split) * (core_id % split);
+    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
+    for (uint32_t i = (core_id / split); i < num_rows;
+         i += (num_cores / split)) {
+      for (uint32_t j = c_start; j < c_end; ++j) {
+        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
+      }
+    }
+  }
+}
+
+// Zero the row-major num_rows x num_columns matrix. Only core 0 writes;
+// all other cores return without touching the matrix. num_cores is unused.
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows,
+                       uint32_t num_columns, uint32_t core_id,
+                       uint32_t num_cores) {
+  (void)num_cores;
+  if (core_id != 0) {
+    return;
+  }
+  for (uint32_t i = 0; i < num_rows; i++) {
+    for (uint32_t j = 0; j < num_columns; j++) {
+      // Row stride of a row-major matrix is num_columns, not num_rows.
+      matrix[i * num_columns + j] = 0;
+    }
+  }
+}
diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h
new file mode 100644
index 000000000..19dfc9b1e
--- /dev/null
+++ b/software/apps/MP_matrix_inverse/inverse.h
@@ -0,0 +1,390 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n); + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n); + +void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n); + +int32_t determinant(int32_t *A, int32_t n); + +void adjoint(int32_t *A,int32_t *adj, int32_t n); + +int32_t inverse(int32_t *A, int32_t *inverse, int32_t n); + +int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n); + + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + t_matrix[j * n + i]=matrix[i * n + j]; + } + } +} + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) { + int k; + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { // not j 0U) { + /* Writing all zeroes in lower triangle of the destination matrix */ + j = m - rowCnt; + while (j > 0U) { + *pDstT1++ = 0; + j--; + } + + /* Writing all ones in the diagonal of the destination matrix */ + *pDstT1++ = 1; + + /* Writing all zeroes in upper triangle of the destination matrix */ + j = rowCnt - 1U; + while (j > 0U) { + *pDstT1++ = 0; + j--; + } + + /* Decrement loop counter */ + rowCnt--; + } + + /* Loop over the number of columns of the input matrix. + All the elements in each column are processed by the row operations */ + loopCnt = n; + + /* Index modifier to navigate through the columns */ + l = 0U; + + while (loopCnt > 0U) { + /* Check if the pivot element is zero.. + * If it is zero then interchange the row with non zero row below. + * If there is no non zero element to replace in the rows below, + * then the matrix is Singular. 
*/ + + /* Working pointer for the input matrix that points + * to the pivot element of the particular row */ + pSrcT1 = pSrc + (l * n); + + /* Working pointer for the destination matrix that points + * to the pivot element of the particular row */ + pDstT1 = pDst + (l * n); + + /* Temporary variable to hold the pivot value */ + in = *pSrcT1; + + /* Destination pointer modifier */ + k = 1U; + + /* Check if the pivot element is zero */ + if (*pSrcT1 == 0) { + /* Loop over the number rows present below */ + + for (i = (l + 1U); i < m; i++) { + /* Update the input and destination pointers */ + pSrcT2 = pSrcT1 + (n * i); + pDstT2 = pDstT1 + (n * k); + + /* Check if there is a non zero pivot element to + * replace in the rows below */ + if (*pSrcT2 != 0) { + /* Loop over number of columns + * to the right of the pilot element */ + j = n - l; + + while (j > 0U) { + /* Exchange the row elements of the input matrix */ + Xchg = *pSrcT2; + *pSrcT2++ = *pSrcT1; + *pSrcT1++ = Xchg; + + /* Decrement the loop counter */ + j--; + } + + /* Loop over number of columns of the destination matrix */ + j = n; + + while (j > 0U) { + /* Exchange the row elements of the destination matrix */ + Xchg = *pDstT2; + *pDstT2++ = *pDstT1; + *pDstT1++ = Xchg; + + /* Decrement loop counter */ + j--; + } + + /* Flag to indicate whether exchange is done or not */ + flag = 1U; + + /* Break after exchange is done */ + break; + } + + /* Update the destination pointer modifier */ + k++; + + /* Decrement loop counter */ + } + } + + /* Update the status if the matrix is singular */ + if ((flag != 1U) && (in == 0)) { + return 1; + } + + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + + /* Pivot element of the row */ + in = *pPivotRowIn; + + /* Loop over number of columns + * to the right of the pilot element */ + j = (n - 
l); + + while (j > 0U) { + /* Divide each element of the row of the input matrix + * by the pivot element */ + in1 = *pSrcT1; + *pSrcT1++ = in1 / in; + + /* Decrement the loop counter */ + j--; + } + + /* Loop over number of columns of the destination matrix */ + j = n; + + while (j > 0U) { + /* Divide each element of the row of the destination matrix + * by the pivot element */ + in1 = *pSrcT2; + *pSrcT2++ = in1 / in; + + /* Decrement the loop counter */ + j--; + } + + /* Replace the rows with the sum of that row and a multiple of row i + * so that each new element in column i above row i is zero.*/ + + /* Temporary pointers for input and destination matrices */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + + /* index used to check for pivot element */ + i = 0U; + + /* Loop over number of rows */ + /* to be replaced by the sum of that row and a multiple of row i */ + k = m; + + while (k > 0U) { + /* Check for the pivot element */ + if (i == l) { + /* If the processing element is the pivot element, + only the columns to the right are to be processed */ + pSrcT1 += n - l; + + pSrcT2 += n; + } else { + /* Element of the reference row */ + in = *pSrcT1; + + /* Working pointers for input and destination pivot rows */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + + /* Loop over the number of columns to the right of the pivot element, + to replace the elements in the input matrix */ + j = (n - l); + + while (j > 0U) { + /* Replace the element by the sum of that row + and a multiple of the reference row */ + in1 = *pSrcT1; + *pSrcT1++ = in1 - (in * *pPRT_in++); + + /* Decrement the loop counter */ + j--; + } + + /* Loop over the number of columns to + replace the elements in the destination matrix */ + j = n; + + while (j > 0U) { + /* Replace the element by the sum of that row + and a multiple of the reference row */ + in1 = *pSrcT2; + *pSrcT2++ = in1 - (in * *pPRT_pDst++); + + /* Decrement loop counter */ + j--; + } + } + + /* Increment temporary input pointer */ + pSrcT1 
= pSrcT1 + l; + + /* Decrement loop counter */ + k--; + + /* Increment pivot index */ + i++; + } + + /* Increment the input pointer */ + pSrc++; + + /* Decrement the loop counter */ + loopCnt--; + + /* Increment the index modifier */ + l++; + } + + if ((flag != 1U) && (in == 0)) { + for (i = 0; i < m * n; i++) { + if (pSrc[i] != 0) + break; + } + + if (i == m * n) + return 1; + } + + return 0; +} + diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c new file mode 100644 index 000000000..587ee06b0 --- /dev/null +++ b/software/apps/MP_matrix_inverse/main.c @@ -0,0 +1,82 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +//#include +//#include + +#define N 5 + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "inverse.h" +#include "initialization.h" + +// C++ program to find Moore-Penrose inverse matrix + +// Generic function to display the matrix. We use it to display +// both adjoin and inverse. adjoin is integer matrix and inverse +// is a int32_t. 
+void display(int32_t *A, int32_t n)
+{
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < n; j++) {
+      printf("%4d ", A[i * n + j]);
+    }
+    printf("\n");
+  }
+}
+
+// Driver program: core 0 chains Transpose, MatrixMult and inverse to form
+// inverse(A^T * A) * A^T for the matrix below and prints every intermediate
+// result; the other cores only execute the barrier calls.
+int main(void) {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+  // NOTE(review): the data below is laid out as 7 x 7 (49 values) while the
+  // array holds N * N elements; with N == 5 the initializer has excess
+  // elements. Confirm whether N should be 7 or the data trimmed to 5 x 5.
+  int32_t matrix[N * N] = { -2, 2, 7, 9, 4, 0, 8,
+                             1, 0, 0, 3, 1, 0, 9,
+                            -3, 1, 5, 0, 2, 1, 7,
+                             3,-1,-9, 4, 6, 5, 2,
+                             1, 0, 4, 4, 1, 0, 9,
+                             8, 0, 3, 8, 6, 5, 2,
+                             5, 6, 4, 1, 3, 2, 0 };
+
+  int32_t t_matrix[N * N];
+  int32_t matrix_mult[N * N];
+  int32_t pseudoinverse[N * N];
+  int32_t inv[N * N]; // To store inverse
+
+  if (core_id == 0) {
+    display(matrix, N);
+
+    Transpose(matrix, t_matrix, N);
+    printf("\nThe Transpose is :\n");
+    display(t_matrix, N);
+    printf("The product of the matrix is: \n");
+    MatrixMult(t_matrix, matrix, matrix_mult, N);
+    display(matrix_mult, N);
+    printf("\nThe Inverse is :\n");
+    // NOTE(review): inv is printed only when inverse() returns non-zero but
+    // is multiplied below regardless; confirm inv is valid on a zero return.
+    if (inverse(matrix_mult, inv, N)) {
+      display(inv, N);
+    }
+    MatrixMult(inv, t_matrix, pseudoinverse, N);
+    printf("\nThe Moore-Penrose inverse is :\n");
+    display(pseudoinverse, N);
+  }
+
+  mempool_barrier(num_cores);
+  return 0;
+}

From c56a0552808469af4c91c9b63bbe4fc24abd49e9 Mon Sep 17 00:00:00 2001
From: Marco Bertuletti
Date: Wed, 13 Jul 2022 18:39:55 +0200
Subject: [PATCH 02/22] [software] add singular value decomposition kernel

---
 software/apps/svd/SVD_Householder.txt | 781 ++++++++++++++++++++++++++
 software/apps/svd/main.c              |  93 +++
 software/apps/svd/nrutil.h            |  65 +++
 software/apps/svd/svd.c               | 237 ++++++++
 4 files changed, 1176 insertions(+)
 create mode 100644
software/apps/svd/SVD_Householder.txt
 create mode 100644 software/apps/svd/main.c
 create mode 100644 software/apps/svd/nrutil.h
 create mode 100644 software/apps/svd/svd.c

diff --git a/software/apps/svd/SVD_Householder.txt b/software/apps/svd/SVD_Householder.txt
new file mode 100644
index 000000000..1631212de
--- /dev/null
+++ b/software/apps/svd/SVD_Householder.txt
@@ -0,0 +1,781 @@
+////////////////////////////////////////////////////////////////////////////////
+// File: singular_value_decomposition.c //
+// Contents: //
+// Singular_Value_Decomposition //
+// Singular_Value_Decomposition_Solve //
+// Singular_Value_Decomposition_Inverse //
+////////////////////////////////////////////////////////////////////////////////
+
+#include <string.h> // required for memcpy()
+#include <float.h> // required for DBL_EPSILON
+#include <math.h> // required for fabs(), sqrt();
+
+#define MAX_ITERATION_COUNT 30 // Maximum number of iterations
+
+// Internally Defined Routines
+static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,
+    int ncols, double* U, double* V, double* diagonal, double* superdiagonal );
+static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols,
+    double* U, double* V, double* diagonal, double* superdiagonal );
+static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
+    double* singular_value, double* U, double* V);
+
+////////////////////////////////////////////////////////////////////////////////
+// int Singular_Value_Decomposition(double* A, int nrows, int ncols, //
+// double* U, double* singular_values, double* V, double* dummy_array) //
+// //
+// Description: //
+// This routine decomposes an m x n matrix A, with m >= n, into a product //
+// of the three matrices U, D, and V', i.e. A = UDV', where U is an m x n //
+// matrix whose columns are orthogonal, D is a n x n diagonal matrix, and //
+// V is an n x n orthogonal matrix. V' denotes the transpose of V. If //
+// m < n, then the procedure may be used for the matrix A'.
The singular // +// values of A are the diagonal elements of the diagonal matrix D and // +// correspond to the positive square roots of the eigenvalues of the // +// matrix A'A. // +// // +// The procedure programmed here is based on the method of Golub and // +// Reinsch as given on pages 134 - 151 of the "Handbook for Automatic // +// Computation vol II - Linear Algebra" edited by Wilkinson and Reinsch // +// and published by Springer-Verlag, 1971. // +// // +// Golub and Reinsch's method for decomposing the matrix A into the // +// product U, D, and V' is performed in three stages: // +// Stage 1: Decompose A into the product of three matrices U1, B, V1' // +// A = U1 B V1' where B is a bidiagonal matrix, and U1 and V1 are // +// products of Householder transformations. // +// Stage 2: Use Givens transformations to reduce the bidiagonal matrix // +// B into the product of the three matrices U2, D, V2'. The singular // +// value decomposition is then UDV' where U = U1 U2 and V' = V2' V1'. // +// Stage 3: Sort the matrix D in decreasing order of the singular // +// values and interchange the columns of both U and V to reflect any // +// change in the order of the singular values. // +// // +// After performing the singular value decomposition for A, call // +// Singular_Value_Decomposition_Solve to solve the equation Ax = B or call // +// Singular_Value_Decomposition_Inverse to calculate the pseudo-inverse // +// of A. // +// // +// Arguments: // +// double* A // +// On input, the pointer to the first element of the matrix // +// A[nrows][ncols]. The matrix A is unchanged. // +// int nrows // +// The number of rows of the matrix A. // +// int ncols // +// The number of columns of the matrix A. // +// double* U // +// On input, a pointer to a matrix with the same number of rows and // +// columns as the matrix A. On output, the matrix with mutually // +// orthogonal columns which is the left-most factor in the singular // +// value decomposition of A. 
// +// double* singular_values // +// On input, a pointer to an array dimensioned the same as the number // +// of columns of the matrix A, ncols. On output, the singular values // +// of the matrix A sorted in decreasing order. This array corresponds // +// to the diagonal matrix in the singular value decomposition of A. // +// double* V // +// On input, a pointer to a square matrix with the same number of rows // +// and columns as the columns of the matrix A, i.e. V[ncols][ncols]. // +// On output, the orthogonal matrix whose transpose is the right-most // +// factor in the singular value decomposition of A. // +// double* dummy_array // +// On input, a pointer to an array dimensioned the same as the number // +// of columns of the matrix A, ncols. This array is used to store // +// the super-diagonal elements resulting from the Householder reduction// +// of the matrix A to bidiagonal form, and as an input to the Givens // +// procedure that reduces the bidiagonal form to diagonal form. // +// // +// Return Values: // +// 0 Success // +// -1 Failure - During the Givens reduction of the bidiagonal form to // +// diagonal form the procedure failed to terminate within // +// MAX_ITERATION_COUNT iterations. // +// // +// Example: // +// #define M // +// #define N // +// double A[M][N]; // +// double U[M][N]; // +// double V[N][N]; // +// double singular_values[N]; // +// double* dummy_array; // +// // +// (your code to initialize the matrix A) // +// dummy_array = (double*) malloc(N * sizeof(double)); // +// if (dummy_array == NULL) {printf(" No memory available\n"); exit(0); } // +// // +// err = Singular_Value_Decomposition((double*) A, M, N, (double*) U, // +// singular_values, (double*) V, dummy_array); // +// // +// free(dummy_array); // +// if (err < 0) printf(" Failed to converge\n"); // +// else { printf(" The singular value decomposition of A is \n"); // +// ... 
// +//////////////////////////////////////////////////////////////////////////////// +// // +int Singular_Value_Decomposition(double* A, int nrows, int ncols, double* U, + double* singular_values, double* V, double* dummy_array) +{ + Householders_Reduction_to_Bidiagonal_Form( A, nrows, ncols, U, V, + singular_values, dummy_array); + + if (Givens_Reduction_to_Diagonal_Form( nrows, ncols, U, V, + singular_values, dummy_array ) < 0) return -1; + + Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, U, V); + + return 0; +} + + +//////////////////////////////////////////////////////////////////////////////// +// static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,// +// int ncols, double* U, double* V, double* diagonal, double* superdiagonal )// +// // +// Description: // +// This routine decomposes an m x n matrix A, with m >= n, into a product // +// of the three matrices U, B, and V', i.e. A = UBV', where U is an m x n // +// matrix whose columns are orthogonal, B is a n x n bidiagonal matrix, // +// and V is an n x n orthogonal matrix. V' denotes the transpose of V. // +// If m < n, then the procedure may be used for the matrix A'. The // +// // +// The matrix U is the product of Householder transformations which // +// annihilate the subdiagonal components of A while the matrix V is // +// the product of Householder transformations which annihilate the // +// components of A to the right of the superdiagonal. 
// +// // +// The Householder transformation which leaves invariant the first k-1 // +// elements of the k-th column and annihilates all the elements below // +// the diagonal element is P = I - (2/u'u)uu', where u is an nrows-dimensional // +// vector the first k-1 components of which are zero and the last // +// components agree with the current transformed matrix below the // +// diagonal; the remaining k-th element is the diagonal element - s, where// +// s = (+/-)sqrt(sum of squares of the elements on and below the diagonal), // +// with the sign chosen opposite that of the diagonal element. // +// // +// Arguments: // +// double* A // +// On input, the pointer to the first element of the matrix // +// A[nrows][ncols]. The matrix A is unchanged. // +// int nrows // +// The number of rows of the matrix A. // +// int ncols // +// The number of columns of the matrix A. // +// double* U // +// On input, a pointer to a matrix with the same number of rows and // +// columns as the matrix A. On output, the matrix with mutually // +// orthogonal columns which is the left-most factor in the bidiagonal // +// decomposition of A. // +// double* V // +// On input, a pointer to a square matrix with the same number of rows // +// and columns as the columns of the matrix A, i.e. V[ncols][ncols]. // +// On output, the orthogonal matrix whose transpose is the right-most // +// factor in the bidiagonal decomposition of A. // +// double* diagonal // +// On input, a pointer to an array dimensioned the same as the number // +// of columns of the matrix A, ncols. On output, the diagonal of the // +// bidiagonal matrix. // +// double* superdiagonal // +// On input, a pointer to an array dimensioned the same as the number // +// of columns of the matrix A, ncols. On output, the superdiagonal // +// of the bidiagonal matrix. // +// // +// Return Values: // +// The function is of type void and therefore does not return a value. 
// +// The matrices U, V, and the diagonal and superdiagonal are calculated // +// using the addresses passed in the argument list. // +// // +// Example: // +// #define M // +// #define N // +// double A[M][N]; // +// double U[M][N]; // +// double V[N][N]; // +// double diagonal[N]; // +// double superdiagonal[N]; // +// // +// (your code to initialize the matrix A - Note this routine is not // +// (accessible from outside i.e. it is declared static) // +// // +// Householders_Reduction_to_Bidiagonal_Form((double*) A, nrows, ncols, // +// (double*) U, (double*) V, diagonal, superdiagonal ) // +// // +// free(dummy_array); // +// ... // +//////////////////////////////////////////////////////////////////////////////// +// // +static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows, + int ncols, double* U, double* V, double* diagonal, double* superdiagonal ) +{ + int i,j,k,ip1; + double s, s2, si, scale; + double dum; + double *pu, *pui, *pv, *pvi; + double half_norm_squared; + +// Copy A to U + + memcpy(U,A, sizeof(double) * nrows * ncols); + +// + + diagonal[0] = 0.0; + s = 0.0; + scale = 0.0; + for ( i = 0, pui = U, ip1 = 1; i < ncols; pui += ncols, i++, ip1++ ) { + superdiagonal[i] = scale * s; +// +// Perform Householder transform on columns. +// +// Calculate the normed squared of the i-th column vector starting at +// row i. +// + for (j = i, pu = pui, scale = 0.0; j < nrows; j++, pu += ncols) + scale += fabs( *(pu + i) ); + + if (scale > 0.0) { + for (j = i, pu = pui, s2 = 0.0; j < nrows; j++, pu += ncols) { + *(pu + i) /= scale; + s2 += *(pu + i) * *(pu + i); + } +// +// +// Chose sign of s which maximizes the norm +// + s = ( *(pui + i) < 0.0 ) ? sqrt(s2) : -sqrt(s2); +// +// Calculate -2/u'u +// + half_norm_squared = *(pui + i) * s - s2; +// +// Transform remaining columns by the Householder transform. 
+// + *(pui + i) -= s; + + for (j = ip1; j < ncols; j++) { + for (k = i, si = 0.0, pu = pui; k < nrows; k++, pu += ncols) + si += *(pu + i) * *(pu + j); + si /= half_norm_squared; + for (k = i, pu = pui; k < nrows; k++, pu += ncols) { + *(pu + j) += si * *(pu + i); + } + } + } + for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) *= scale; + diagonal[i] = s * scale; +// +// Perform Householder transform on rows. +// +// Calculate the normed squared of the i-th row vector starting at +// column i. +// + s = 0.0; + scale = 0.0; + if (i >= nrows || i == (ncols - 1) ) continue; + for (j = ip1; j < ncols; j++) scale += fabs ( *(pui + j) ); + if ( scale > 0.0 ) { + for (j = ip1, s2 = 0.0; j < ncols; j++) { + *(pui + j) /= scale; + s2 += *(pui + j) * *(pui + j); + } + s = ( *(pui + ip1) < 0.0 ) ? sqrt(s2) : -sqrt(s2); +// +// Calculate -2/u'u +// + half_norm_squared = *(pui + ip1) * s - s2; +// +// Transform the rows by the Householder transform. +// + *(pui + ip1) -= s; + for (k = ip1; k < ncols; k++) + superdiagonal[k] = *(pui + k) / half_norm_squared; + if ( i < (nrows - 1) ) { + for (j = ip1, pu = pui + ncols; j < nrows; j++, pu += ncols) { + for (k = ip1, si = 0.0; k < ncols; k++) + si += *(pui + k) * *(pu + k); + for (k = ip1; k < ncols; k++) { + *(pu + k) += si * superdiagonal[k]; + } + } + } + for (k = ip1; k < ncols; k++) *(pui + k) *= scale; + } + } + +// Update V + pui = U + ncols * (ncols - 2); + pvi = V + ncols * (ncols - 1); + *(pvi + ncols - 1) = 1.0; + s = superdiagonal[ncols - 1]; + pvi -= ncols; + for (i = ncols - 2, ip1 = ncols - 1; i >= 0; i--, pui -= ncols, + pvi -= ncols, ip1-- ) { + if ( s != 0.0 ) { + pv = pvi + ncols; + for (j = ip1; j < ncols; j++, pv += ncols) + *(pv + i) = ( *(pui + j) / *(pui + ip1) ) / s; + for (j = ip1; j < ncols; j++) { + si = 0.0; + for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols) + si += *(pui + k) * *(pv + j); + for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols) + *(pv + j) += si * *(pv + 
i); + } + } + pv = pvi + ncols; + for ( j = ip1; j < ncols; j++, pv += ncols ) { + *(pvi + j) = 0.0; + *(pv + i) = 0.0; + } + *(pvi + i) = 1.0; + s = superdiagonal[i]; + } + +// Update U + + pui = U + ncols * (ncols - 1); + for (i = ncols - 1, ip1 = ncols; i >= 0; ip1 = i, i--, pui -= ncols ) { + s = diagonal[i]; + for ( j = ip1; j < ncols; j++) *(pui + j) = 0.0; + if ( s != 0.0 ) { + for (j = ip1; j < ncols; j++) { + si = 0.0; + pu = pui + ncols; + for (k = ip1; k < nrows; k++, pu += ncols) + si += *(pu + i) * *(pu + j); + si = (si / *(pui + i) ) / s; + for (k = i, pu = pui; k < nrows; k++, pu += ncols) + *(pu + j) += si * *(pu + i); + } + for (j = i, pu = pui; j < nrows; j++, pu += ncols){ + *(pu + i) /= s; + } + } + else + for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) = 0.0; + *(pui + i) += 1.0; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, // +// double* U, double* V, double* diagonal, double* superdiagonal ) // +// // +// Description: // +// This routine decomposes a bidiagonal matrix given by the arrays // +// diagonal and superdiagonal into a product of three matrices U1, D and // +// V1', the matrix U1 premultiplies U and is returned in U, the matrix // +// V1 premultiplies V and is returned in V. The matrix D is a diagonal // +// matrix and replaces the array diagonal. // +// // +// The method used to annihilate the offdiagonal elements is a variant // +// of the QR transformation. The method consists of applying Givens // +// rotations to the right and the left of the current matrix until // +// the new off-diagonal elements are chased out of the matrix. // +// // +// The process is an iterative process which due to roundoff errors may // +// not converge within a predefined number of iterations. (This should // +// be unusual.) // +// // +// Arguments: // +// int nrows // +// The number of rows of the matrix U. 
// +// int ncols // +// The number of columns of the matrix U. // +// double* U // +// On input, a pointer to a matrix already initialized to a matrix // +// with mutually orthogonal columns. On output, the matrix with // +// mutually orthogonal columns. // +// double* V // +// On input, a pointer to a square matrix with the same number of rows // +// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. // +// The matrix V is assumed to be initialized to an orthogonal matrix. // +// On output, V is an orthogonal matrix. // +// double* diagonal // +// On input, a pointer to an array of dimension ncols which initially // +// contains the diagonal of the bidiagonal matrix. On output, // +// it contains the diagonal of the diagonal matrix. // +// double* superdiagonal // +// On input, a pointer to an array of dimension ncols in which initially // +// the first component is zero and the successive components form the // +// superdiagonal of the bidiagonal matrix. // +// // +// Return Values: // +// 0 Success // +// -1 Failure - The procedure failed to terminate within // +// MAX_ITERATION_COUNT iterations. // +// // +// Example: // +// #define M // +// #define N // +// double U[M][N]; // +// double V[N][N]; // +// double diagonal[N]; // +// double superdiagonal[N]; // +// int err; // +// // +// (your code to initialize the matrices U, V, diagonal, and ) // +// ( superdiagonal. - Note this routine is not accessible from outside) // +// ( i.e. it is declared static.) // +// // +// err = Givens_Reduction_to_Diagonal_Form( M,N,(double*)U,(double*)V, // +// diagonal, superdiagonal ); // +// if ( err < 0 ) printf("Failed to converge\n"); // +// else { ... } // +// ... 
// +//////////////////////////////////////////////////////////////////////////////// +// // +static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, + double* U, double* V, double* diagonal, double* superdiagonal ) +{ + + double epsilon; + double c, s; + double f,g,h; + double x,y,z; + double *pu, *pv; + int i,j,k,m; + int rotation_test; + int iteration_count; + + for (i = 0, x = 0.0; i < ncols; i++) { + y = fabs(diagonal[i]) + fabs(superdiagonal[i]); + if ( x < y ) x = y; + } + epsilon = x * DBL_EPSILON; + for (k = ncols - 1; k >= 0; k--) { + iteration_count = 0; + while(1) { + rotation_test = 1; + for (m = k; m >= 0; m--) { + if (fabs(superdiagonal[m]) <= epsilon) {rotation_test = 0; break;} + if (fabs(diagonal[m-1]) <= epsilon) break; + } + if (rotation_test) { + c = 0.0; + s = 1.0; + for (i = m; i <= k; i++) { + f = s * superdiagonal[i]; + superdiagonal[i] *= c; + if (fabs(f) <= epsilon) break; + g = diagonal[i]; + h = sqrt(f*f + g*g); + diagonal[i] = h; + c = g / h; + s = -f / h; + for (j = 0, pu = U; j < nrows; j++, pu += ncols) { + y = *(pu + m - 1); + z = *(pu + i); + *(pu + m - 1 ) = y * c + z * s; + *(pu + i) = -y * s + z * c; + } + } + } + z = diagonal[k]; + if (m == k ) { + if ( z < 0.0 ) { + diagonal[k] = -z; + for ( j = 0, pv = V; j < ncols; j++, pv += ncols) + *(pv + k) = - *(pv + k); + } + break; + } + else { + if ( iteration_count >= MAX_ITERATION_COUNT ) return -1; + iteration_count++; + x = diagonal[m]; + y = diagonal[k-1]; + g = superdiagonal[k-1]; + h = superdiagonal[k]; + f = ( (y - z) * ( y + z ) + (g - h) * (g + h) )/(2.0 * h * y); + g = sqrt( f * f + 1.0 ); + if ( f < 0.0 ) g = -g; + f = ( (x - z) * (x + z) + h * (y / (f + g) - h) ) / x; +// Next QR Transformtion + c = 1.0; + s = 1.0; + for (i = m + 1; i <= k; i++) { + g = superdiagonal[i]; + y = diagonal[i]; + h = s * g; + g *= c; + z = sqrt( f * f + h * h ); + superdiagonal[i-1] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = -x * s + g * c; + h = y * s; + y *= c; + 
for (j = 0, pv = V; j < ncols; j++, pv += ncols) { + x = *(pv + i - 1); + z = *(pv + i); + *(pv + i - 1) = x * c + z * s; + *(pv + i) = -x * s + z * c; + } + z = sqrt( f * f + h * h ); + diagonal[i - 1] = z; + if (z != 0.0) { + c = f / z; + s = h / z; + } + f = c * g + s * y; + x = -s * g + c * y; + for (j = 0, pu = U; j < nrows; j++, pu += ncols) { + y = *(pu + i - 1); + z = *(pu + i); + *(pu + i - 1) = c * y + s * z; + *(pu + i) = -s * y + c * z; + } + } + superdiagonal[m] = 0.0; + superdiagonal[k] = f; + diagonal[k] = x; + } + } + } + return 0; +} + + +//////////////////////////////////////////////////////////////////////////////// +// static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols, // +// double* singular_values, double* U, double* V) // +// // +// Description: // +// This routine sorts the singular values from largest to smallest // +// singular value and interchanges the columns of U and the columns of V // +// whenever a swap is made. I.e. if the i-th singular value is swapped // +// with the j-th singular value, then the i-th and j-th columns of U are // +// interchanged and the i-th and j-th columns of V are interchanged. // +// // +// Arguments: // +// int nrows // +// The number of rows of the matrix U. // +// int ncols // +// The number of columns of the matrix U. // +// double* singular_values // +// On input, a pointer to the array of singular values. On output, the// +// sorted array of singular values. // +// double* U // +// On input, a pointer to a matrix already initialized to a matrix // +// with mutually orthogonal columns. On output, the matrix with // +// mutually orthogonal possibly permuted columns. // +// double* V // +// On input, a pointer to a square matrix with the same number of rows // +// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. // +// The matrix V is assumed to be initialized to an orthogonal matrix. 
//        On output, V is an orthogonal matrix with possibly permuted columns.//
//                                                                            //
//  Return Values:                                                            //
//        The function is of type void.                                       //
//                                                                            //
//  Example:                                                                  //
//     #define M                                                              //
//     #define N                                                              //
//     double U[M][N];                                                        //
//     double V[N][N];                                                        //
//     double diagonal[N];                                                    //
//                                                                            //
//     (your code to initialize the matrices U, V, and diagonal. )            //
//     ( - Note this routine is not accessible from outside)                  //
//     ( i.e. it is declared static.)                                         //
//                                                                            //
//     Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values,      //
//                                        (double*) U, (double*) V);          //
//     ...                                                                    //
////////////////////////////////////////////////////////////////////////////////
//                                                                            //
// Selection sort, largest value first.  Each time two singular values trade
// places, the matching columns of U (nrows rows) and of V (ncols rows) trade
// places as well; both matrices are addressed with a row stride of ncols.
static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols,
                                double* singular_values, double* U, double* V)
{
   int slot, probe, best, row;
   double held;

   for (slot = 0; slot < ncols - 1; slot++) {
      // Find the largest value at or after 'slot'.  The strict comparison
      // keeps the earliest position among equal values.
      best = slot;
      for (probe = slot + 1; probe < ncols; probe++) {
         if (singular_values[probe] > singular_values[best]) {
            best = probe;
         }
      }

      if (best != slot) {
         held = singular_values[slot];
         singular_values[slot] = singular_values[best];
         singular_values[best] = held;

         // Swap column 'slot' with column 'best' in U ...
         for (row = 0; row < nrows; row++) {
            held = U[row * ncols + best];
            U[row * ncols + best] = U[row * ncols + slot];
            U[row * ncols + slot] = held;
         }
         // ... and in V.
         for (row = 0; row < ncols; row++) {
            held = V[row * ncols + best];
            V[row * ncols + best] = V[row * ncols + slot];
            V[row * ncols + slot] = held;
         }
      }
   }
}


////////////////////////////////////////////////////////////////////////////////
//  void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,  //
//              double tolerance, int nrows, int ncols, double *B, double* x) //
//                                                                            //
//  Description:                                                              //
//     This routine solves the system of linear equations Ax=B where A =UDV', //
//     is the singular value decomposition of A.  Given UDV'x=B, then         //
//     x = V(1/D)U'B, where 1/D is the pseudo-inverse of D, i.e. if D[i] > 0  //
//     then (1/D)[i] = 1/D[i] and if D[i] = 0, then (1/D)[i] = 0.             //
//     Since the singular values are subject to round-off error, a tolerance  //
//     is given so that if D[i] < tolerance, D[i] is treated as if it is 0.   //
//     The default tolerance is D[0] * DBL_EPSILON * ncols; if the user       //
//     specified tolerance is less than the default tolerance, the default    //
//     tolerance is used.                                                     //
//                                                                            //
//  Arguments:                                                                //
//     double* U                                                              //
//        A matrix with mutually orthonormal columns.                         //
//     double* D                                                              //
//        A diagonal matrix with non-increasing non-negative diagonal         //
//        elements, i.e. D[i] >= D[j] if i < j and D[i] >= 0 for all i.       //
//     double* V                                                              //
//        An orthogonal matrix.                                               //
//     double tolerance                                                       //
//        A lower bound for non-zero singular values (provided tolerance >    //
//        ncols * DBL_EPSILON * D[0]).                                        //
//     int nrows                                                              //
//        The number of rows of the matrix U and B.                           //
//     int ncols                                                              //
//        The number of columns of the matrix U.  Also the number of rows and //
//        columns of the matrices D and V.                                    //
//     double* B                                                              //
//        A pointer to a vector dimensioned as nrows which is the right-hand  //
//        side of the equation Ax = B where A = UDV'.                         //
//     double* x                                                              //
//        A pointer to a vector dimensioned as ncols, which is the least      //
//        squares solution of the equation Ax = B where A = UDV'.             //
//                                                                            //
//  Return Values:                                                            //
//        The function is of type void.                                       //
//                                                                            //
//  Example:                                                                  //
//     #define M                                                              //
//     #define N                                                              //
//     #define NB                                                             //
//     double U[M][N];                                                        //
//     double V[N][N];                                                        //
//     double D[N];                                                           //
//     double B[M];                                                           //
//     double x[N];                                                           //
//     double tolerance;                                                      //
//                                                                            //
//     (your code to initialize the matrices U,D,V,B)                         //
//                                                                            //
//     Singular_Value_Decomposition_Solve((double*) U, D, (double*) V,        //
//                                        tolerance, M, N, B, x);             //
//                                                                            //
//     printf(" The solution of Ax=B is \n");                                 //
//     ...                                                                    //
////////////////////////////////////////////////////////////////////////////////
//                                                                            //

// Least-squares solve of A x = B from the factors A = U D V':
//    x = V (1/D) U' B, skipping every j whose D[j] is not above tolerance.
// U is read as nrows x ncols and V as ncols x ncols, both with a row stride
// of ncols.  B supplies nrows entries and x receives ncols entries.
// tolerance is raised to DBL_EPSILON * D[0] * ncols when the caller passes
// a smaller value.  D[0] is always read, so D must hold at least one entry.
void Singular_Value_Decomposition_Solve(double* U, double* D, double* V,
                double tolerance, int nrows, int ncols, double *B, double* x)
{
   int i,j,k;
   double *pu, *pv;
   double dum;

   dum = DBL_EPSILON * D[0] * (double) ncols;
   if (tolerance < dum) tolerance = dum;

   for (i = 0; i < ncols; i++) x[i] = 0.0;

   // (U'B)[j] does not depend on the output index i, so it is formed once
   // per retained singular value rather than once per (i, j) pair.  Each
   // x[i] still receives its terms in increasing j, so the result matches
   // the per-pair formulation.
   for (j = 0; j < ncols; j++) {
      if (!(D[j] > tolerance)) continue;
      for (k = 0, dum = 0.0, pu = U; k < nrows; k++, pu += ncols)
         dum += *(pu + j) * B[k];
      for (i = 0, pv = V; i < ncols; i++, pv += ncols)
         x[i] += dum * *(pv + j) / D[j];
   }
}


////////////////////////////////////////////////////////////////////////////////
//  void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,//
//                    double tolerance, int nrows, int ncols, double *Astar)  //
//                                                                            //
//  Description:                                                              //
//     This routine calculates the pseudo-inverse of the matrix A = UDV'.     //
//     where U, D, V constitute the singular value decomposition of A.        //
//     Let Astar be the pseudo-inverse then Astar = V(1/D)U', where 1/D is    //
//     the pseudo-inverse of D, i.e. if D[i] > 0 then (1/D)[i] = 1/D[i] and   //
//     if D[i] = 0, then (1/D)[i] = 0.  Because the singular values are       //
//     subject to round-off error.  A tolerance is given so that if           //
//     D[i] < tolerance, D[i] is treated as if it were 0.                     //
//     The default tolerance is D[0] * DBL_EPSILON * ncols, assuming that the //
//     diagonal matrix of singular values is sorted from largest to smallest, //
//     if the user specified tolerance is less than the default tolerance,    //
//     then the default tolerance is used.                                    //
//                                                                            //
//  Arguments:                                                                //
//     double* U                                                              //
//        A matrix with mutually orthonormal columns.                         //
//     double* D                                                              //
//        A diagonal matrix with decreasing non-negative diagonal elements.   //
//        i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i.                  //
//     double* V                                                              //
//        An orthogonal matrix.                                               //
// +// double tolerance // +// An lower bound for non-zero singular values (provided tolerance > // +// ncols * DBL_EPSILON * D[0]). // +// int nrows // +// The number of rows of the matrix U and B. // +// int ncols // +// The number of columns of the matrix U. Also the number of rows and // +// columns of the matrices D and V. // +// double* Astar // +// On input, a pointer to the first element of an ncols x nrows matrix.// +// On output, the pseudo-inverse of UDV'. // +// // +// Return Values: // +// The function is of type void. // +// // +// Example: // +// #define M // +// #define N // +// double U[M][N]; // +// double V[N][N]; // +// double D[N]; // +// double Astar[N][M]; // +// double tolerance; // +// // +// (your code to initialize the matrices U,D,V) // +// // +// Singular_Value_Decomposition_Inverse((double*) U, D, (double*) V, // +// tolerance, M, N, (double*) Astar); // +// // +// printf(" The pseudo-inverse of A = UDV' is \n"); // +// ... // +//////////////////////////////////////////////////////////////////////////////// +// // + +void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V, + double tolerance, int nrows, int ncols, double *Astar) +{ + int i,j,k; + double *pu, *pv, *pa; + double dum; + + dum = DBL_EPSILON * D[0] * (double) ncols; + if (tolerance < dum) tolerance = dum; + for ( i = 0, pv = V, pa = Astar; i < ncols; i++, pv += ncols) + for ( j = 0, pu = U; j < nrows; j++, pa++) + for (k = 0, *pa = 0.0; k < ncols; k++, pu++) + if (D[k] > tolerance) *pa += *(pv + k) * *pu / D[k]; +} diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c new file mode 100644 index 000000000..18e35f510 --- /dev/null +++ b/software/apps/svd/main.c @@ -0,0 +1,93 @@ +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#include "nrutil.h" +#include "svd.c" + + +// Define Matrix dimensions: +#define M 4 +#define N 32 + +int32_t matrix_U[M * N] __attribute__((section(".l1_prio"))); +int32_t 
matrix_V[M * N] __attribute__((section(".l1_prio"))); +int32_t matrix_W[N] __attribute__((section(".l1_prio"))); + +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + int32_t a, int32_t b, int32_t c, uint32_t core_id, + uint32_t num_cores) { + uint32_t const split = 8; // How many rows/columns to split the matrix into + if (num_columns > num_rows) { + // Parallelize over columns + uint32_t const c_start = (num_rows / split) * (core_id % split); + uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); + for (uint32_t j = (core_id / split); j < num_columns; + j += (num_cores / split)) { + for (uint32_t i = c_start; i < c_end; ++i) { + matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; + } + } + } else { + // Parallelize over rows + uint32_t const c_start = (num_columns / split) * (core_id % split); + uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); + for (uint32_t i = (core_id / split); i < num_rows; + i += (num_cores / split)) { + for (uint32_t j = c_start; j < c_end; ++j) { + matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; + } + } + } +} + +void init_vector(int32_t *vector, uint32_t num_el, + int32_t a, int32_t b, uint32_t core_id) { + uint32_t const split = 8; // How many blocks to split the vector into + uint32_t const reminder = num_el % split; + uint32_t i, j; + for (i = core_id * split; i < core_id * split + split; i++) { + j = i % split; + vector[i] = a * (int32_t)j + b; + } + while (i < reminder) { + j = i % split; + vector[i] = a * (int32_t)j + b; + } +} + +int volatile error __attribute__((section(".l1"))); + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + if (core_id == 0) { + error = 0; + } + + int32_t const U_a = 1; + int32_t const U_b = 1; + int32_t const U_c = -32; + int32_t const V_a = 2; + int32_t const V_b = 1; + 
int32_t const V_c = 16; + // Init matrix + init_matrix(matrix_U, M, N, U_a, U_b, U_c, core_id, num_cores); + init_matrix(matrix_V, M, N, V_a, V_b, V_c, core_id, num_cores); + init_vector(matrix_W, N, V_a, V_b, core_id); + mempool_barrier(num_cores); + + if (core_id == 0) { + // Test the Matri x SVD + svdcmp(matrix_U, M, N, matrix_W, matrix_V); + } + + // Wait until all cores have finished + mempool_barrier(num_cores); + + return error; +} diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h new file mode 100644 index 000000000..27b55fec2 --- /dev/null +++ b/software/apps/svd/nrutil.h @@ -0,0 +1,65 @@ +//#include +//#include +//#include + +#ifndef NR_UTILS_H +#define NR_UTILS_H + +#define NR_END 1 +#define FREE_ARG char * + +static int32_t sqrarg; +#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg *sqrarg) +static int32_t dsqrarg; +#define DSQR(a) ((dsqrarg = (a)) == 0 ? 0 : dsqrarg *dsqrarg) +static int32_t dmaxarg1, dmaxarg2; +#define DMAX(a, b) (dmaxarg1 = (a), dmaxarg2 = (b), (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2)) +static int32_t dminarg1, dminarg2; +#define DMIN(a, b) (dminarg1 = (a), dminarg2 = (b), (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2)) +static int32_t maxarg1, maxarg2; +#define FMAX(a, b) (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) +static int32_t minarg1, minarg2; +#define FMIN(a, b) (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2)) +static long lmaxarg1, lmaxarg2; +#define LMAX(a, b) (lmaxarg1 = (a), lmaxarg2 = (b), (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2)) +static long lminarg1, lminarg2; +#define LMIN(a, b) (lminarg1 = (a), lminarg2 = (b), (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2)) +static int32_t imaxarg1, imaxarg2; +#define IMAX(a, b) (imaxarg1 = (a), imaxarg2 = (b), (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2)) +static int32_t iminarg1, iminarg2; +#define IMIN(a, b) (iminarg1 = (a), iminarg2 = (b), (iminarg1) < (iminarg2) ? 
(iminarg1) : (iminarg2)) +#define ABS(a) (a < 0 ? -a : a) +#define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a)) + +int32_t sqrt_q32 ( const int32_t number, + const uint32_t fracBits); + +#define sqrt2 0b1011010100000100 +int32_t sqrt_q32 ( const int32_t number, + const uint32_t fracBits) { + + int32_t root = 0; + int32_t start = 0; + int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF) + int32_t mid; + + if (number > 0) { + while (start <= end) { + mid = (start + end) >> 1; + if (((mid * mid) >> fracBits) == number) { + root = mid; + break; + } + if (((mid * mid) >> fracBits) < number) { + start = mid + 1; + root = mid; + } else { + end = mid - 1; + } + } + } + + return root; +} + +#endif diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c new file mode 100644 index 000000000..a53c2695b --- /dev/null +++ b/software/apps/svd/svd.c @@ -0,0 +1,237 @@ +int32_t pythag(int32_t a, int32_t b); +void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v); + + +int32_t pythag(int32_t a, int32_t b) { + int32_t absa = ABS(a); + int32_t absb = ABS(b); + if (absa > absb) { + return absa * sqrt_q32(1 + SQR(absb / absa), 4); + } else { + return (absb == 0 ? 
0 : absb * sqrt_q32(1 + SQR(absa / absb), 4)); + } +} + +void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) { + int32_t flag, i, its, j, jj, k, l, nm; + int32_t anorm, c, f, g, h, s, scale, x, y, z; + int32_t rv1[n]; + + //printf("PROVA\n"); + + g = scale = anorm = 0.0; + for (i = 1; i <= n; i++) { + l = i + 1; + rv1[i] = scale * g; + g = s = scale = 0.0; + if (i <= m) { + for (k = i; k <= m; k++) { + scale += ABS(a[k * m + i]); + } + if (scale) { + for (k = i; k <= m; k++) { + a[k * m + i] /= scale; + s += a[k * m + i] * a[k * m + i]; + } + f = a[i * m + i]; + g = -SIGN(sqrt_q32(s,4), f); + h = f * g - s; + a[i * m + i] = f - g; + for (j = l; j <= n; j++) { + for (s = 0.0, k = i; k <= m; k++) { + s += a[k * m + i] * a[k * m + i]; + } + f = s / h; + for (k = i; k <= m; k++) { + a[k * m + i] += f * a[k * m + i]; + } + } + for (k = i; k <= m; k++) { + a[k * m + i] *= scale; + } + } + } + w[i] = scale * g; + g = s = scale = 0.0; + if (i <= m && i != n) { + for (k = l; k <= n; k++) { + scale += ABS(a[k * m + i]); + } + if (scale) { + for (k = l; k <= n; k++) { + a[k * m + i] /= scale; + s += a[i * m + k] * a[i * m + k]; + } + f = a[i * m + l]; + g = -SIGN(sqrt_q32(s,4), f); + h = f * g - s; + a[i * m + l] = f - g; + for (k = l; k <= n; k++) { + rv1[k] = a[i * m + k] / h; + } + for (j = l; j <= m; j++) { + for (s = 0, k = l; k <= n; k++) { + s += a[j * m + k] * a[i * m + k]; + } + for (k = l; k <= n; k++) { + a[j * m + k] += s * rv1[k]; + } + } + for (k = l; k <= n; k++) { + a[i * m + k] *= scale; + } + } + } + anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i]))); + } + + for (i = n; i >= 1; i--) { + if (i < n) { + if (g) { + for (j = l; j <= n; j++) { + v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g; + } + for (j = l; j <= n; j++) { + for (s = 0, k = l; k <= n; k++) { + s += a[i * m + k] * v[k * m + j]; + } + for (k = l; k <= n; k++) { + v[k * m + j] += s * v[k * m + i]; + } + } + } + for (j = l; j <= n; j++) { + v[i * m + j] = v[j * m + i] = 0; + } + } 
+ v[i * m + i] = 1; + g = rv1[i]; + l = i; + } + +// for (i = IMIN(m, n); i >= 1; i--) { +// l = i + 1; +// g = w[i]; +// for (j = l; j <= n; j++) { +// a[i][j] = 0; +// } +// if (g) { +// g = 1.0 / g; +// for (j = l; j <= n; j++) { +// for (s = 0.0, k = l; k <= m; k++) { +// s += a[k][i] * a[k][j]; +// } +// f = (s / a[i][i]) * g; +// for (k = i; k <= m; k++) { +// a[k][j] += f * a[k][i]; +// } +// } +// for (j = i; j <= m; j++) { +// a[j][i] *= g; +// } +// } else { for (j = i; j <= m; j++) { +// a[j][i] = 0.0; +// } +// } +// ++a[i][i]; +// } +// for (k = n; k >= 1; k--) { +// for (its = 1; its <= 30; its++) { +// flag = 1; +// for (l = k; l >= 1; l--) { +// nm = l - 1; +// if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) { +// flag = 0; +// break; +// } +// if ((int32_t) (ABS(w[nm]) + anorm) == anorm) { +// break; +// } +// } +// if (flag) { +// c = 0.0; +// s = 1.0; +// for (i = l; i <= k; i++) { +// f = s * rv1[i]; +// rv1[i] = c * rv1[i]; +// if ((int32_t) (ABS(f) + anorm) == anorm) { +// break; +// } +// g = w[i]; +// h = pythag(f, g); +// w[i] = h; +// h = 1.0 / h; +// c = g * h; +// s = -f * h; +// for (j = 1; j <= m; j++) { +// y = a[j][nm]; +// z = a[j][i]; +// a[j][nm] = y * c + z * s; +// a[j][i] = z * c - y * s; +// } +// } +// } +// z = w[k]; +// if (l == k) { +// if (z < 0.0) { +// w[k] = -z; +// for (j = 1; j <= n; j++) { +// v[j][k] = -v[j][k]; +// } +// } +// break; +// } +// if (its == 30) { +// exit(1); +// } +// x = w[l]; +// nm = k - 1; +// y = w[nm]; +// g = rv1[nm]; +// h = rv1[k]; +// f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); +// g = pythag(f, 1.0); +// f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; +// c = s = 1.0; +// for (j = l; j <= nm; j++) { +// i = j + 1; +// g = rv1[i]; +// y = w[i]; +// h = s * g; +// g = c * g; +// z = pythag(f, h); +// rv1[j] = z; +// c = f / z; +// s = h / z; +// f = x * c + g * s; +// g = g * c - x * s; +// h = y * s; +// y *= c; +// for (jj = 1; jj <= n; jj++) { +// x = 
v[jj][j]; +// z = v[jj][i]; +// v[jj][j] = x * c + z * s; +// v[jj][i] = z * c - x * s; +// } +// z = pythag(f, h); +// w[j] = z; +// if (z) { +// z = 1.0 / z; +// c = f * z; +// s = h * z; +// } +// f = c * g + s * y; +// x = c * y - s * g; +// for (jj = 1; jj <= m; jj++) { +// y = a[jj][j]; +// z = a[jj][i]; +// a[jj][j] = y * c + z * s; +// a[jj][i] = z * c - y * s; +// } +// } +// rv1[l] = 0.0; +// rv1[k] = f; +// w[k] = x; +// } +// } +} From 77becf1e3b431ac7a93ef7e8e15f7cfc35152a0f Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Thu, 14 Jul 2022 18:52:02 +0200 Subject: [PATCH 03/22] [software] Clean Gauss Jordan inverse function --- .../apps/MP_matrix_inverse/initialization.h | 34 +-- software/apps/MP_matrix_inverse/inverse.h | 213 ++++++------------ software/apps/MP_matrix_inverse/main.c | 73 +++--- software/runtime/serial.c | 2 +- 4 files changed, 118 insertions(+), 204 deletions(-) diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h index e7e834de9..9046f4be9 100644 --- a/software/apps/MP_matrix_inverse/initialization.h +++ b/software/apps/MP_matrix_inverse/initialization.h @@ -5,41 +5,24 @@ // Author: Marco Bertuletti, ETH Zurich void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores); + int32_t a, int32_t b, int32_t c, uint32_t core_id); void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id, uint32_t num_cores); + uint32_t core_id); void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) 
+ 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; + int32_t a, int32_t b, int32_t c, uint32_t core_id) { + if(core_id == 0) { + for(uint32_t i = 0; i < num_columns; i++) { + for(uint32_t j = 0; j < num_rows; j++) { + matrix[j * num_rows + i] = a * (int32_t)i + b * (int32_t)j + c; } } } } void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id, uint32_t num_cores) { + uint32_t core_id) { if(core_id == 0) { for(uint32_t i = 0; i < num_columns; i++) { @@ -47,7 +30,6 @@ void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_column matrix[j * num_rows + i] = 0; } } - printf("SONO QUI\n"); } } diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h index 19dfc9b1e..cb98aadac 100644 --- a/software/apps/MP_matrix_inverse/inverse.h +++ b/software/apps/MP_matrix_inverse/inverse.h @@ -4,21 +4,30 @@ // Author: Marco Bertuletti, ETH Zurich +#define FIXED_POINT 0 +#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT)/b)) +#define FIX_MUL(a,b) ((int32_t)((a*b) >> FIXED_POINT)) + +dump(prova, 1); + void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n); void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n); + + void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n); int32_t determinant(int32_t *A, int32_t n); void adjoint(int32_t *A,int32_t *adj, int32_t n); -int32_t 
inverse(int32_t *A, int32_t *inverse, int32_t n); +int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n); -int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n); - + +int GJ_inverse(int32_t *pSrc, int32_t *pDst, uint32_t n); + void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) { for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { @@ -26,7 +35,6 @@ void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) { } } } - void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) { int k; for (int i = 0; i < n; i++) { @@ -39,6 +47,8 @@ void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, i } } +/* CRAMER MATRIX INVERSION */ + // Function to get cofactor void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) { int32_t i = 0, j = 0; @@ -112,7 +122,7 @@ void adjoint(int32_t *A,int32_t *adj, int32_t n) { // Function to calculate and store inverse, returns false if // matrix is singular -int32_t inverse(int32_t *A, int32_t *inverse, int32_t n) { +int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n) { // Find determinant of A[][] int32_t det = determinant(A, n); if (det == 0) { @@ -127,263 +137,178 @@ int32_t inverse(int32_t *A, int32_t *inverse, int32_t n) { // Find Inverse using formula "inverse(A) = adj(A)/det(A)" for (int32_t i = 0; i < n; i++) for (int32_t j = 0; j < n; j++) - inverse[i * n + j]= adj[i * n + j] / det; + inverse[i * n + j]= FIX_DIV(adj[i * n + j], det); return 1; } +/* GAUSS JORDAN INVERSION */ -int plp_mat_inv_f32s_xpulpv2(int32_t *__restrict__ pSrc, int32_t *__restrict__ pDst, uint32_t n) { +int GJ_inverse(int32_t * pSrc, int32_t * pDst, uint32_t n) { int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; 
/* Temporary input and output data matrix pointer */ - int32_t Xchg, in = 0, in1; /* Temporary input values */ - uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */ + int32_t Xchg, x = 0, y; /* Temporary input values */ + uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */ uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ + pDstT1 = pDst; /* Working pointer for destination matrix */ + rowCnt = m; /* Loop over the number of rows */ - /* Working pointer for destination matrix */ - pDstT1 = pDst; - - /* Loop over the number of rows */ - rowCnt = m; - - /* Making the destination matrix as identity matrix */ + /* CREATE THE IDENTITY MATRIX */ while (rowCnt > 0U) { - /* Writing all zeroes in lower triangle of the destination matrix */ j = m - rowCnt; while (j > 0U) { *pDstT1++ = 0; j--; } - - /* Writing all ones in the diagonal of the destination matrix */ *pDstT1++ = 1; - - /* Writing all zeroes in upper triangle of the destination matrix */ j = rowCnt - 1U; while (j > 0U) { *pDstT1++ = 0; j--; } - - /* Decrement loop counter */ rowCnt--; } - /* Loop over the number of columns of the input matrix. - All the elements in each column are processed by the row operations */ + /* Loop over the number of columns of the input matrix. */ loopCnt = n; - /* Index modifier to navigate through the columns */ l = 0U; while (loopCnt > 0U) { - /* Check if the pivot element is zero.. + + /* CHECK IF PIVOT ELEMENT IS ZERO... * If it is zero then interchange the row with non zero row below. * If there is no non zero element to replace in the rows below, * then the matrix is Singular. 
*/ - /* Working pointer for the input matrix that points - * to the pivot element of the particular row */ pSrcT1 = pSrc + (l * n); - - /* Working pointer for the destination matrix that points - * to the pivot element of the particular row */ pDstT1 = pDst + (l * n); - - /* Temporary variable to hold the pivot value */ - in = *pSrcT1; - - /* Destination pointer modifier */ + x = *pSrcT1; k = 1U; - - /* Check if the pivot element is zero */ - if (*pSrcT1 == 0) { - /* Loop over the number rows present below */ - + if (x == 0) { + /* Loop over the rows present below */ for (i = (l + 1U); i < m; i++) { - /* Update the input and destination pointers */ pSrcT2 = pSrcT1 + (n * i); pDstT2 = pDstT1 + (n * k); - /* Check if there is a non zero pivot element to - * replace in the rows below */ + /* Check if there is a non zero pivot element to replace in the rows below */ if (*pSrcT2 != 0) { - /* Loop over number of columns - * to the right of the pilot element */ + /* Exchange the row elements of the input matrix at the right of the pivot */ j = n - l; - while (j > 0U) { - /* Exchange the row elements of the input matrix */ Xchg = *pSrcT2; *pSrcT2++ = *pSrcT1; *pSrcT1++ = Xchg; - - /* Decrement the loop counter */ j--; } - - /* Loop over number of columns of the destination matrix */ + /* Exchange the row elements of the destination matrix */ j = n; - while (j > 0U) { - /* Exchange the row elements of the destination matrix */ Xchg = *pDstT2; *pDstT2++ = *pDstT1; *pDstT1++ = Xchg; - - /* Decrement loop counter */ j--; } - - /* Flag to indicate whether exchange is done or not */ flag = 1U; - - /* Break after exchange is done */ break; } - - /* Update the destination pointer modifier */ k++; - - /* Decrement loop counter */ } } - - /* Update the status if the matrix is singular */ - if ((flag != 1U) && (in == 0)) { + /* Return when the matrix is singular */ + if ((flag != 1U) && (x == 0)) { return 1; } + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and 
destination matrices */ pPivotRowIn = pSrc + (l * n); pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ pSrcT1 = pPivotRowIn; pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; + x = *pPivotRowIn; - /* Loop over number of columns - * to the right of the pilot element */ + /* Loop over number of columns to the right of the pilot element */ j = (n - l); - while (j > 0U) { - /* Divide each element of the row of the input matrix - * by the pivot element */ - in1 = *pSrcT1; - *pSrcT1++ = in1 / in; - - /* Decrement the loop counter */ + y = *pSrcT1; + *pSrcT1++ = FIX_DIV(y, x); j--; } - /* Loop over number of columns of the destination matrix */ j = n; - while (j > 0U) { - /* Divide each element of the row of the destination matrix - * by the pivot element */ - in1 = *pSrcT2; - *pSrcT2++ = in1 / in; - - /* Decrement the loop counter */ + y = *pSrcT2; + *pSrcT2++ = FIX_DIV(y, x); j--; } + /* SUM THE MULTIPLE OF A BOTTOM ROW */ /* Replace the rows with the sum of that row and a multiple of row i * so that each new element in column i above row i is zero.*/ - /* Temporary pointers for input and destination matrices */ + pSrcT1 = pSrc; pSrcT2 = pDst; - /* index used to check for pivot element */ - i = 0U; - - /* Loop over number of rows */ - /* to be replaced by the sum of that row and a multiple of row i */ - k = m; - + i = 0U; /* pivot index */ + k = m; /* row index */ while (k > 0U) { - /* Check for the pivot element */ + + /* Only the columns to the right of the pivot are to be processed */ if (i == l) { - /* If the processing element is the pivot element, - only the columns to the right are to be processed */ pSrcT1 += n - l; - pSrcT2 += n; + } else { - /* Element of the reference row */ - in = *pSrcT1; - /* Working pointers for input and destination pivot rows */ + /* Element of the reference row */ + x = *pSrcT1; + /* Reference row pointers */ pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; - /* Loop over 
the number of columns to the right of the pivot element, - to replace the elements in the input matrix */ - j = (n - l); - + j = (n - l); /* Replace the elements to the right of the pivot */ while (j > 0U) { - /* Replace the element by the sum of that row - and a multiple of the reference row */ - in1 = *pSrcT1; - *pSrcT1++ = in1 - (in * *pPRT_in++); - - /* Decrement the loop counter */ + y = *pSrcT1; + *pSrcT1++ = y - FIX_MUL(x, *pPRT_in++); j--; } - - /* Loop over the number of columns to - replace the elements in the destination matrix */ - j = n; - + j = n; /* Replace the elements in the destination matrix */ while (j > 0U) { - /* Replace the element by the sum of that row - and a multiple of the reference row */ - in1 = *pSrcT2; - *pSrcT2++ = in1 - (in * *pPRT_pDst++); - - /* Decrement loop counter */ + y = *pSrcT2; + *pSrcT2++ = y - FIX_MUL(x, *pPRT_pDst++); j--; } } - /* Increment temporary input pointer */ pSrcT1 = pSrcT1 + l; - /* Decrement loop counter */ k--; - /* Increment pivot index */ i++; } - /* Increment the input pointer */ - pSrc++; - - /* Decrement the loop counter */ - loopCnt--; - - /* Increment the index modifier */ - l++; + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ } - if ((flag != 1U) && (in == 0)) { - for (i = 0; i < m * n; i++) { - if (pSrc[i] != 0) - break; - } - - if (i == m * n) - return 1; - } +// if ((flag != 1U) && (x == 0)) { +// for (i = 0; i < m * n; i++) { +// if (pSrc[i] != 0) +// break; +// } +// if (i == m * n) +// return 1; +// } return 0; } diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c index 587ee06b0..f1f771d89 100644 --- a/software/apps/MP_matrix_inverse/main.c +++ b/software/apps/MP_matrix_inverse/main.c @@ -7,7 +7,7 @@ //#include //#include -#define N 5 +#define N 6 #include "encoding.h" #include "printf.h" @@ -17,18 +17,18 @@ #include "inverse.h" #include "initialization.h" -// C++ 
program to find Moore-Penrose inverse matrix - +#define GAUSS_JORDAN +// #define CRAMER + // Generic function to display the matrix. We use it to display // both adjoin and inverse. adjoin is integer matrix and inverse // is a int32_t. -void display(int32_t *A, int32_t n) -{ - for (int i = 0; i < n; i++) - { - for (int j = 0; j < n; j++) - printf("%4d ", A[i * n + j]); - printf("\n"); +void display(int32_t *A, int32_t n) { + int32_t volatile i = 0; + while (i < n * n) { + // printf("ciao mamma\n"); + printf("Value %d: %d\n", i, A[i]); + i++; } } @@ -41,40 +41,47 @@ int main() // Initialize barrier and synchronize mempool_barrier_init(core_id); - int32_t matrix[N * N] = { -2, 2, 7, 9, 4, 0, 8, - 1, 0, 0, 3, 1, 0, 9, - -3, 1, 5, 0, 2, 1, 7, - 3,-1,-9, 4, 6, 5, 2, - 1, 0, 4, 4, 1, 0, 9, - 8, 0, 3, 8, 6, 5, 2, - 5, 6, 4, 1, 3, 2, 0 }; +// int32_t matrix[N * N] = { -2, 2, 7, 9, 4, 0, 8, +// 1, 0, 0, 3, 1, 0, 9, +// -3, 1, 5, 0, 2, 1, 7, +// 3,-1,-9, 4, 6, 5, 2, +// 1, 0, 4, 4, 1, 0, 9, +// 8, 0, 3, 8, 6, 5, 2, +// 5, 6, 4, 1, 3, 2, 0 }; int32_t t_matrix[N * N]; int32_t matrix_mult[N * N]; int32_t pseudoinverse[N * N]; int32_t inv[N * N]; // To store inverse -// init_matrix_zeros(t_matrix, N, N, core_id, num_cores); -// init_matrix_zeros(matrix_mult, N, N, core_id, num_cores); -// init_matrix_zeros(pseudoinverse, N, N, core_id, num_cores); -// init_matrix_zeros(adj, N, N, core_id, num_cores); -// init_matrix_zeros(inv, N, N, core_id, num_cores); - if(core_id == 0) - display(matrix, N); + int32_t matrix[N * N]; + init_matrix(matrix, N, N, -125, 2423, -1294, core_id); + init_matrix_zeros(t_matrix, N, N, core_id); + init_matrix_zeros(matrix_mult, N, N, core_id); + init_matrix_zeros(pseudoinverse, N, N, core_id); + init_matrix_zeros(inv, N, N, core_id); if(core_id == 0) { + + //display(matrix, N); Transpose(matrix, t_matrix, N); - printf("\nThe Transpose is :\n"); - display(t_matrix, N); - printf("The product of the matrix is: \n"); + //printf("\nThe Transpose is :\n"); + 
//display(t_matrix, N); MatrixMult(t_matrix,matrix,matrix_mult, N); - display(matrix_mult, N); - printf("\nThe Inverse is :\n"); - if (inverse(matrix_mult, inv, N)) - display(inv, N); + //printf("The product of the matrix is: \n"); + //display(matrix_mult, N); + //printf("\nThe Inverse is :\n"); + #if defined(CRAMER) + if (C_inverse(matrix_mult, inv, N)) + //display(inv, N); + #elif defined(GAUSS_JORDAN) + GJ_inverse(matrix_mult, inv, N); + //display(inv, N); + #endif MatrixMult(inv,t_matrix,pseudoinverse, N); - printf("\nThe Monroe-penrose inverse is :\n"); - display(pseudoinverse, N); + //printf("\nThe Moore-Penrose inverse is :\n"); + //display(pseudoinverse, N); + } mempool_barrier(num_cores); diff --git a/software/runtime/serial.c b/software/runtime/serial.c index a53ec2e1f..44aa30fe2 100644 --- a/software/runtime/serial.c +++ b/software/runtime/serial.c @@ -4,7 +4,7 @@ #include -extern char fake_uart; +extern volatile char fake_uart; void _putchar(char character) { // send char to console From 157790692503799418e16c14bec49e8a61091a86 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Mon, 18 Jul 2022 15:30:26 +0200 Subject: [PATCH 04/22] [software] Allow use of input N by M rectangular matrices --- .../apps/MP_matrix_inverse/initialization.h | 17 ++- software/apps/MP_matrix_inverse/inverse.h | 58 +++++----- software/apps/MP_matrix_inverse/main.c | 105 ++++++++++-------- 3 files changed, 99 insertions(+), 81 deletions(-) diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h index 9046f4be9..2ba789234 100644 --- a/software/apps/MP_matrix_inverse/initialization.h +++ b/software/apps/MP_matrix_inverse/initialization.h @@ -4,30 +4,29 @@ // Author: Marco Bertuletti, ETH Zurich -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, +void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id); -void 
init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns, +void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, +void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id) { if(core_id == 0) { - for(uint32_t i = 0; i < num_columns; i++) { - for(uint32_t j = 0; j < num_rows; j++) { - matrix[j * num_rows + i] = a * (int32_t)i + b * (int32_t)j + c; + for(uint32_t j = 0; j < num_rows; j++) { + for(uint32_t i = 0; i < num_columns; i++) { + matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c; } } } } -void init_matrix_zeros ( int32_t *matrix, uint32_t num_rows, uint32_t num_columns, +void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id) { - if(core_id == 0) { for(uint32_t i = 0; i < num_columns; i++) { for(uint32_t j = 0; j < num_rows; j++) { - matrix[j * num_rows + i] = 0; + matrix[j * num_columns + i] = 0; } } } diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/inverse.h index cb98aadac..4d230c422 100644 --- a/software/apps/MP_matrix_inverse/inverse.h +++ b/software/apps/MP_matrix_inverse/inverse.h @@ -4,53 +4,55 @@ // Author: Marco Bertuletti, ETH Zurich -#define FIXED_POINT 0 -#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT)/b)) -#define FIX_MUL(a,b) ((int32_t)((a*b) >> FIXED_POINT)) +#define FIXED_POINT 16 +#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b)) +#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT)) dump(prova, 1); -void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n); +void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m); -void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n); +void MatrixMult(volatile int32_t *matrix_1, 
volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o); -void getCofactor(int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n); +void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n); -int32_t determinant(int32_t *A, int32_t n); +int32_t determinant(volatile int32_t *A, int32_t n); -void adjoint(int32_t *A,int32_t *adj, int32_t n); +void adjoint(volatile int32_t *A,int32_t *adj, int32_t n); -int32_t C_inverse(int32_t *A, int32_t *inverse, int32_t n); +int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n); -int GJ_inverse(int32_t *pSrc, int32_t *pDst, uint32_t n); +int GJ_inverse(volatile int32_t *pSrc, volatile int32_t *pDst, uint32_t n); -void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) { - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - t_matrix[j * n + i]=matrix[i * n + j]; +void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m) { + int32_t i, j; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + t_matrix[j * n + i] = matrix[i * m + j]; } } } -void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n) { - int k; - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { // not j //#include -#define N 6 +#define N 32 +#define M 4 +#define O 4 #include "encoding.h" #include "printf.h" @@ -19,19 +21,33 @@ #define GAUSS_JORDAN // #define CRAMER +// #define VERBOSE // Generic function to display the matrix. We use it to display // both adjoin and inverse. adjoin is integer matrix and inverse // is a int32_t. 
-void display(int32_t *A, int32_t n) { - int32_t volatile i = 0; - while (i < n * n) { - // printf("ciao mamma\n"); - printf("Value %d: %d\n", i, A[i]); - i++; +void display(volatile int32_t *A, int32_t n, int32_t m) { + //int32_t volatile i = 0; + //while (i < n * m) { + // // printf("ciao mamma\n"); + // printf("Value %d: %d\n", i, A[i]); + // i++; + //} + int32_t volatile i, j; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + printf("%5d ", A[i * m + j]); + } + printf("\n"); } } +volatile int32_t matrix[N * M]; +volatile int32_t t_matrix[M * N]; +volatile int32_t matrix_mult[M * M]; +volatile int32_t inv[M * M]; // To store inverse +volatile int32_t pseudoinverse[M * N]; + // Driver program int main() { @@ -41,47 +57,48 @@ int main() // Initialize barrier and synchronize mempool_barrier_init(core_id); -// int32_t matrix[N * N] = { -2, 2, 7, 9, 4, 0, 8, -// 1, 0, 0, 3, 1, 0, 9, -// -3, 1, 5, 0, 2, 1, 7, -// 3,-1,-9, 4, 6, 5, 2, -// 1, 0, 4, 4, 1, 0, 9, -// 8, 0, 3, 8, 6, 5, 2, -// 5, 6, 4, 1, 3, 2, 0 }; - - int32_t t_matrix[N * N]; - int32_t matrix_mult[N * N]; - int32_t pseudoinverse[N * N]; - int32_t inv[N * N]; // To store inverse - - int32_t matrix[N * N]; - init_matrix(matrix, N, N, -125, 2423, -1294, core_id); - init_matrix_zeros(t_matrix, N, N, core_id); - init_matrix_zeros(matrix_mult, N, N, core_id); - init_matrix_zeros(pseudoinverse, N, N, core_id); - init_matrix_zeros(inv, N, N, core_id); + init_matrix(matrix,N, M, -156, 427, -219, core_id); + init_matrix_zeros(t_matrix, M, N, core_id); + init_matrix_zeros(matrix_mult, M, M, core_id); + init_matrix_zeros(inv, M, M, core_id); + init_matrix_zeros(pseudoinverse, M, N, core_id); if(core_id == 0) { - //display(matrix, N); - Transpose(matrix, t_matrix, N); - //printf("\nThe Transpose is :\n"); - //display(t_matrix, N); - MatrixMult(t_matrix,matrix,matrix_mult, N); - //printf("The product of the matrix is: \n"); - //display(matrix_mult, N); - //printf("\nThe Inverse is :\n"); - #if defined(CRAMER) - 
if (C_inverse(matrix_mult, inv, N)) - //display(inv, N); - #elif defined(GAUSS_JORDAN) - GJ_inverse(matrix_mult, inv, N); - //display(inv, N); + #if defined(VERBOSE) + display(matrix, N, M); + Transpose(matrix, t_matrix, N, M); + printf("\nThe Transpose is :\n"); + display(t_matrix, M, N); + MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); + printf("The product of the matrix is: \n"); + display(matrix_mult, M, M); + printf("\nThe Inverse is :\n"); + #if defined(CRAMER) + if (C_inverse(matrix_mult, inv, N)); + display(inv, N, N); + #elif defined(GAUSS_JORDAN) + GJ_inverse(matrix_mult, inv, N); + display(inv, N, N); + #endif + MatrixMult(t_matrix, inv, pseudoinverse, M, N, N); + printf("\nThe Moore-Penrose inverse is :\n"); + display(pseudoinverse, M, N); + #else + Transpose(matrix, t_matrix, N, M); + MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); + mempool_start_benchmark(); + #if defined(CRAMER) + C_inverse(matrix_mult, inv, M); + #elif defined(GAUSS_JORDAN) + GJ_inverse(matrix_mult, inv, M); + #endif + mempool_stop_benchmark(); + MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); + + MatrixMult(pseudoinverse, matrix, inv, M, N, M); + display(inv, M, M); #endif - MatrixMult(inv,t_matrix,pseudoinverse, N); - //printf("\nThe Moore-Penrose inverse is :\n"); - //display(pseudoinverse, N); - } mempool_barrier(num_cores); From 21ded46cf92d217f1dfd432c190480979f2bb59c Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Tue, 19 Jul 2022 08:51:09 +0200 Subject: [PATCH 05/22] [software] Parallelize Gauss-Jordan matrix inversion --- .../apps/MP_matrix_inverse/initialization.h | 47 ++++- software/apps/MP_matrix_inverse/main.c | 106 +++++++---- .../MP_matrix_inverse/mempool_mat_inv_q16p.h | 166 ++++++++++++++++ .../{inverse.h => mempool_mat_inv_q16s.h} | 178 +++--------------- 4 files changed, 292 insertions(+), 205 deletions(-) create mode 100644 software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h rename software/apps/MP_matrix_inverse/{inverse.h => 
mempool_mat_inv_q16s.h} (51%) diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h index 2ba789234..1d9d6396f 100644 --- a/software/apps/MP_matrix_inverse/initialization.h +++ b/software/apps/MP_matrix_inverse/initialization.h @@ -4,14 +4,44 @@ // Author: Marco Bertuletti, ETH Zurich -void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id); +#define FIXED_POINT 16 +#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b)) +#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT)) -void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id); +dump(flag, 1); -void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id) { + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m); + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n, int32_t m, int32_t o); + +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id); + +void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id); + + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) { + int32_t i, j; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + t_matrix[j * n + i] = matrix[i * m + j]; + } + } +} + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n, int32_t m, int32_t o) { + int32_t i, j, k; + for (i = 0; i < n; i++) { + for (j = 0; j < o; j++) { + matrix_product[i * o + j] = 0; + for (k = 0; k < m; k++) { + matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]); + } + } + } +} + +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t 
core_id) { if(core_id == 0) { for(uint32_t j = 0; j < num_rows; j++) { for(uint32_t i = 0; i < num_columns; i++) { @@ -21,8 +51,8 @@ void init_matrix(int32_t volatile *matrix, uint32_t num_rows, uint32_t num_colum } } -void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id) { + +void init_matrix_zeros (int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id) { if(core_id == 0) { for(uint32_t i = 0; i < num_columns; i++) { for(uint32_t j = 0; j < num_rows; j++) { @@ -30,5 +60,4 @@ void init_matrix_zeros ( int32_t volatile *matrix, uint32_t num_rows, uint32_t n } } } - } diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/MP_matrix_inverse/main.c index bb076ecd2..4cb660e23 100644 --- a/software/apps/MP_matrix_inverse/main.c +++ b/software/apps/MP_matrix_inverse/main.c @@ -7,7 +7,7 @@ //#include //#include -#define N 32 +#define N 4 #define M 4 #define O 4 @@ -16,24 +16,29 @@ #include "runtime.h" #include "synchronization.h" -#include "inverse.h" #include "initialization.h" +#include "mempool_mat_inv_q16s.h" +#include "mempool_mat_inv_q16p.h" -#define GAUSS_JORDAN -// #define CRAMER // #define VERBOSE +// #define SINGLE +#define PARALLEL -// Generic function to display the matrix. We use it to display -// both adjoin and inverse. adjoin is integer matrix and inverse -// is a int32_t. 
-void display(volatile int32_t *A, int32_t n, int32_t m) { +int32_t matrix[N * M] __attribute__((section(".l1"))); +int32_t t_matrix[M * N] __attribute__((section(".l1"))); +int32_t matrix_mult[M * M] __attribute__((section(".l1"))); +int32_t inv[M * M] __attribute__((section(".l1"))); +int32_t pseudoinverse[M * N] __attribute__((section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); + +void display(int32_t *A, int32_t n, int32_t m) { //int32_t volatile i = 0; //while (i < n * m) { // // printf("ciao mamma\n"); // printf("Value %d: %d\n", i, A[i]); // i++; //} - int32_t volatile i, j; + int32_t i, j; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { printf("%5d ", A[i * m + j]); @@ -42,14 +47,8 @@ void display(volatile int32_t *A, int32_t n, int32_t m) { } } -volatile int32_t matrix[N * M]; -volatile int32_t t_matrix[M * N]; -volatile int32_t matrix_mult[M * M]; -volatile int32_t inv[M * M]; // To store inverse -volatile int32_t pseudoinverse[M * N]; - // Driver program -int main() +void single_core() { uint32_t core_id = mempool_get_core_id(); @@ -57,14 +56,14 @@ int main() // Initialize barrier and synchronize mempool_barrier_init(core_id); - init_matrix(matrix,N, M, -156, 427, -219, core_id); - init_matrix_zeros(t_matrix, M, N, core_id); - init_matrix_zeros(matrix_mult, M, M, core_id); - init_matrix_zeros(inv, M, M, core_id); - init_matrix_zeros(pseudoinverse, M, N, core_id); + init_matrix(matrix, N, M, -156, 427, -219, core_id); + //init_matrix_zeros(t_matrix, M, N, core_id); + //init_matrix_zeros(matrix_mult, M, M, core_id); + //init_matrix_zeros(inv, M, M, core_id); + //init_matrix_zeros(pseudoinverse, M, N, core_id); + mempool_barrier(num_cores); if(core_id == 0) { - #if defined(VERBOSE) display(matrix, N, M); Transpose(matrix, t_matrix, N, M); @@ -74,33 +73,62 @@ int main() printf("The product of the matrix is: \n"); display(matrix_mult, M, M); printf("\nThe Inverse is :\n"); - #if defined(CRAMER) - if (C_inverse(matrix_mult, inv, N)); - 
display(inv, N, N); - #elif defined(GAUSS_JORDAN) - GJ_inverse(matrix_mult, inv, N); - display(inv, N, N); - #endif + mempool_mat_inv_q16s(matrix_mult, inv, N); + display(inv, N, N); MatrixMult(t_matrix, inv, pseudoinverse, M, N, N); printf("\nThe Moore-Penrose inverse is :\n"); display(pseudoinverse, M, N); #else - Transpose(matrix, t_matrix, N, M); - MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); + //Transpose(matrix, t_matrix, N, M); + //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); + mempool_start_benchmark(); - #if defined(CRAMER) - C_inverse(matrix_mult, inv, M); - #elif defined(GAUSS_JORDAN) - GJ_inverse(matrix_mult, inv, M); - #endif + mempool_GJinv_q16s(matrix, inv, M); mempool_stop_benchmark(); - MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); - MatrixMult(pseudoinverse, matrix, inv, M, N, M); - display(inv, M, M); + //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); + //MatrixMult(pseudoinverse, matrix, inv, M, N, M); #endif } + mempool_barrier(num_cores); +} + +void multi_core() +{ + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + init_matrix(matrix, N, M, -156, 427, -219, core_id); + if (core_id == 0) { + flag = 0U; + } + //init_matrix_zeros(t_matrix, M, N, core_id); + //init_matrix_zeros(matrix_mult, M, M, core_id); + //init_matrix_zeros(inv, M, M, core_id); + //init_matrix_zeros(pseudoinverse, M, N, core_id); mempool_barrier(num_cores); + + //Transpose(matrix, t_matrix, N, M); + //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); + + mempool_start_benchmark(); + mempool_GJinv_q16p(matrix, inv, M, &flag); + mempool_stop_benchmark(); + + //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); + //MatrixMult(pseudoinverse, matrix, inv, M, N, M); + + mempool_barrier(num_cores); +} + +int main() { + #if defined(SINGLE) + single_core(); + #elif defined(PARALLEL) + multi_core(); + #endif return 0; } diff --git 
a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h new file mode 100644 index 000000000..c40bafe5b --- /dev/null +++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h @@ -0,0 +1,166 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* GAUSS JORDAN INVERSION */ + +int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); + +int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { + + int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t Xchg, in = 0, in1; /* Temporary input values */ + + uint32_t core_id = mempool_get_core_id(); + uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ + + /* CREATE THE IDENTITY MATRIX */ + + pDstT1 = pDst; /* Working pointer for destination matrix */ + for (i = core_id; i < m; i += NUM_CORES) { + for (j = 0; j < m; j++) { + pDstT1[i * m + j] = (uint32_t) (i == j); + } + } + mempool_barrier(NUM_CORES); + + /* Loop over the number of columns of the input matrix. 
*/ + loopCnt = n; + /* Index modifier to navigate through the columns */ + l = 0U; + + while (loopCnt > 0U) { + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + + in = *pSrcT1; + k = 1U; + /* Check if the pivot element is zero */ + if (*pSrcT1 == 0U) { + + /* Loop over the number rows present below */ + for (i = (l + 1U) + core_id; i < m; i += NUM_CORES) { + pSrcT2 = pSrcT1 + (n * i); + /* Check if there is element to exchange */ + //if (*flag != 0U) + // break; + if (*pSrcT2 != 0U) { + __atomic_fetch_add(flag, k, __ATOMIC_RELAXED); + } + } + mempool_barrier(NUM_CORES); + + if (*flag != 0U) { + pSrcT2 = pSrcT1 + (n * *flag + l); + pDstT2 = pDstT1 + (n * *flag); + /* Loop over number of columns + * to the right of the pilot element */ + for (j = core_id; j < n - l; j += NUM_CORES) { + /* Exchange the row elements of the input matrix */ + Xchg = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg; + } + pSrcT1 += n - l; + pSrcT2 += n - l; + /* Loop over number of columns of the destination matrix */ + for(j = core_id; j < n; j += NUM_CORES) { + /* Exchange the row elements of the destination matrix */ + Xchg = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg; + } + pDstT2 += n; + pDstT1 += n; + } + k++; + mempool_barrier(NUM_CORES); + } + + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + + /* DIVIDE BY THE PIVOT */ + + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + /* Loop over number of columns to the right of the pilot element */ + for(j = core_id; j < n - l; j += NUM_CORES) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + } + /* Loop over number of columns of the destination matrix */ + for(j = core_id; j < n; 
j += NUM_CORES) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + } + mempool_barrier(NUM_CORES); + + /*REPLACE ROWS */ + + pSrcT1 = pSrc + core_id * n; + pSrcT2 = pDst + core_id * n; + i = core_id; + k = m; + for(k = core_id; k < m; k += NUM_CORES) { + if (i != l) { + /* Element of the reference row */ + in = *pSrcT1; + /* Working pointers for input and destination pivot rows */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the number of columns to the right of the pivot element, + to replace the elements in the input matrix */ + for (j = 0; j < n - l; j++) { + in1 = pSrcT1[j]; + pSrcT1[j] = in1 - FIX_MUL(in, pPRT_in[j]); + } + /* Loop over the number of columns to + replace the elements in the destination matrix */ + for (j = 0; j < n; j++) { + in1 = pSrcT2[j]; + pSrcT2[j] = in1 - FIX_MUL(in, pPRT_pDst[j]); + } + } + i += NUM_CORES; + pSrcT1 += NUM_CORES * n; + pSrcT2 += NUM_CORES * n; + } + /* Increment the input pointer */ + pSrc++; + /* Decrement the loop counter */ + loopCnt--; + /* Increment the index modifier */ + l++; + mempool_barrier(NUM_CORES); + } + +// if ((flag != 1U) && (x == 0)) { +// for (i = 0; i < m * n; i++) { +// if (pSrc[i] != 0) +// break; +// } +// if (i == m * n) +// return 1; +// } + + return 0; +} diff --git a/software/apps/MP_matrix_inverse/inverse.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h similarity index 51% rename from software/apps/MP_matrix_inverse/inverse.h rename to software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h index 4d230c422..9f2201224 100644 --- a/software/apps/MP_matrix_inverse/inverse.h +++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h @@ -4,160 +4,25 @@ // Author: Marco Bertuletti, ETH Zurich -#define FIXED_POINT 16 -#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b)) -#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT)) - -dump(prova, 1); - -void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m); - -void 
MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o); - - - -void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n); - -int32_t determinant(volatile int32_t *A, int32_t n); - -void adjoint(volatile int32_t *A,int32_t *adj, int32_t n); - -int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n); - - - -int GJ_inverse(volatile int32_t *pSrc, volatile int32_t *pDst, uint32_t n); - -void Transpose(volatile int32_t *matrix, volatile int32_t *t_matrix, int32_t n, int32_t m) { - int32_t i, j; - for (i = 0; i < n; i++) { - for (j = 0; j < m; j++) { - t_matrix[j * n + i] = matrix[i * m + j]; - } - } -} -void MatrixMult(volatile int32_t *matrix_1, volatile int32_t *matrix_2, volatile int32_t *matrix_product, int32_t n, int32_t m, int32_t o) { - int32_t i, j, k; - for (i = 0; i < n; i++) { - for (j = 0; j < o; j++) { - matrix_product[i * o + j] = 0; - for (k = 0; k < m; k++) { - matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]); - } - } - } - -} - -/* CRAMER MATRIX INVERSION */ - -// Function to get cofactor -void getCofactor(volatile int32_t *A, int32_t *temp, int32_t p, int32_t q, int32_t n) { - int32_t i = 0, j = 0; - // Looping for each element of the matrix - for (int32_t row = 0; row < n; row++) { - for (int32_t col = 0; col < n; col++) { - // Copying into temporary matrix only those element - // which are not in given row and column - if (row != p && col != q) { - temp[i * N + j++] = A[row * N + col]; - // Row is filled, so increase row index and - // reset col index - if (j == n - 1) { - j = 0; - i++; - } - } - } - } -} - -// Recursive function for finding determinant of matrix. 
-int32_t determinant(volatile int32_t *A, int32_t n) { - - int32_t D = 0; // Initialize result - // Base case : if matrix contains single element - if (n == 1) - return A[0]; - - int32_t temp[N * N]; // To store cofactors - for(int32_t i =0; i < N*N; i++) - temp[i] = 0; - - int32_t sign = 1; // To store sign multiplier - // Iterate for each element of first row - for (int32_t f = 0; f < n; f++) { - - // Getting Cofactor of A[0][f] - getCofactor(A, temp, 0, f, n); - - D += sign * A[0 * N + f] * determinant(temp, n - 1); - // terms are to be added with alternate sign - sign = -sign; - } - - return D; -} - -// Function to get adjoint -void adjoint(volatile int32_t *A,int32_t *adj, int32_t n) { - if (n == 1) { - adj[0] = 1; - return; - } - // temp is used to store cofactors - int32_t sign = 1; - int32_t temp[N * N]; - for (int32_t i = 0; i < N; i++) { - for (int32_t j = 0; j < N; j++) { - // Get cofactor - getCofactor(A, temp, i, j, N); - // sign of adj positive if sum of row - // and column indexes is even. - sign = ((i + j) % 2 == 0) ? 
1 : -1; - // Interchanging rows and columns to get the - // transpose of the cofactor matrix - adj[j * N + i] = (sign)*(determinant(temp, N - 1)); - } - } -} - -// Function to calculate and store inverse, returns false if -// matrix is singular -int32_t C_inverse(volatile int32_t *A, int32_t *inverse, int32_t n) { - // Find determinant of A[][] - int32_t det = determinant(A, n); - if (det == 0) { - printf("Singular matrix, can't find its inverse\n"); - return 0; - } - - // Find adjoint - int32_t adj[n * n]; - adjoint(A, adj, n); - - // Find Inverse using formula "inverse(A) = adj(A)/det(A)" - for (int32_t i = 0; i < n; i++) - for (int32_t j = 0; j < n; j++) - inverse[i * n + j]= FIX_DIV(adj[i * n + j], det); - return 1; -} - /* GAUSS JORDAN INVERSION */ -int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { +int mempool_GJinv_q16s(int32_t *pSrc, int32_t *pDst, uint32_t n); + +int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t Xchg, x = 0, y; /* Temporary input values */ - uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */ + int32_t Xchg, in = 0, in1; /* Temporary input values */ + uint32_t i, rowCnt, j, loopCnt, k, l; /* loop counters */ + uint32_t flag; uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ pDstT1 = pDst; /* Working pointer for destination matrix */ rowCnt = m; /* Loop over the number of rows */ + flag = 0U; /* CREATE THE IDENTITY MATRIX */ while (rowCnt > 0U) { @@ -181,7 +46,6 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { l = 0U; while (loopCnt > 0U) { - /* CHECK IF PIVOT ELEMENT IS ZERO... 
* If it is zero then interchange the row with non zero row below. * If there is no non zero element to replace in the rows below, @@ -189,9 +53,8 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); - x = *pSrcT1; k = 1U; - if (x == 0) { + if (*pSrcT1 == 0) { /* Loop over the rows present below */ for (i = (l + 1U); i < m; i++) { pSrcT2 = pSrcT1 + (n * i); @@ -222,10 +85,11 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { } } /* Return when the matrix is singular */ - if ((flag != 1U) && (x == 0)) { + if ((flag == 0U) && (in == 0)) { return 1; } + /* DIVIDE BY THE PIVOT */ /* Points to the pivot row of input and destination matrices */ @@ -235,20 +99,20 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { pSrcT1 = pPivotRowIn; pSrcT2 = pPivotRowDst; /* Pivot element of the row */ - x = *pPivotRowIn; + in = *pPivotRowIn; /* Loop over number of columns to the right of the pilot element */ j = (n - l); while (j > 0U) { - y = *pSrcT1; - *pSrcT1++ = FIX_DIV(y, x); + in1 = *pSrcT1; + *pSrcT1++ = FIX_DIV(in1, in); j--; } /* Loop over number of columns of the destination matrix */ j = n; while (j > 0U) { - y = *pSrcT2; - *pSrcT2++ = FIX_DIV(y, x); + in1 = *pSrcT2; + *pSrcT2++ = FIX_DIV(in1, in); j--; } @@ -272,21 +136,21 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { } else { /* Element of the reference row */ - x = *pSrcT1; + in = *pSrcT1; /* Reference row pointers */ pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; j = (n - l); /* Replace the elements to the right of the pivot */ while (j > 0U) { - y = *pSrcT1; - *pSrcT1++ = y - FIX_MUL(x, *pPRT_in++); + in1 = *pSrcT1; + *pSrcT1++ = in1 - FIX_MUL(in, *pPRT_in++); j--; } j = n; /* Replace the elements in the destination matrix */ while (j > 0U) { - y = *pSrcT2; - *pSrcT2++ = y - FIX_MUL(x, *pPRT_pDst++); + in1 = *pSrcT2; + *pSrcT2++ = in1 - 
FIX_MUL(in, *pPRT_pDst++); j--; } } @@ -303,7 +167,7 @@ int GJ_inverse(volatile int32_t * pSrc, volatile int32_t * pDst, uint32_t n) { l++; /* Increment the index modifier */ } -// if ((flag != 1U) && (x == 0)) { +// if ((flag != 1U) && (in == 0)) { // for (i = 0; i < m * n; i++) { // if (pSrc[i] != 0) // break; From faeca5086e77ed8a96a3e7bec043f8f581d93432 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Tue, 19 Jul 2022 18:09:48 +0200 Subject: [PATCH 06/22] [software] Unroll single core --- .../MP_matrix_inverse/mempool_mat_inv_q16s.h | 167 ++++++++++++++---- 1 file changed, 137 insertions(+), 30 deletions(-) diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h index 9f2201224..e217119cd 100644 --- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h +++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h @@ -15,11 +15,15 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t Xchg, in = 0, in1; /* Temporary input values */ + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + uint32_t i, rowCnt, j, loopCnt, k, l; /* loop counters */ uint32_t flag; - uint32_t m = n; /* M is the number of rows. However, the matirces must be square. 
*/ + pDstT1 = pDst; /* Working pointer for destination matrix */ rowCnt = m; /* Loop over the number of rows */ flag = 0U; @@ -54,29 +58,69 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); k = 1U; - if (*pSrcT1 == 0) { + + in = *pSrcT1; + if (in == 0) { /* Loop over the rows present below */ for (i = (l + 1U); i < m; i++) { - pSrcT2 = pSrcT1 + (n * i); + pSrcT2 = pSrc + (n * i); pDstT2 = pDstT1 + (n * k); /* Check if there is a non zero pivot element to replace in the rows below */ if (*pSrcT2 != 0) { /* Exchange the row elements of the input matrix at the right of the pivot */ - j = n - l; - while (j > 0U) { - Xchg = *pSrcT2; - *pSrcT2++ = *pSrcT1; - *pSrcT1++ = Xchg; - j--; + j = 0; + while (j < (n - l) - (n - l) % 4) { + Xchg1 = *(pSrcT2); + Xchg2 = *(pSrcT2 + 1); + Xchg3 = *(pSrcT2 + 2); + Xchg4 = *(pSrcT2 + 3); + out1 = *(pSrcT1); + out2 = *(pSrcT1 + 1); + out3 = *(pSrcT1 + 2); + out4 = *(pSrcT1 + 3); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + *pSrcT1++ = Xchg1; + *pSrcT1++ = Xchg2; + *pSrcT1++ = Xchg3; + *pSrcT1++ = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = *pSrcT2; + *pSrcT2++ = *pSrcT1; + *pSrcT1++ = Xchg1; + j++; } /* Exchange the row elements of the destination matrix */ - j = n; - while (j > 0U) { - Xchg = *pDstT2; + j = 0; + while (j < n - n % 4) { + Xchg1 = *(pDstT2); + Xchg2 = *(pDstT2 + 1); + Xchg3 = *(pDstT2 + 2); + Xchg4 = *(pDstT2 + 3); + out1 = *(pDstT1); + out2 = *(pDstT1 + 1); + out3 = *(pDstT1 + 2); + out4 = *(pDstT1 + 3); + *pDstT2++ = out1; + *pDstT2++ = out2; + *pDstT2++ = out3; + *pDstT2++ = out4; + *pDstT1++ = Xchg1; + *pDstT1++ = Xchg2; + *pDstT1++ = Xchg3; + *pDstT1++ = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = *pDstT2; *pDstT2++ = *pDstT1; - *pDstT1++ = Xchg; - j--; + *pDstT1++ = Xchg1; + j++; } flag = 1U; break; @@ -102,20 +146,49 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { in = 
*pPivotRowIn; /* Loop over number of columns to the right of the pilot element */ - j = (n - l); - while (j > 0U) { + j = 0; + while (j < (n - l) - (n - l) % 4) { + in1 = *pSrcT1; + in2 = *(pSrcT1 + 1); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT1++ = out1; + *pSrcT1++ = out2; + *pSrcT1++ = out3; + *pSrcT1++ = out4; + j += 4; + } + while (j < n - l) { in1 = *pSrcT1; *pSrcT1++ = FIX_DIV(in1, in); - j--; + j++; } /* Loop over number of columns of the destination matrix */ - j = n; - while (j > 0U) { + j = 0; + while (j < n - n % 4) { + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + j += 4; + } + while (j < n) { in1 = *pSrcT2; *pSrcT2++ = FIX_DIV(in1, in); - j--; + j++; } - /* SUM THE MULTIPLE OF A BOTTOM ROW */ /* Replace the rows with the sum of that row and a multiple of row i * so that each new element in column i above row i is zero.*/ @@ -141,17 +214,51 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; - j = (n - l); /* Replace the elements to the right of the pivot */ - while (j > 0U) { + j = 0; + while (j < (n - l) - (n - l) % 2) { in1 = *pSrcT1; - *pSrcT1++ = in1 - FIX_MUL(in, *pPRT_in++); - j--; + in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // in4 = *(pSrcT1 + 3); + out1 = *pPRT_in++; + out2 = *pPRT_in++; + // out3 = *pPRT_in++; + // out4 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + *pSrcT1++ = in2 - FIX_MUL(in, out2); + // *pSrcT1++ = in3 - FIX_MUL(in, out3); + // *pSrcT1++ = in4 - FIX_MUL(in, out4); + j += 2; + } + while (j < n - l) { + in1 = *pSrcT1; + out1 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + j++; + } + /* Loop 
over the number of columns to + replace the elements in the destination matrix */ + j = 0; + while (j < n - n % 4) { + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = *pPRT_pDst++; + out2 = *pPRT_pDst++; + out3 = *pPRT_pDst++; + out4 = *pPRT_pDst++; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + *pSrcT2++ = in2 - FIX_MUL(in, out2); + *pSrcT2++ = in3 - FIX_MUL(in, out3); + *pSrcT2++ = in4 - FIX_MUL(in, out4); + j += 4; } - j = n; /* Replace the elements in the destination matrix */ - while (j > 0U) { + while (j < n) { in1 = *pSrcT2; - *pSrcT2++ = in1 - FIX_MUL(in, *pPRT_pDst++); - j--; + out1 = *pPRT_pDst; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + j++; } } /* Increment temporary input pointer */ From 972802348515d0a9295c1fd210c392341f153f4e Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Tue, 19 Jul 2022 18:42:29 +0200 Subject: [PATCH 07/22] [software] Unroll parallel core --- .../apps/MP_matrix_inverse/initialization.h | 4 +- .../MP_matrix_inverse/mempool_mat_inv_q16p.h | 295 +++++++++++++----- 2 files changed, 213 insertions(+), 86 deletions(-) diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/MP_matrix_inverse/initialization.h index 1d9d6396f..c8c874ea3 100644 --- a/software/apps/MP_matrix_inverse/initialization.h +++ b/software/apps/MP_matrix_inverse/initialization.h @@ -7,9 +7,7 @@ #define FIXED_POINT 16 #define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b)) #define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT)) - -dump(flag, 1); - +#define MIN(a,b) (a < b ? 
a : b) void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m); diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h index c40bafe5b..445d70cd7 100644 --- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h +++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h @@ -6,6 +6,10 @@ /* GAUSS JORDAN INVERSION */ +dump(l, 1); +dump(loopCnt, 2); +dump(i, 3); + int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { @@ -15,7 +19,10 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t Xchg, in = 0, in1; /* Temporary input values */ + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; uint32_t core_id = mempool_get_core_id(); uint32_t i, j, loopCnt, k, l; /* loop counters */ @@ -23,13 +30,16 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla /* CREATE THE IDENTITY MATRIX */ - pDstT1 = pDst; /* Working pointer for destination matrix */ - for (i = core_id; i < m; i += NUM_CORES) { + pDstT1 = pDst; + for (i = core_id * 4; i < m; i += 4 * NUM_CORES) { for (j = 0; j < m; j++) { pDstT1[i * m + j] = (uint32_t) (i == j); + pDstT1[(i + 1) * m + j] = (uint32_t) ((i + 1) == j); + pDstT1[(i + 2) * m + j] = (uint32_t) ((i + 2) == j); + pDstT1[(i + 3) * m + j] = (uint32_t) ((i + 3) == j); } } - mempool_barrier(NUM_CORES); + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); /* Loop over the number of columns of the input matrix. 
*/ loopCnt = n; @@ -38,53 +48,101 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla while (loopCnt > 0U) { - /* CHECK IF PIVOT ELEMENT IS ZERO */ - pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); - in = *pSrcT1; - k = 1U; - /* Check if the pivot element is zero */ - if (*pSrcT1 == 0U) { - - /* Loop over the number rows present below */ - for (i = (l + 1U) + core_id; i < m; i += NUM_CORES) { - pSrcT2 = pSrcT1 + (n * i); - /* Check if there is element to exchange */ - //if (*flag != 0U) - // break; - if (*pSrcT2 != 0U) { - __atomic_fetch_add(flag, k, __ATOMIC_RELAXED); + + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + + if (in == 0U) { + + //if (core_id == 0) { + // k = 1U; + // while (k < m - l) { + // pSrcT2 = pSrcT1 + k * n; + // if (*pSrcT2 != 0) { + // *flag = k; + // break; + // } + // k++; + // } + //} + //mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + + k = 1U + core_id; + while ((k < m - l) && (*flag == 0)) { + pSrcT2 = pSrcT1 + k * n; + if (*pSrcT2 != 0) { + __atomic_store_n(flag, k, __ATOMIC_RELAXED); } + k += MIN(n / 4, NUM_CORES); + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); } - mempool_barrier(NUM_CORES); + + /* EXCHANGE */ if (*flag != 0U) { - pSrcT2 = pSrcT1 + (n * *flag + l); + pSrcT2 = pSrcT1 + (n * *flag); pDstT2 = pDstT1 + (n * *flag); - /* Loop over number of columns - * to the right of the pilot element */ - for (j = core_id; j < n - l; j += NUM_CORES) { - /* Exchange the row elements of the input matrix */ - Xchg = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg; + /* Loop over columns to the right of pivot */ + for (j = core_id * 4; j < (n - l) - (n - l) % 4; j += 4 * NUM_CORES) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] 
= Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + } + if (core_id == (n - l) / 4) { + j = (n - l) - (n - l) % 4; + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < n - n % 4; j += 4 * NUM_CORES) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; } - pSrcT1 += n - l; - pSrcT2 += n - l; - /* Loop over number of columns of the destination matrix */ - for(j = core_id; j < n; j += NUM_CORES) { - /* Exchange the row elements of the destination matrix */ - Xchg = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg; + if (core_id == n / 4) { + j = n - n % 4; + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } } - pDstT2 += n; - pDstT1 += n; } - k++; - mempool_barrier(NUM_CORES); + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); } /* Update the status if the matrix is singular */ @@ -92,6 +150,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla return 1; } + /* DIVIDE BY THE PIVOT */ /* Points to the pivot row of input and destination matrices */ @@ -102,55 +161,126 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT2 = pPivotRowDst; /* Pivot element of the row */ in = *pPivotRowIn; - /* Loop over number of columns to the right of the pilot element */ - for(j = core_id; j < n - l; j += NUM_CORES) { + + ///* Loop over columns to the right of pivot */ + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); + in2 = 
pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + if (core_id == (n - l) / 4) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } } - /* Loop over number of columns of the destination matrix */ - for(j = core_id; j < n; j += NUM_CORES) { + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; } - mempool_barrier(NUM_CORES); - - /*REPLACE ROWS */ - - pSrcT1 = pSrc + core_id * n; - pSrcT2 = pDst + core_id * n; - i = core_id; - k = m; - for(k = core_id; k < m; k += NUM_CORES) { - if (i != l) { - /* Element of the reference row */ - in = *pSrcT1; - /* Working pointers for input and destination pivot rows */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over the number of columns to the right of the pivot element, - to replace the elements in the input matrix */ - for (j = 0; j < n - l; j++) { - in1 = pSrcT1[j]; - pSrcT1[j] = in1 - FIX_MUL(in, pPRT_in[j]); - } - /* Loop over the number of columns to - replace the elements in the destination matrix */ - for (j = 0; j < n; j++) { - in1 = pSrcT2[j]; - pSrcT2[j] = in1 - FIX_MUL(in, pPRT_pDst[j]); + if (core_id == n / 4) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + + + /* REPLACE ROWS */ + + pSrcT1 = pSrc; + pSrcT2 = pDst; + + /* Loop over rows */ + for (k = 
core_id * 4; k < m; k += NUM_CORES * 4) { + i = 0; + while (i < 4) { + if ((i + k) != l) { + pSrcT1 = pSrc + (i + k) * n; + pSrcT2 = pDst + (i + k) * n; + /* Element of the reference row */ + in = *pSrcT1; + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over columns to the right of pivot */ + while (j < (n - l) - (n - l) % 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + /* Loop over columns */ + j = 0; + while (j < n - n % 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } } + i++; } - i += NUM_CORES; - pSrcT1 += NUM_CORES * n; - pSrcT2 += NUM_CORES * n; } - /* Increment the input pointer */ - pSrc++; - /* Decrement the loop counter */ - loopCnt--; - /* Increment the index modifier */ - l++; - mempool_barrier(NUM_CORES); + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ } // if ((flag != 1U) && (x == 0)) { @@ -161,6 +291,5 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla // if (i == m * n) // return 1; // } - 
return 0; } From ee0119ca63814b204dda840c6a157a5a2c7c7f67 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Thu, 21 Jul 2022 08:31:44 +0200 Subject: [PATCH 08/22] [software] Clean comments on single-core --- .../MP_matrix_inverse/mempool_mat_inv_q16s.h | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h index e217119cd..83c5a3c21 100644 --- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h +++ b/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h @@ -29,6 +29,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { flag = 0U; /* CREATE THE IDENTITY MATRIX */ + while (rowCnt > 0U) { j = m - rowCnt; while (j > 0U) { @@ -50,25 +51,24 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { l = 0U; while (loopCnt > 0U) { - /* CHECK IF PIVOT ELEMENT IS ZERO... - * If it is zero then interchange the row with non zero row below. - * If there is no non zero element to replace in the rows below, - * then the matrix is Singular. 
*/ pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); k = 1U; - in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (in == 0) { /* Loop over the rows present below */ for (i = (l + 1U); i < m; i++) { pSrcT2 = pSrc + (n * i); pDstT2 = pDstT1 + (n * k); - /* Check if there is a non zero pivot element to replace in the rows below */ + /* EXCHANGE */ + if (*pSrcT2 != 0) { - /* Exchange the row elements of the input matrix at the right of the pivot */ + /* Loop over colums to the right of the pivot */ j = 0; while (j < (n - l) - (n - l) % 4) { Xchg1 = *(pSrcT2); @@ -95,7 +95,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { *pSrcT1++ = Xchg1; j++; } - /* Exchange the row elements of the destination matrix */ + /* Loop over colums */ j = 0; while (j < n - n % 4) { Xchg1 = *(pDstT2); @@ -133,7 +133,6 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { return 1; } - /* DIVIDE BY THE PIVOT */ /* Points to the pivot row of input and destination matrices */ @@ -189,10 +188,8 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { *pSrcT2++ = FIX_DIV(in1, in); j++; } - /* SUM THE MULTIPLE OF A BOTTOM ROW */ - /* Replace the rows with the sum of that row and a multiple of row i - * so that each new element in column i above row i is zero.*/ - /* Temporary pointers for input and destination matrices */ + + /* REPLACE ROWS */ pSrcT1 = pSrc; pSrcT2 = pDst; From b412c877395b3606e84a409c6a9ec94874f4b7ff Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Thu, 21 Jul 2022 11:22:30 +0200 Subject: [PATCH 09/22] [software] Change kernel name --- .../initialization.h | 0 .../{MP_matrix_inverse => mat_inv}/main.c | 33 ++++++++--------- .../mempool_mat_inv_q32p.h} | 35 ++++++++++++++++++- .../mempool_mat_inv_q32s.h} | 0 4 files changed, 49 insertions(+), 19 deletions(-) rename software/apps/{MP_matrix_inverse => mat_inv}/initialization.h (100%) rename software/apps/{MP_matrix_inverse => mat_inv}/main.c (86%) rename 
software/apps/{MP_matrix_inverse/mempool_mat_inv_q16p.h => mat_inv/mempool_mat_inv_q32p.h} (86%) rename software/apps/{MP_matrix_inverse/mempool_mat_inv_q16s.h => mat_inv/mempool_mat_inv_q32s.h} (100%) diff --git a/software/apps/MP_matrix_inverse/initialization.h b/software/apps/mat_inv/initialization.h similarity index 100% rename from software/apps/MP_matrix_inverse/initialization.h rename to software/apps/mat_inv/initialization.h diff --git a/software/apps/MP_matrix_inverse/main.c b/software/apps/mat_inv/main.c similarity index 86% rename from software/apps/MP_matrix_inverse/main.c rename to software/apps/mat_inv/main.c index 4cb660e23..6c14707f2 100644 --- a/software/apps/MP_matrix_inverse/main.c +++ b/software/apps/mat_inv/main.c @@ -7,9 +7,9 @@ //#include //#include -#define N 4 -#define M 4 -#define O 4 +#define N 16 +#define M 16 +#define O 16 #include "encoding.h" #include "printf.h" @@ -17,12 +17,13 @@ #include "synchronization.h" #include "initialization.h" -#include "mempool_mat_inv_q16s.h" -#include "mempool_mat_inv_q16p.h" +#include "mempool_mat_inv_q32p.h" +#include "mempool_mat_inv_q32s.h" + // #define VERBOSE -// #define SINGLE -#define PARALLEL +#define SINGLE +// #define PARALLEL int32_t matrix[N * M] __attribute__((section(".l1"))); int32_t t_matrix[M * N] __attribute__((section(".l1"))); @@ -32,12 +33,6 @@ int32_t pseudoinverse[M * N] __attribute__((section(".l1"))); uint32_t flag __attribute__((section(".l1"))); void display(int32_t *A, int32_t n, int32_t m) { - //int32_t volatile i = 0; - //while (i < n * m) { - // // printf("ciao mamma\n"); - // printf("Value %d: %d\n", i, A[i]); - // i++; - //} int32_t i, j; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { @@ -56,10 +51,10 @@ void single_core() // Initialize barrier and synchronize mempool_barrier_init(core_id); - init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix(matrix, N, M, -156, 2000, -219, core_id); //init_matrix_zeros(t_matrix, M, N, core_id); 
//init_matrix_zeros(matrix_mult, M, M, core_id); - //init_matrix_zeros(inv, M, M, core_id); + init_matrix_zeros(inv, M, M, core_id); //init_matrix_zeros(pseudoinverse, M, N, core_id); mempool_barrier(num_cores); @@ -114,9 +109,11 @@ void multi_core() //Transpose(matrix, t_matrix, N, M); //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); - mempool_start_benchmark(); - mempool_GJinv_q16p(matrix, inv, M, &flag); - mempool_stop_benchmark(); + if (core_id < MIN(NUM_CORES, N / 4)) { + mempool_start_benchmark(); + mempool_GJinv_q16p(matrix, inv, M, &flag); + mempool_stop_benchmark(); + } //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); //MatrixMult(pseudoinverse, matrix, inv, M, N, M); diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h similarity index 86% rename from software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h rename to software/apps/mat_inv/mempool_mat_inv_q32p.h index 445d70cd7..4576cde37 100644 --- a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16p.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h @@ -228,6 +228,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; /* Loop over columns to the right of pivot */ + j = 0; while (j < (n - l) - (n - l) % 4) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; @@ -249,6 +250,22 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT1[j] = in1 - FIX_MUL(in, out1); j++; } + //j = 0; + //while (j < MIN(core_id * 4, n - l)) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = pPRT_in[j]; + // out2 = pPRT_in[j + 1]; + // out3 = pPRT_in[j + 2]; + // out4 = pPRT_in[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4; + //} /* Loop over 
columns */ j = 0; while (j < n - n % 4) { @@ -266,12 +283,28 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); j += 4; } - while (j < n) { + while (j < MIN(core_id * 4, n)) { in1 = pSrcT2[j]; out1 = pPRT_pDst[j]; pSrcT2[j] = in1 - FIX_MUL(in, out1); j++; } + //j = 0; + //while (j < core_id * 4) { + // in1 = pSrcT2[j]; + // in2 = pSrcT2[j + 1]; + // in3 = pSrcT2[j + 2]; + // in4 = pSrcT2[j + 3]; + // out1 = pPRT_pDst[j]; + // out2 = pPRT_pDst[j + 1]; + // out3 = pPRT_pDst[j + 2]; + // out4 = pPRT_pDst[j + 3]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4; + //} } i++; } diff --git a/software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h similarity index 100% rename from software/apps/MP_matrix_inverse/mempool_mat_inv_q16s.h rename to software/apps/mat_inv/mempool_mat_inv_q32s.h From 91380245b2b11d5763d00a66ab5e7a5ad28e006d Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 27 Jul 2022 11:03:39 +0200 Subject: [PATCH 10/22] [software] Add different parallelization schemes --- software/apps/mat_inv/mempool_mat_inv_q32p.h | 192 +++++++++++-------- software/apps/mat_inv/mempool_mat_inv_q32s.h | 17 +- 2 files changed, 122 insertions(+), 87 deletions(-) diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h index 4576cde37..320fe709b 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h @@ -10,9 +10,9 @@ dump(l, 1); dump(loopCnt, 2); dump(i, 3); -int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); +int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); -int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { 
+int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ @@ -31,12 +31,12 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla /* CREATE THE IDENTITY MATRIX */ pDstT1 = pDst; - for (i = core_id * 4; i < m; i += 4 * NUM_CORES) { + for (k = core_id * 4; k < m; k += 4 * NUM_CORES) { for (j = 0; j < m; j++) { - pDstT1[i * m + j] = (uint32_t) (i == j); - pDstT1[(i + 1) * m + j] = (uint32_t) ((i + 1) == j); - pDstT1[(i + 2) * m + j] = (uint32_t) ((i + 2) == j); - pDstT1[(i + 3) * m + j] = (uint32_t) ((i + 3) == j); + pDstT1[k * m + j] = (uint32_t) (k == j); + pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); + pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); + pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); } } mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); @@ -52,33 +52,32 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pDstT1 = pDst + (l * n); in = *pSrcT1; - /* CHECK IF PIVOT ELEMENT IS ZERO */ if (in == 0U) { - //if (core_id == 0) { - // k = 1U; - // while (k < m - l) { - // pSrcT2 = pSrcT1 + k * n; - // if (*pSrcT2 != 0) { - // *flag = k; - // break; - // } - // k++; - // } - //} - //mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - - k = 1U + core_id; - while ((k < m - l) && (*flag == 0)) { - pSrcT2 = pSrcT1 + k * n; - if (*pSrcT2 != 0) { - __atomic_store_n(flag, k, __ATOMIC_RELAXED); + if (core_id == 0) { + k = 1U; + while (k < m - l) { + pSrcT2 = pSrcT1 + k * n; + if (*pSrcT2 != 0) { + *flag = k; + break; + } + k++; } - k += MIN(n / 4, NUM_CORES); - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + + //k = 1U + core_id; + //while ((k < m - l) && (*flag == 0)) { + // pSrcT2 = pSrcT1 + k * n; + // if (*pSrcT2 
!= 0) { + // __atomic_store_n(flag, k, __ATOMIC_RELAXED); + // } + // k += MIN(n / 4, NUM_CORES); + // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + //} /* EXCHANGE */ @@ -86,7 +85,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT2 = pSrcT1 + (n * *flag); pDstT2 = pDstT1 + (n * *flag); /* Loop over columns to the right of pivot */ - for (j = core_id * 4; j < (n - l) - (n - l) % 4; j += 4 * NUM_CORES) { + //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); + //while (j < 4 * ((n - l) >> 2U)) { + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) { Xchg1 = pSrcT2[j]; Xchg2 = pSrcT2[j + 1]; Xchg3 = pSrcT2[j + 2]; @@ -103,9 +104,10 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT1[j + 1] = Xchg2; pSrcT1[j + 2] = Xchg3; pSrcT1[j + 3] = Xchg4; + // j += 4 * NUM_CORES; } - if (core_id == (n - l) / 4) { - j = (n - l) - (n - l) % 4; + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); while (j < n - l) { Xchg1 = pSrcT2[j]; pSrcT2[j] = pSrcT1[j]; @@ -114,7 +116,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla } } /* Loop over columns */ - for (j = core_id * 4; j < n - n % 4; j += 4 * NUM_CORES) { + for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) { Xchg1 = pDstT2[j]; Xchg2 = pDstT2[j + 1]; Xchg3 = pDstT2[j + 2]; @@ -132,8 +134,8 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pDstT1[j + 2] = Xchg3; pDstT1[j + 3] = Xchg4; } - if (core_id == n / 4) { - j = n - n % 4; + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); while (j < n) { Xchg1 = pDstT2[j]; pDstT2[j] = pDstT1[j]; @@ -144,7 +146,6 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla } mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); } - /* Update the status if the matrix is singular */ if ((*flag == 0U) && (in == 0U)) { return 1; @@ -163,6 
+164,8 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla in = *pPivotRowIn; ///* Loop over columns to the right of pivot */ + // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); + // while (j < 4 * ((n - l) >> 2U)) { for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; @@ -176,8 +179,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT1[j + 1] = out2; pSrcT1[j + 2] = out3; pSrcT1[j + 3] = out4; + // j += NUM_CORES * 4; } - if (core_id == (n - l) / 4) { + if (core_id == (n >> 2U) - 1) { j = 4 * ((n - l) >> 2U); while (j < n - l) { in1 = pSrcT1[j]; @@ -200,11 +204,11 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT2[j + 2] = out3; pSrcT2[j + 3] = out4; } - if (core_id == n / 4) { + if (core_id == (n >> 2U) - 1) { j = 4 * (n >> 2U); while (j < n) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); j++; } } @@ -215,10 +219,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT1 = pSrc; pSrcT2 = pDst; - /* Loop over rows */ for (k = core_id * 4; k < m; k += NUM_CORES * 4) { - i = 0; + i = 0U; while (i < 4) { if ((i + k) != l) { pSrcT1 = pSrc + (i + k) * n; @@ -229,7 +232,7 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pPRT_pDst = pPivotRowDst; /* Loop over columns to the right of pivot */ j = 0; - while (j < (n - l) - (n - l) % 4) { + while (j < 4 * ((n - l) >> 2U)) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; in3 = pSrcT1[j + 2]; @@ -250,25 +253,9 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT1[j] = in1 - FIX_MUL(in, out1); j++; } - //j = 0; - //while (j < MIN(core_id * 4, n - l)) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = pPRT_in[j]; - // out2 = 
pPRT_in[j + 1]; - // out3 = pPRT_in[j + 2]; - // out4 = pPRT_in[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4; - //} /* Loop over columns */ j = 0; - while (j < n - n % 4) { + while (j < 4 * (n >> 2U)) { in1 = pSrcT2[j]; in2 = pSrcT2[j + 1]; in3 = pSrcT2[j + 2]; @@ -283,34 +270,87 @@ int mempool_GJinv_q16p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); j += 4; } - while (j < MIN(core_id * 4, n)) { + while (j < n) { in1 = pSrcT2[j]; out1 = pPRT_pDst[j]; pSrcT2[j] = in1 - FIX_MUL(in, out1); j++; } - //j = 0; - //while (j < core_id * 4) { - // in1 = pSrcT2[j]; - // in2 = pSrcT2[j + 1]; - // in3 = pSrcT2[j + 2]; - // in4 = pSrcT2[j + 3]; - // out1 = pPRT_pDst[j]; - // out2 = pPRT_pDst[j + 1]; - // out3 = pPRT_pDst[j + 2]; - // out4 = pPRT_pDst[j + 3]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4; - //} } i++; } } mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); +// pSrcT1 = pSrc; +// pSrcT2 = pDst; +// /* Loop over rows */ +// for (k = 0; k < m; k++) { +// if (k != l) { +// pSrcT1 = pSrc + k * n; +// pSrcT2 = pDst + k * n; +// /* Element of the reference row */ +// in = *pSrcT1; +// pPRT_in = pPivotRowIn; +// pPRT_pDst = pPivotRowDst; +// /* Loop over columns to the right of pivot */ +// j = core_id * 4; +// // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n - l) >> 2U); +// while (j < 4 * ((n - l) >> 2U)) { +// in1 = pSrcT1[j]; +// in2 = pSrcT1[j + 1]; +// in3 = pSrcT1[j + 2]; +// in4 = pSrcT1[j + 3]; +// out1 = pPRT_in[j]; +// out2 = pPRT_in[j + 1]; +// out3 = pPRT_in[j + 2]; +// out4 = pPRT_in[j + 3]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4 * NUM_CORES; +// } +// if (core_id == (n >> 2U) - 1) { +// j = 4 * ((n - l) >> 2U); +// while (j < n - l) { +// in1 = pSrcT1[j]; +// out1 = pPRT_in[j]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// } +// /* Loop over columns */ +// j = core_id * 4; +// while (j < 4 * (n >> 2U)) { +// in1 = pSrcT2[j]; +// in2 = pSrcT2[j + 1]; +// in3 = pSrcT2[j + 2]; +// in4 = pSrcT2[j + 3]; +// out1 = pPRT_pDst[j]; +// out2 = pPRT_pDst[j + 1]; +// out3 = pPRT_pDst[j + 2]; +// out4 = pPRT_pDst[j + 3]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4 * NUM_CORES; +// } +// if (core_id == (n >> 2U) - 1) { +// j = 4 * (n >> 2U); +// while (j < n) { +// in1 = pSrcT2[j]; +// out1 = pPRT_pDst[j]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// } +// mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); +// } +// } +// mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + pSrc++; /* Increment the input pointer */ loopCnt--; /* Decrement the loop counter */ l++; /* Increment the index modifier */ diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h index 83c5a3c21..70fff05a2 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32s.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h @@ -6,9 +6,9 @@ /* GAUSS JORDAN INVERSION */ -int mempool_GJinv_q16s(int32_t *pSrc, int32_t *pDst, uint32_t n); +int 
mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n); -int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { +int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ @@ -146,7 +146,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { /* Loop over number of columns to the right of the pilot element */ j = 0; - while (j < (n - l) - (n - l) % 4) { + while (j < 4 * ((n - l) >> 2U)) { in1 = *pSrcT1; in2 = *(pSrcT1 + 1); in3 = *(pSrcT1 + 2); @@ -168,7 +168,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { } /* Loop over number of columns of the destination matrix */ j = 0; - while (j < n - n % 4) { + while (j < 4 * (n >> 2U)) { in1 = *pSrcT2; in2 = *(pSrcT2 + 1); in3 = *(pSrcT2 + 2); @@ -193,26 +193,21 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { pSrcT1 = pSrc; pSrcT2 = pDst; - i = 0U; /* pivot index */ k = m; /* row index */ while (k > 0U) { - /* Only the columns to the right of the pivot are to be processed */ if (i == l) { pSrcT1 += n - l; pSrcT2 += n; - } else { - /* Element of the reference row */ in = *pSrcT1; /* Reference row pointers */ pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; - j = 0; - while (j < (n - l) - (n - l) % 2) { + while (j < 2 * ((n - l) >> 1U)) { in1 = *pSrcT1; in2 = *(pSrcT1 + 1); // in3 = *(pSrcT1 + 2); @@ -236,7 +231,7 @@ int mempool_GJinv_q16s(int32_t * pSrc, int32_t * pDst, uint32_t n) { /* Loop over the number of columns to replace the elements in the destination matrix */ j = 0; - while (j < n - n % 4) { + while (j < 4 * (n >> 2U)) { in1 = *pSrcT2; in2 = *(pSrcT2 + 1); in3 = *(pSrcT2 + 2); From 3aad7fdd3000c472f833e230a221711b1e9716b3 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 27 Jul 2022 11:04:12 +0200 Subject: [PATCH 11/22] [software] Shape memory accesses to mempool --- 
software/apps/mat_inv/main.c | 114 +++--- .../mat_inv/mempool_mat_inv_q32p_memsized.h | 358 ++++++++++++++++++ 2 files changed, 419 insertions(+), 53 deletions(-) create mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index 6c14707f2..a00f66b99 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -10,6 +10,7 @@ #define N 16 #define M 16 #define O 16 +#define N_BANKS (1024) #include "encoding.h" #include "printf.h" @@ -18,27 +19,30 @@ #include "initialization.h" #include "mempool_mat_inv_q32p.h" +#include "mempool_mat_inv_q32p_memsized.h" #include "mempool_mat_inv_q32s.h" -// #define VERBOSE -#define SINGLE +#define VERBOSE +// #define SINGLE // #define PARALLEL +#define MEMSIZED -int32_t matrix[N * M] __attribute__((section(".l1"))); -int32_t t_matrix[M * N] __attribute__((section(".l1"))); -int32_t matrix_mult[M * M] __attribute__((section(".l1"))); -int32_t inv[M * M] __attribute__((section(".l1"))); -int32_t pseudoinverse[M * N] __attribute__((section(".l1"))); -uint32_t flag __attribute__((section(".l1"))); +int32_t matrix[N * M] __attribute__((aligned(N), section(".l1"))); +int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); void display(int32_t *A, int32_t n, int32_t m) { - int32_t i, j; - for (i = 0; i < n; i++) { - for (j = 0; j < m; j++) { - printf("%5d ", A[i * m + j]); - } - printf("\n"); + //int32_t i, j; + //for (i = 0; i < n; i++) { + // for (j = 0; j < m; j++) { + // printf("%8d ", A[i * m + j]); + // } + // printf("\n"); + //} + int32_t i; + for (i = 0; i < n * m; i++) { + printf("Output[%d] = %8d\n", i, A[i]); } } @@ -51,41 +55,21 @@ void single_core() // Initialize barrier and synchronize mempool_barrier_init(core_id); - init_matrix(matrix, N, M, -156, 2000, -219, core_id); - //init_matrix_zeros(t_matrix, M, N, core_id); - //init_matrix_zeros(matrix_mult, M, M, core_id); + 
init_matrix(matrix, N, M, -156, 427, -219, core_id); init_matrix_zeros(inv, M, M, core_id); - //init_matrix_zeros(pseudoinverse, M, N, core_id); mempool_barrier(num_cores); if(core_id == 0) { - #if defined(VERBOSE) - display(matrix, N, M); - Transpose(matrix, t_matrix, N, M); - printf("\nThe Transpose is :\n"); - display(t_matrix, M, N); - MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); - printf("The product of the matrix is: \n"); - display(matrix_mult, M, M); - printf("\nThe Inverse is :\n"); - mempool_mat_inv_q16s(matrix_mult, inv, N); - display(inv, N, N); - MatrixMult(t_matrix, inv, pseudoinverse, M, N, N); - printf("\nThe Moore-Penrose inverse is :\n"); - display(pseudoinverse, M, N); - #else - //Transpose(matrix, t_matrix, N, M); - //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); - - mempool_start_benchmark(); - mempool_GJinv_q16s(matrix, inv, M); - mempool_stop_benchmark(); - - //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); - //MatrixMult(pseudoinverse, matrix, inv, M, N, M); - #endif + mempool_start_benchmark(); + mempool_GJinv_q32s(matrix, inv, M); + mempool_stop_benchmark(); } mempool_barrier(num_cores); + #ifdef VERBOSE + if (core_id == 0) + display(inv, N, M); + #endif + mempool_barrier(num_cores); } void multi_core() @@ -97,27 +81,49 @@ void multi_core() mempool_barrier_init(core_id); init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, M, M, core_id); if (core_id == 0) { flag = 0U; } - //init_matrix_zeros(t_matrix, M, N, core_id); - //init_matrix_zeros(matrix_mult, M, M, core_id); - //init_matrix_zeros(inv, M, M, core_id); - //init_matrix_zeros(pseudoinverse, M, N, core_id); mempool_barrier(num_cores); - //Transpose(matrix, t_matrix, N, M); - //MatrixMult(t_matrix, matrix, matrix_mult, M, N, O); - if (core_id < MIN(NUM_CORES, N / 4)) { mempool_start_benchmark(); - mempool_GJinv_q16p(matrix, inv, M, &flag); + mempool_GJinv_q32p(matrix, inv, M, &flag); mempool_stop_benchmark(); } + mempool_barrier(num_cores); + 
#ifdef VERBOSE + if (core_id == 0) + display(inv, M, N); + #endif + mempool_barrier(num_cores); +} - //MatrixMult(inv, t_matrix, pseudoinverse, M, M, N); - //MatrixMult(pseudoinverse, matrix, inv, M, N, M); +void multi_core_memsized() +{ + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, M, M, core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + + mempool_start_benchmark(); + mempool_GJinv_q32p_memsized(matrix, inv, M, &flag); + mempool_stop_benchmark(); + + mempool_barrier(num_cores); + #ifdef VERBOSE + if (core_id == 0) + display(inv, M, N); + #endif mempool_barrier(num_cores); } @@ -126,6 +132,8 @@ int main() { single_core(); #elif defined(PARALLEL) multi_core(); + #elif defined(MEMSIZED) + multi_core_memsized(); #endif return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h new file mode 100644 index 000000000..496459e19 --- /dev/null +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h @@ -0,0 +1,358 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* GAUSS JORDAN INVERSION */ + +int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); + +int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { + + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */ + + /* CREATE THE IDENTITY MATRIX */ + + pDstT1 = pDst; + for (k = 0; k < m; k++) { + core_id = absolute_core_id - ((n * k) / 4) % N_BANKS; + core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; + for (j = core_id * 4; j < m; j += 4 * NUM_CORES) { + pDstT1[k * m + j] = (uint32_t) (k == j); + pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); + pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); + pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); + } + } + mempool_log_barrier(2, absolute_core_id); + + /* Loop over the number of columns of the input matrix. */ + loopCnt = n; + /* Index modifier to navigate through the columns */ + l = 0U; + + while (loopCnt > 0U) { + + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + core_id = absolute_core_id - ((l * n) / 4) % N_BANKS; + core_id = core_id > NUM_CORES ?
core_id + NUM_CORES : core_id; + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + + if (in == 0U) { + + if (absolute_core_id == 0) { + k = 1U; + while (k < m - l) { + pSrcT2 = pSrcT1 + k * n; + if (*pSrcT2 != 0) { + *flag = k; + break; + } + k++; + } + } + mempool_log_barrier(2, absolute_core_id); + + /* EXCHANGE */ + + if (*flag != 0U) { + pSrcT2 = pSrcT1 + (n * *flag); + pDstT2 = pDstT1 + (n * *flag); + + /* Loop over columns to the right of pivot */ + //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); + //while (j < 4 * ((n - l) >> 2U)) { + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + // j += 4 * NUM_CORES; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + } + } + mempool_log_barrier(2, absolute_core_id); + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } 
+ + + /* DIVIDE BY THE PIVOT */ + + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + ///* Loop over columns to the right of pivot */ + // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); + // while (j < 4 * ((n - l) >> 2U)) { + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + // j += NUM_CORES * 4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + // pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_barrier(2, absolute_core_id); + + + /* REPLACE ROWS */ + core_id = absolute_core_id; + pSrcT1 = pSrc; + pSrcT2 = pDst; + /* Loop over rows */ +// for (k = core_id * 4; k < m; k += NUM_CORES * 4) { +// i = 0U; +// while (i < 4) { +// if ((i + k) != l) { +// pSrcT1 = pSrc + (i + k) * n; +// pSrcT2 = pDst + (i + k) * n; +// /* Element of the reference row */ +// in = *pSrcT1; +// pPRT_in = pPivotRowIn; +// pPRT_pDst = pPivotRowDst; 
+// /* Loop over columns to the right of pivot */ +// j = 0; +// while (j < 4 * ((n - l) >> 2U)) { +// in1 = pSrcT1[j]; +// in2 = pSrcT1[j + 1]; +// in3 = pSrcT1[j + 2]; +// in4 = pSrcT1[j + 3]; +// out1 = pPRT_in[j]; +// out2 = pPRT_in[j + 1]; +// out3 = pPRT_in[j + 2]; +// out4 = pPRT_in[j + 3]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4; +// } +// while (j < n - l) { +// in1 = pSrcT1[j]; +// out1 = pPRT_in[j]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// /* Loop over columns */ +// j = 0; +// while (j < 4 * (n >> 2U)) { +// in1 = pSrcT2[j]; +// in2 = pSrcT2[j + 1]; +// in3 = pSrcT2[j + 2]; +// in4 = pSrcT2[j + 3]; +// out1 = pPRT_pDst[j]; +// out2 = pPRT_pDst[j + 1]; +// out3 = pPRT_pDst[j + 2]; +// out4 = pPRT_pDst[j + 3]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4; +// } +// while (j < n) { +// in1 = pSrcT2[j]; +// out1 = pPRT_pDst[j]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// } +// i++; +// } +// } +// mempool_log_barrier(2, absolute_core_id); + + for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { + dump_i(absolute_core_id); + if (k != l) { + pSrcT1 = pSrc + k * n; + pSrcT2 = pDst + k * n; + core_id = absolute_core_id % (n >> 2U); + /* Element of the reference row */ + in = *pSrcT1; + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - 
FIX_MUL(in, out4); + j += 4 * NUM_CORES; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + /* Loop over columns */ + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * NUM_CORES; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + } + } + mempool_log_barrier(2, absolute_core_id); + + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ + } + mempool_log_barrier(2, absolute_core_id); + +// if ((flag != 1U) && (x == 0)) { +// for (i = 0; i < m * n; i++) { +// if (pSrc[i] != 0) +// break; +// } +// if (i == m * n) +// return 1; +// } + return 0; +} From a045b42cae6eb0abc26aa7e1b2e589ebfd3febba Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Tue, 2 Aug 2022 18:20:46 +0200 Subject: [PATCH 12/22] [software] Add folded kernel --- software/apps/mat_inv/initialization.h | 35 +++ software/apps/mat_inv/main.c | 86 ++++-- .../mat_inv/mempool_mat_inv_q32p_folded.h | 287 ++++++++++++++++++ 3 files changed, 380 insertions(+), 28 deletions(-) create mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_folded.h diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h index c8c874ea3..ec330e766 100644 --- a/software/apps/mat_inv/initialization.h +++ b/software/apps/mat_inv/initialization.h @@ -9,6 +9,16 @@ #define FIX_MUL(a,b) ((int32_t)((a * b) >> 
FIXED_POINT)) #define MIN(a,b) (a < b ? a : b) +dump(l, 1); +dump(loopCnt, 2); +dump(i, 3); + +void display(int32_t *A, int32_t n, int32_t m); + +#ifdef FOLDED +void display_folded(int32_t *A, int32_t n, int32_t m); +#endif + void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m); void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n, int32_t m, int32_t o); @@ -17,6 +27,31 @@ void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, int3 void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id); +void display(int32_t *A, int32_t n, int32_t m) { + //int32_t i, j; + //for (i = 0; i < n; i++) { + // for (j = 0; j < m; j++) { + // printf("%8d ", A[i * m + j]); + // } + // printf("\n"); + //} + int32_t i; + for (i = 0; i < n * m; i++) { + printf("Output[%d] = %8d\n", i, A[i]); + } +} + +#ifdef FOLDED +void display_folded(int32_t *A, int32_t n, int32_t m) { + int32_t i, j, k, shift; + for (i = 0; i < n * m; i++) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + printf("Output[%d] = %8d\n", i, A[shift + j]); + } +} +#endif void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) { int32_t i, j; diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index a00f66b99..f39cd6ac8 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -4,47 +4,39 @@ // Author: Marco Bertuletti, ETH Zurich -//#include -//#include - -#define N 16 -#define M 16 -#define O 16 -#define N_BANKS (1024) - #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "initialization.h" -#include "mempool_mat_inv_q32p.h" -#include "mempool_mat_inv_q32p_memsized.h" -#include "mempool_mat_inv_q32s.h" - +#define N 16 +#define M 16 +#define O 16 +#define N_BANKS (1024) +#define N_USED_BANKS (64) #define VERBOSE // #define SINGLE // #define PARALLEL 
#define MEMSIZED +// #define FOLDED +#include "initialization.h" +#include "mempool_mat_inv_q32s.h" +#include "mempool_mat_inv_q32p.h" +#include "mempool_mat_inv_q32p_memsized.h" +#include "mempool_mat_inv_q32p_folded.h" + +#ifdef FOLDED +int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)] __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)] __attribute__((aligned(N_BANKS), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); +#else int32_t matrix[N * M] __attribute__((aligned(N), section(".l1"))); int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); uint32_t flag __attribute__((section(".l1"))); - -void display(int32_t *A, int32_t n, int32_t m) { - //int32_t i, j; - //for (i = 0; i < n; i++) { - // for (j = 0; j < m; j++) { - // printf("%8d ", A[i * m + j]); - // } - // printf("\n"); - //} - int32_t i; - for (i = 0; i < n * m; i++) { - printf("Output[%d] = %8d\n", i, A[i]); - } -} +#endif // Driver program void single_core() @@ -109,7 +101,7 @@ void multi_core_memsized() mempool_barrier_init(core_id); init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, M, M, core_id); + init_matrix_zeros(inv, N, M, core_id); if (core_id == 0) { flag = 0U; } @@ -127,6 +119,42 @@ void multi_core_memsized() mempool_barrier(num_cores); } +#ifdef FOLDED +void multi_core_folded() +{ + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t nPE = N_USED_BANKS >> 2U; + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + + mempool_start_benchmark(); + fold_matrix(matrix, 
folded_matrix, N); + mempool_stop_benchmark(); + if(core_id < nPE) { + mempool_start_benchmark(); + mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); + #ifdef VERBOSE + if (core_id == 0) + display_folded(inv, M, N); + #endif + mempool_barrier(num_cores); + +} +#endif + int main() { #if defined(SINGLE) single_core(); @@ -134,6 +162,8 @@ int main() { multi_core(); #elif defined(MEMSIZED) multi_core_memsized(); + #elif defined(FOLDED) + multi_core_folded(); #endif return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h new file mode 100644 index 000000000..5dc0aefc8 --- /dev/null +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h @@ -0,0 +1,287 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* GAUSS JORDAN INVERSION */ + +int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE); +void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n); + + +void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n) { + uint32_t core_id = mempool_get_core_id(); + uint32_t i, j, k, shift; + for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDst[shift + j] = pSrc[i]; + pDst[shift + j + 1] = pSrc[i + 1]; + pDst[shift + j + 2] = pSrc[i + 2]; + pDst[shift + j + 3] = pSrc[i + 3]; + } + mempool_log_barrier(2, core_id); +} + +int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE) { + + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* 
Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t shift = 0; + uint32_t i, j, k, l; /* loop counters */ + uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */ + + /* CREATE THE IDENTITY MATRIX */ + pDstT1 = pDst; + for (i = core_id * 4; i < n * m; i += nPE * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDstT1[shift + j] = (uint32_t) (k == j); + pDstT1[shift + j + 1] = (uint32_t) (k == (j + 1)); + pDstT1[shift + j + 2] = (uint32_t) (k == (j + 2)); + pDstT1[shift + j + 3] = (uint32_t) (k == (j + 3)); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { + + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pDstT1 = pDst + shift; + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT2 = pSrc + shift; + pDstT2 = pDst + shift; + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over columns to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] =
Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over columns */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pPivotRowIn = pSrc + shift; + pPivotRowDst = pDst + shift; + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ?
core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + + /* Loop over columns */ + core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { + /* Only the columns to the right of the pivot are to be processed */ + if (k != l) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pSrcT2 = pDst + shift; + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = 
pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + core_id = absolute_core_id % (n >> 2U); + /* Loop over the columns */ + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + return 0; +} From 4dca2cf15e00e3b486adffe9591dedde9b79edbb Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 3 Aug 2022 11:00:33 +0200 Subject: [PATCH 13/22] [software] Let single core handle exchange in parallel implementation --- software/apps/mat_inv/mempool_mat_inv_q32p.h | 165 ++++++++----------- 1 file changed, 67 insertions(+), 98 deletions(-) diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h index 320fe709b..952d06fc4 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h @@ -6,10 +6,6 @@ /* GAUSS JORDAN INVERSION */ -dump(l, 1); -dump(loopCnt, 2); -dump(i, 3); - int 
mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { @@ -53,107 +49,82 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla in = *pSrcT1; /* CHECK IF PIVOT ELEMENT IS ZERO */ - - if (in == 0U) { - - if (core_id == 0) { - k = 1U; - while (k < m - l) { - pSrcT2 = pSrcT1 + k * n; + if (core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ if (*pSrcT2 != 0) { - *flag = k; + /* Loop over columns to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over columns */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; break; } - k++; } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - - //k = 1U + core_id; - //while ((k < m - l) && (*flag == 0)) { - // pSrcT2 = pSrcT1 + k * n; - // if (*pSrcT2 != 0) { -
// __atomic_store_n(flag, k, __ATOMIC_RELAXED); - // } - // k += MIN(n / 4, NUM_CORES); - // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - //} - - /* EXCHANGE */ - - if (*flag != 0U) { - pSrcT2 = pSrcT1 + (n * *flag); - pDstT2 = pDstT1 + (n * *flag); - /* Loop over columns to the right of pivot */ - //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); - //while (j < 4 * ((n - l) >> 2U)) { - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - // j += 4 * NUM_CORES; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - } - /* Loop over columns */ - for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; } + 
mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ pPivotRowIn = pSrc + (l * n); pPivotRowDst = pDst + (l * n); @@ -164,8 +135,6 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla in = *pPivotRowIn; ///* Loop over columns to the right of pivot */ - // j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); - // while (j < 4 * ((n - l) >> 2U)) { for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; @@ -216,7 +185,6 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla /* REPLACE ROWS */ - pSrcT1 = pSrc; pSrcT2 = pDst; /* Loop over rows */ @@ -282,6 +250,7 @@ int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *fla } mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); +// /* REPLACE ROWS */ // pSrcT1 = pSrc; // pSrcT2 = pDst; // /* Loop over rows */ From 0ca5b681ebe3d6281d20094a7540f491abddead3 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 3 Aug 2022 11:01:34 +0200 Subject: [PATCH 14/22] [software] Add code for unrolling in single-core --- software/apps/mat_inv/mempool_mat_inv_q32s.h | 111 ++++++++++++------- 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h index 70fff05a2..21aadbe39 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32s.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h @@ -20,37 +20,24 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { int32_t in1, in2, in3, in4; int32_t out1, out2, out3, out4; - uint32_t i, rowCnt, j, loopCnt, k, l; /* loop counters */ - uint32_t flag; - uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ + uint32_t m = n; /* M is the number of rows. However, the matrices must be square. 
*/ + uint32_t i, j, k, l; /* loop counters */ + uint32_t flag = 0U; /* Flag to check if the matrix is singular */ pDstT1 = pDst; /* Working pointer for destination matrix */ - rowCnt = m; /* Loop over the number of rows */ - flag = 0U; - /* CREATE THE IDENTITY MATRIX */ - - while (rowCnt > 0U) { - j = m - rowCnt; - while (j > 0U) { - *pDstT1++ = 0; - j--; - } - *pDstT1++ = 1; - j = rowCnt - 1U; - while (j > 0U) { - *pDstT1++ = 0; - j--; + for (k = 0; k < m; k += 4) { + for (j = 0; j < n; j++) { + pDstT1[k * m + j] = (uint32_t) (k == j); + pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); + pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); + pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); } - rowCnt--; } - /* Loop over the number of columns of the input matrix. */ - loopCnt = n; /* Index modifier to navigate through the columns */ l = 0U; - - while (loopCnt > 0U) { + while (l < n) { pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); @@ -166,6 +153,32 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { *pSrcT1++ = FIX_DIV(in1, in); j++; } + //switch ((n - l) % 4) { + // case 3: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // *pSrcT1++ = out1; + // *pSrcT1++ = out2; + // *pSrcT1++ = out3; + // break; + // case 2: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // *pSrcT1++ = out1; + // *pSrcT1++ = out2; + // break; + // case 1: + // in1 = *pSrcT1; + // out1 = FIX_DIV(in1, in); + // *pSrcT1++ = out1; + // break; + //} /* Loop over number of columns of the destination matrix */ j = 0; while (j < 4 * (n >> 2U)) { @@ -207,20 +220,20 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; j = 0; - while (j < 2 * ((n - l) >> 1U)) { + while (j < 4 * ((n - l) >> 2U)) { in1 = *pSrcT1; in2 = *(pSrcT1 + 1); - 
// in3 = *(pSrcT1 + 2); - // in4 = *(pSrcT1 + 3); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); out1 = *pPRT_in++; out2 = *pPRT_in++; - // out3 = *pPRT_in++; - // out4 = *pPRT_in++; + out3 = *pPRT_in++; + out4 = *pPRT_in++; *pSrcT1++ = in1 - FIX_MUL(in, out1); *pSrcT1++ = in2 - FIX_MUL(in, out2); - // *pSrcT1++ = in3 - FIX_MUL(in, out3); - // *pSrcT1++ = in4 - FIX_MUL(in, out4); - j += 2; + *pSrcT1++ = in3 - FIX_MUL(in, out3); + *pSrcT1++ = in4 - FIX_MUL(in, out4); + j += 4; } while (j < n - l) { in1 = *pSrcT1; @@ -228,6 +241,32 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { *pSrcT1++ = in1 - FIX_MUL(in, out1); j++; } + //switch ((n - l) % 4) { + // case 3: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // out1 = *pPRT_in++; + // out2 = *pPRT_in++; + // out3 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // *pSrcT1++ = in2 - FIX_MUL(in, out2); + // *pSrcT1++ = in3 - FIX_MUL(in, out3); + // break; + // case 2: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // out1 = *pPRT_in++; + // out2 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // *pSrcT1++ = in2 - FIX_MUL(in, out2); + // break; + // case 1: + // in1 = *pSrcT1; + // out1 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // break; + //} /* Loop over the number of columns to replace the elements in the destination matrix */ j = 0; @@ -262,19 +301,9 @@ int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { } pSrc++; /* Increment the input pointer */ - loopCnt--; /* Decrement the loop counter */ l++; /* Increment the index modifier */ } -// if ((flag != 1U) && (in == 0)) { -// for (i = 0; i < m * n; i++) { -// if (pSrc[i] != 0) -// break; -// } -// if (i == m * n) -// return 1; -// } - return 0; } From b42e968713d47bc536c03518f5eb6741eca1e9bc Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 3 Aug 2022 11:02:43 +0200 Subject: [PATCH 15/22] [software] Add parallelization schemes in memory shaped version --- 
.../mat_inv/mempool_mat_inv_q32p_memsized.h | 638 +++++++++++------- 1 file changed, 409 insertions(+), 229 deletions(-) diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h index 496459e19..961aefd58 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h @@ -22,167 +22,230 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint uint32_t absolute_core_id = mempool_get_core_id(); uint32_t core_id = absolute_core_id; - uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t i, j, k, l; /* loop counters */ uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ /* CREATE THE IDENTITY MATRIX */ pDstT1 = pDst; - for (k = 0; k < m; k++) { - core_id = absolute_core_id - ((n * k) / 4) % N_BANKS; - core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; - for (j = core_id * 4; j < m; j += 4 * NUM_CORES) { - pDstT1[k * m + j] = (uint32_t) (k == j); - pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); - pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); - pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); + for (k = core_id * 4; k < m; k += NUM_CORES * 4) { + for (j = 0; j < n; j++) { + pDstT1[k * n + j] = (uint32_t) (k == j); + pDstT1[(k + 1) * n + j] = (uint32_t) ((k + 1) == j); + pDstT1[(k + 2) * n + j] = (uint32_t) ((k + 2) == j); + pDstT1[(k + 3) * n + j] = (uint32_t) ((k + 3) == j); } } - mempool_log_barrier(2, absolute_core_id); +// pDstT1 = pDst; +// for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) { +// k = i / n; +// j = i % n; +// pDstT1[k * n + j] = (uint32_t) (k == j); +// pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1)); +// pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2)); +// pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3)); +// } +// mempool_log_barrier(2, absolute_core_id); - /* Loop over the number of columns of the input 
matrix. */ - loopCnt = n; /* Index modifier to navigate through the columns */ l = 0U; - - while (loopCnt > 0U) { + while (l < n) { pSrcT1 = pSrc + (l * n); pDstT1 = pDst + (l * n); - core_id = absolute_core_id - ((l * n) / 4) % N_BANKS; - core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; in = *pSrcT1; /* CHECK IF PIVOT ELEMENT IS ZERO */ - - if (in == 0U) { - - if (absolute_core_id == 0) { - k = 1U; - while (k < m - l) { - pSrcT2 = pSrcT1 + k * n; + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ if (*pSrcT2 != 0) { - *flag = k; + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; break; } - k++; } } - mempool_log_barrier(2, absolute_core_id); - - /* EXCHANGE */ - - if (*flag != 0U) { - pSrcT2 = pSrcT1 + (n * *flag); - pDstT2 = pDstT1 
+ (n * *flag); - - /* Loop over columns to the right of pivot */ - //j = core_id * 4 > 4 * (l >> 2U) ? core_id * 4 : 4 * ((n - l) >> 2U); - //while (j < 4 * ((n - l) >> 2U)) { - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += 4 * NUM_CORES) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - // j += 4 * NUM_CORES; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - } - /* Loop over columns */ - for (j = core_id * 4; j < 4 * (n >> 2U); j += 4 * NUM_CORES) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; } - mempool_log_barrier(2, absolute_core_id); - } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; + // /* DIVIDE BY THE PIVOT */ + // /* Points to the pivot row of input and destination matrices */ + // pPivotRowIn = pSrc + (l * n); + // pPivotRowDst = pDst + (l * n); + // /* Temporary pointers to the pivot row pointers */ + // pSrcT1 = pPivotRowIn; + // pSrcT2 = pPivotRowDst; + // /* Pivot element 
of the row */ + // in = *pPivotRowIn; + // /* Loop over number of columns to the right of the pilot element */ + // j = 0; + // while (j < 4 * ((n - l) >> 2U)) { + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // in4 = *(pSrcT1 + 3); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // out4 = FIX_DIV(in4, in); + // *pSrcT1++ = out1; + // *pSrcT1++ = out2; + // *pSrcT1++ = out3; + // *pSrcT1++ = out4; + // j += 4; + // } + // while (j < n - l) { + // in1 = *pSrcT1; + // *pSrcT1++ = FIX_DIV(in1, in); + // j++; + // } + // /* Loop over number of columns of the destination matrix */ + // j = 0; + // while (j < 4 * (n >> 2U)) { + // in1 = *pSrcT2; + // in2 = *(pSrcT2 + 1); + // in3 = *(pSrcT2 + 2); + // in4 = *(pSrcT2 + 3); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // out4 = FIX_DIV(in4, in); + // *pSrcT2++ = out1; + // *pSrcT2++ = out2; + // *pSrcT2++ = out3; + // *pSrcT2++ = out4; + // j += 4; + // } + // while (j < n) { + // in1 = *pSrcT2; + // *pSrcT2++ = FIX_DIV(in1, in); + // j++; + // } } - + mempool_log_barrier(2, absolute_core_id); + //pPivotRowIn = pSrc + (l * n); + //pPivotRowDst = pDst + (l * n); /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ pPivotRowIn = pSrc + (l * n); pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ pSrcT1 = pPivotRowIn; pSrcT2 = pPivotRowDst; /* Pivot element of the row */ in = *pPivotRowIn; - - ///* Loop over columns to the right of pivot */ - // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n - l) >> 2U); - // while (j < 4 * ((n - l) >> 2U)) { - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; - // j += NUM_CORES * 4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U); + core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; + //for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // out4 = FIX_DIV(in4, in); + // pSrcT1[j] = out1; + // pSrcT1[j + 1] = out2; + // pSrcT1[j + 2] = out3; + // pSrcT1[j + 3] = out4; + //} + //if (core_id == 0) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // in1 = pSrcT1[j]; + // pSrcT1[j] = FIX_DIV(in1, in); + // j++; + // } + //} + if(core_id == 0) { + j = 0; + while (j < 4 - l % 4) { in1 = pSrcT1[j]; - // pSrcT1[j] = FIX_DIV(in1, in); + pSrcT1[j] = FIX_DIV(in1, in); j++; } + } else { + j = core_id * 4 - l % 4; + if (j < (n - l)) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } } /* Loop over columns */ + core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U); + core_id = core_id > NUM_CORES ? 
core_id + NUM_CORES : core_id; for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { in1 = pSrcT2[j]; in2 = pSrcT2[j + 1]; @@ -197,111 +260,51 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint pSrcT2[j + 2] = out3; pSrcT2[j + 3] = out4; } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); - j++; - } - } + //if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pSrcT2[j]; + // pSrcT2[j] = FIX_DIV(in1, in); + // j++; + // } + //} mempool_log_barrier(2, absolute_core_id); - /* REPLACE ROWS */ - core_id = absolute_core_id; pSrcT1 = pSrc; pSrcT2 = pDst; - /* Loop over rows */ -// for (k = core_id * 4; k < m; k += NUM_CORES * 4) { -// i = 0U; -// while (i < 4) { -// if ((i + k) != l) { -// pSrcT1 = pSrc + (i + k) * n; -// pSrcT2 = pDst + (i + k) * n; -// /* Element of the reference row */ -// in = *pSrcT1; -// pPRT_in = pPivotRowIn; -// pPRT_pDst = pPivotRowDst; -// /* Loop over columns to the right of pivot */ -// j = 0; -// while (j < 4 * ((n - l) >> 2U)) { -// in1 = pSrcT1[j]; -// in2 = pSrcT1[j + 1]; -// in3 = pSrcT1[j + 2]; -// in4 = pSrcT1[j + 3]; -// out1 = pPRT_in[j]; -// out2 = pPRT_in[j + 1]; -// out3 = pPRT_in[j + 2]; -// out4 = pPRT_in[j + 3]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4; -// } -// while (j < n - l) { -// in1 = pSrcT1[j]; -// out1 = pPRT_in[j]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// /* Loop over columns */ -// j = 0; -// while (j < 4 * (n >> 2U)) { -// in1 = pSrcT2[j]; -// in2 = pSrcT2[j + 1]; -// in3 = pSrcT2[j + 2]; -// in4 = pSrcT2[j + 3]; -// out1 = pPRT_pDst[j]; -// out2 = pPRT_pDst[j + 1]; -// out3 = pPRT_pDst[j + 2]; -// out4 = pPRT_pDst[j + 3]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// pSrcT2[j + 1] = in2 - FIX_MUL(in, 
out2); -// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4; -// } -// while (j < n) { -// in1 = pSrcT2[j]; -// out1 = pPRT_pDst[j]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// } -// i++; -// } -// } -// mempool_log_barrier(2, absolute_core_id); - for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { - dump_i(absolute_core_id); + /* Only the columns to the right of the pivot are to be processed */ if (k != l) { + pSrcT1 = pSrc + k * n; pSrcT2 = pDst + k * n; - core_id = absolute_core_id % (n >> 2U); /* Element of the reference row */ in = *pSrcT1; + /* Reference row pointers */ pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; + + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); j = core_id * 4; while (j < 4 * ((n - l) >> 2U)) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; out1 = pPRT_in[j]; out2 = pPRT_in[j + 1]; out3 = pPRT_in[j + 2]; out4 = pPRT_in[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * NUM_CORES; + j += 4 * (n >> 2U); } - if (core_id == (n >> 2U) - 1) { + if (core_id == 0) { j = 4 * ((n - l) >> 2U); while (j < n - l) { in1 = pSrcT1[j]; @@ -310,49 +313,226 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint j++; } } - /* Loop over columns */ + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); j = core_id * 4; while (j < 4 * (n >> 2U)) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; out1 = pPRT_pDst[j]; out2 = pPRT_pDst[j + 1]; out3 = pPRT_pDst[j + 2]; out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = 
pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; pSrcT2[j] = in1 - FIX_MUL(in, out1); pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * NUM_CORES; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - out1 = pPRT_pDst[j]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - j++; - } + j += 4 * (n >> 2U); } + //if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pSrcT2[j]; + // out1 = pPRT_pDst[j]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + //} + + //uint32_t core_id_in; + //uint32_t core_id_Dst; + //int32_t p1_in, p2_in, p3_in, p4_in; + //int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst; + //core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U); + //core_id_Dst = absolute_core_id % (n >> 2U); + //j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4); + //i = core_id_Dst * 4; + //p1_in = pPRT_in[j]; + //p2_in = pPRT_in[j + 1]; + //p3_in = pPRT_in[j + 2]; + //p4_in = pPRT_in[j + 3]; + //p1_Dst = pPRT_pDst[i]; + //p2_Dst = pPRT_pDst[i + 1]; + //p3_Dst = pPRT_pDst[i + 2]; + //p4_Dst = pPRT_pDst[i + 3]; + //if(core_id_in == 0) { + // switch (4 - l % 4) { + // case (1): + // in1 = pSrcT1[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // break; + // case (2): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // break; + // case (3): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // break; + // case (4): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, 
p4_in); + // break; + // } + //} else { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); + //} + //in1 = pSrcT2[i]; + //in2 = pSrcT2[i + 1]; + //in3 = pSrcT2[i + 2]; + //in4 = pSrcT2[i + 3]; + //pSrcT2[i] = in1 - FIX_MUL(in, p1_Dst); + //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst); + //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst); + //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst); + } } mempool_log_barrier(2, absolute_core_id); +// /* REPLACE ROWS */ +// pSrcT1 = pSrc; +// pSrcT2 = pDst; +// for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) { +// k = i / n; +// if (k != l) { +// in = *(pSrc + k * n); +// j = i - (k * n); +// if (j >= 4 * (l >> 2U)) { +// if (j == 4 * (l >> 2U)) { +// pSrcT1 = pSrc + k * n; +// pPRT_in = pPivotRowIn; +// uint32_t bound = j + 4 - l; +// j = 0; +// while (j < bound) { +// in1 = *pSrcT1; +// out1 = *pPRT_in++; +// *pSrcT1++ = in1 - FIX_MUL(in, out1); +// j++; +// } +// } else { +// pSrcT1 = pSrc + (i - l); +// pPRT_in = pPivotRowIn + (j - l); +// in1 = *pSrcT1; +// in2 = *(pSrcT1 + 1); +// in3 = *(pSrcT1 + 2); +// in4 = *(pSrcT1 + 3); +// out1 = *pPRT_in++; +// out2 = *pPRT_in++; +// out3 = *pPRT_in++; +// out4 = *pPRT_in++; +// *pSrcT1++ = in1 - FIX_MUL(in, out1); +// *pSrcT1++ = in2 - FIX_MUL(in, out2); +// *pSrcT1++ = in3 - FIX_MUL(in, out3); +// *pSrcT1++ = in4 - FIX_MUL(in, out4); +// } +// } +// pSrcT2 = pDst + i; +// pPRT_pDst = pPivotRowDst + j; +// in1 = *pSrcT2; +// in2 = *(pSrcT2 + 1); +// in3 = *(pSrcT2 + 2); +// in4 = *(pSrcT2 + 3); +// out1 = *pPRT_pDst++; +// out2 = *pPRT_pDst++; +// out3 = *pPRT_pDst++; +// out4 = *pPRT_pDst++; +// *pSrcT2++ = in1 - FIX_MUL(in, out1); +// *pSrcT2++ = in2 - FIX_MUL(in, out2); +// *pSrcT2++ = in3 - FIX_MUL(in, out3); +// *pSrcT2++ = in4 - FIX_MUL(in, 
out4); +// } +// } +// mempool_log_barrier(2, absolute_core_id); +// /* REPLACE ROWS */ +// pSrcT1 = pSrc; +// pSrcT2 = pDst; +// core_id = absolute_core_id; +// for (k = core_id; k < m; k += NUM_CORES) { +// /* Only the columns to the right of the pivot are to be processed */ +// if (k != l) { +// pSrcT1 = pSrc + k * n; +// pSrcT2 = pDst + k * n; +// /* Element of the reference row */ +// in = *pSrcT1; +// /* Reference row pointers */ +// pPRT_in = pPivotRowIn; +// pPRT_pDst = pPivotRowDst; +// /* Loop over the columns */ +// j = 0; +// while (j < 4 * ((n - l) >> 2U)) { +// in1 = pSrcT1[j]; +// in2 = pSrcT1[j + 1]; +// in3 = pSrcT1[j + 2]; +// in4 = pSrcT1[j + 3]; +// out1 = pPRT_in[j]; +// out2 = pPRT_in[j + 1]; +// out3 = pPRT_in[j + 2]; +// out4 = pPRT_in[j + 3]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4; +// } +// while (j < n - l) { +// in1 = pSrcT1[j]; +// out1 = pPRT_in[j]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// /* Loop over the columns */ +// j = 0; +// while (j < 4 * (n >> 2U)) { +// in1 = pSrcT2[j]; +// in2 = pSrcT2[j + 1]; +// in3 = pSrcT2[j + 2]; +// in4 = pSrcT2[j + 3]; +// out1 = pPRT_pDst[j]; +// out2 = pPRT_pDst[j + 1]; +// out3 = pPRT_pDst[j + 2]; +// out4 = pPRT_pDst[j + 3]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4; +// } +// while (j < n) { +// in1 = pSrcT2[j]; +// out1 = pPRT_pDst[j]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// } +// } +// mempool_log_barrier(2, absolute_core_id); + pSrc++; /* Increment the input pointer */ - loopCnt--; /* Decrement the loop counter */ l++; /* Increment the index modifier */ } mempool_log_barrier(2, absolute_core_id); -// if ((flag != 1U) && (x == 0)) { -// for (i = 0; i < m * n; i++) { 
-// if (pSrc[i] != 0) -// break; -// } -// if (i == m * n) -// return 1; -// } return 0; } From 82f8f518cc6930b85959e54b716a74c9c4d148e2 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Thu, 4 Aug 2022 09:35:18 +0200 Subject: [PATCH 16/22] [software] Merge the two final steps of matrix inversion --- software/apps/mat_inv/main.c | 1 + .../mat_inv/mempool_mat_inv_q32p_memsized.h | 203 +++++++++++++----- 2 files changed, 145 insertions(+), 59 deletions(-) diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index f39cd6ac8..e0cb8741b 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -134,6 +134,7 @@ void multi_core_folded() init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); if (core_id == 0) { flag = 0U; + __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED); } mempool_barrier(num_cores); diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h index 961aefd58..6ec20a91b 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h @@ -6,6 +6,8 @@ /* GAUSS JORDAN INVERSION */ +uint32_t volatile pivot_barrier __attribute__((section(".l1"))); + int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { @@ -127,63 +129,8 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint if ((*flag == 0U) && (in == 0U)) { return 1; } - // /* DIVIDE BY THE PIVOT */ - // /* Points to the pivot row of input and destination matrices */ - // pPivotRowIn = pSrc + (l * n); - // pPivotRowDst = pDst + (l * n); - // /* Temporary pointers to the pivot row pointers */ - // pSrcT1 = pPivotRowIn; - // pSrcT2 = pPivotRowDst; - // /* Pivot element of the row */ - // in = *pPivotRowIn; - // /* Loop over number of columns to the right of the 
pilot element */ - // j = 0; - // while (j < 4 * ((n - l) >> 2U)) { - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // in3 = *(pSrcT1 + 2); - // in4 = *(pSrcT1 + 3); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // out4 = FIX_DIV(in4, in); - // *pSrcT1++ = out1; - // *pSrcT1++ = out2; - // *pSrcT1++ = out3; - // *pSrcT1++ = out4; - // j += 4; - // } - // while (j < n - l) { - // in1 = *pSrcT1; - // *pSrcT1++ = FIX_DIV(in1, in); - // j++; - // } - // /* Loop over number of columns of the destination matrix */ - // j = 0; - // while (j < 4 * (n >> 2U)) { - // in1 = *pSrcT2; - // in2 = *(pSrcT2 + 1); - // in3 = *(pSrcT2 + 2); - // in4 = *(pSrcT2 + 3); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // out4 = FIX_DIV(in4, in); - // *pSrcT2++ = out1; - // *pSrcT2++ = out2; - // *pSrcT2++ = out3; - // *pSrcT2++ = out4; - // j += 4; - // } - // while (j < n) { - // in1 = *pSrcT2; - // *pSrcT2++ = FIX_DIV(in1, in); - // j++; - // } } mempool_log_barrier(2, absolute_core_id); - //pPivotRowIn = pSrc + (l * n); - //pPivotRowDst = pDst + (l * n); /* DIVIDE BY THE PIVOT */ /* Points to the pivot row of input and destination matrices */ @@ -276,7 +223,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { /* Only the columns to the right of the pivot are to be processed */ if (k != l) { - pSrcT1 = pSrc + k * n; pSrcT2 = pDst + k * n; /* Element of the reference row */ @@ -284,7 +230,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint /* Reference row pointers */ pPRT_in = pPivotRowIn; pPRT_pDst = pPivotRowDst; - /* Loop over the columns */ core_id = absolute_core_id % (n >> 2U); core_id = core_id - (l >> 2U); @@ -340,7 +285,6 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint // j++; // } //} - //uint32_t core_id_in; 
//uint32_t core_id_Dst; //int32_t p1_in, p2_in, p3_in, p4_in; @@ -406,11 +350,152 @@ int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst); //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst); //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst); - } } mempool_log_barrier(2, absolute_core_id); +// /* REPLACE ROWS */ +// pSrcT1 = pSrc; +// pSrcT2 = pDst; +// /* Reference row pointers */ +// pPRT_in = pSrc + (l * n); +// pPRT_pDst = pDst + (l * n); +// int32_t pivot = *pPRT_in; +// uint32_t nPE = (n >> 2U); +// uint32_t check = 0; +// if (absolute_core_id >= m * nPE) +// mempool_wfi(); +// for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) { +// /* Only the columns to the right of the pivot are to be processed */ +// if (k != l) { +// pSrcT1 = pSrc + k * n; +// pSrcT2 = pDst + k * n; +// /* Element of the reference row */ +// in = *pSrcT1; +// /* Loop over the columns */ +// core_id = absolute_core_id % nPE; +// core_id = core_id - (l >> 2U); +// j = core_id * 4; +// while (j < 4 * ((n - l) >> 2U)) { +// out1 = pPRT_in[j]; +// out2 = pPRT_in[j + 1]; +// out3 = pPRT_in[j + 2]; +// out4 = pPRT_in[j + 3]; +// out1 = FIX_DIV(out1, pivot); +// out2 = FIX_DIV(out2, pivot); +// out3 = FIX_DIV(out3, pivot); +// out4 = FIX_DIV(out4, pivot); +// in1 = pSrcT1[j]; +// in2 = pSrcT1[j + 1]; +// in3 = pSrcT1[j + 2]; +// in4 = pSrcT1[j + 3]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4 * (n >> 2U); +// } +// if (core_id == 0) { +// j = 4 * ((n - l) >> 2U); +// while (j < n - l) { +// out1 = pPRT_in[j]; +// out1 = FIX_DIV(out1, pivot); +// in1 = pSrcT1[j]; +// pSrcT1[j] = in1 - FIX_MUL(in, out1); +// j++; +// } +// } +// /* Loop over the columns */ +// core_id = absolute_core_id % nPE; +// j = core_id * 4; +// while (j < 4 * (n >> 2U)) { +// out1 = pPRT_pDst[j]; +// out2 = 
pPRT_pDst[j + 1]; +// out3 = pPRT_pDst[j + 2]; +// out4 = pPRT_pDst[j + 3]; +// out1 = FIX_DIV(out1, pivot); +// out2 = FIX_DIV(out2, pivot); +// out3 = FIX_DIV(out3, pivot); +// out4 = FIX_DIV(out4, pivot); +// in1 = pSrcT2[j]; +// in2 = pSrcT2[j + 1]; +// in3 = pSrcT2[j + 2]; +// in4 = pSrcT2[j + 3]; +// pSrcT2[j] = in1 - FIX_MUL(in, out1); +// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); +// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); +// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); +// j += 4 * nPE; +// } +// __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED); +// mempool_wfi(); +// } else { +// do { +// check = __atomic_fetch_add(&pivot_barrier, 0, __ATOMIC_RELAXED); +// mempool_wait(20); +// } while (check < ((m - 1) * nPE)); +// /* Loop over the columns */ +// core_id = absolute_core_id % (n >> 2U); +// core_id = core_id - (l >> 2U); +// j = core_id * 4; +// while (j < 4 * ((n - l) >> 2U)) { +// in1 = pPRT_in[j]; +// in2 = pPRT_in[j + 1]; +// in3 = pPRT_in[j + 2]; +// in4 = pPRT_in[j + 3]; +// out1 = FIX_DIV(in1, pivot); +// out2 = FIX_DIV(in2, pivot); +// out3 = FIX_DIV(in3, pivot); +// out4 = FIX_DIV(in4, pivot); +// pPRT_in[j] = out1; +// pPRT_in[j + 1] = out2; +// pPRT_in[j + 2] = out3; +// pPRT_in[j + 3] = out4; +// j += 4 * (n >> 2U); +// } +// if (core_id == 0) { +// j = 4 * ((n - l) >> 2U); +// while (j < n - l) { +// in1 = pPRT_in[j]; +// pPRT_in[j] = FIX_DIV(in1, pivot); +// j++; +// } +// } +// /* Loop over the columns */ +// core_id = absolute_core_id % (n >> 2U); +// j = core_id * 4; +// while (j < 4 * (n >> 2U)) { +// in1 = pPRT_pDst[j]; +// in2 = pPRT_pDst[j + 1]; +// in3 = pPRT_pDst[j + 2]; +// in4 = pPRT_pDst[j + 3]; +// out1 = FIX_DIV(in1, pivot); +// out2 = FIX_DIV(in2, pivot); +// out3 = FIX_DIV(in3, pivot); +// out4 = FIX_DIV(in4, pivot); +// pPRT_pDst[j] = out1; +// pPRT_pDst[j + 1] = out2; +// pPRT_pDst[j + 2] = out3; +// pPRT_pDst[j + 3] = out4; +// j += 4 * (n >> 2U); +// } +// if (core_id == (n >> 2U) - 1) { +// j = 4 * (n >> 2U); +// 
while (j < n) { +// in1 = pPRT_pDst[j]; +// pPRT_pDst[j] = FIX_DIV(in1, pivot); +// j++; +// } +// } +// if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED)) { +// __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED); +// __sync_synchronize(); +// wake_up_all(); +// } +// mempool_wfi(); +// } +// } + // /* REPLACE ROWS */ // pSrcT1 = pSrc; // pSrcT2 = pDst; From 8acd2602fa37c713da6e028771a21e9c53763d99 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Tue, 27 Sep 2022 09:40:03 +0200 Subject: [PATCH 17/22] [software] Correct lint errors --- software/apps/mat_inv/initialization.h | 94 +- software/apps/mat_inv/main.c | 235 ++-- software/apps/mat_inv/mempool_mat_inv_q32p.h | 601 ++++----- .../mat_inv/mempool_mat_inv_q32p_folded.h | 511 +++---- .../mat_inv/mempool_mat_inv_q32p_memsized.h | 1175 +++++++++-------- software/apps/mat_inv/mempool_mat_inv_q32s.h | 517 ++++---- software/apps/svd/main.c | 11 +- software/apps/svd/nrutil.h | 82 +- software/apps/svd/svd.c | 445 ++++--- 9 files changed, 1857 insertions(+), 1814 deletions(-) diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h index ec330e766..6e48e7951 100644 --- a/software/apps/mat_inv/initialization.h +++ b/software/apps/mat_inv/initialization.h @@ -5,9 +5,9 @@ // Author: Marco Bertuletti, ETH Zurich #define FIXED_POINT 16 -#define FIX_DIV(a,b) ((int32_t)((a << FIXED_POINT) / b)) -#define FIX_MUL(a,b) ((int32_t)((a * b) >> FIXED_POINT)) -#define MIN(a,b) (a < b ? a : b) +#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b)) +#define FIX_MUL(a, b) ((int32_t)((a * b) >> FIXED_POINT)) +#define MIN(a, b) (a < b ? 
a : b) dump(l, 1); dump(loopCnt, 2); @@ -21,75 +21,81 @@ void display_folded(int32_t *A, int32_t n, int32_t m); void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m); -void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n, int32_t m, int32_t o); +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, + int32_t n, int32_t m, int32_t o); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id); +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + int32_t a, int32_t b, int32_t c, uint32_t core_id); -void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id); +void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + uint32_t core_id); void display(int32_t *A, int32_t n, int32_t m) { - //int32_t i, j; - //for (i = 0; i < n; i++) { - // for (j = 0; j < m; j++) { - // printf("%8d ", A[i * m + j]); - // } - // printf("\n"); - //} - int32_t i; - for (i = 0; i < n * m; i++) { - printf("Output[%d] = %8d\n", i, A[i]); - } + // int32_t i, j; + // for (i = 0; i < n; i++) { + // for (j = 0; j < m; j++) { + // printf("%8d ", A[i * m + j]); + // } + // printf("\n"); + //} + int32_t i; + for (i = 0; i < n * m; i++) { + printf("Output[%d] = %8d\n", i, A[i]); + } } #ifdef FOLDED void display_folded(int32_t *A, int32_t n, int32_t m) { - int32_t i, j, k, shift; - for (i = 0; i < n * m; i++) { - k = i / n; - j = i % n; - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - printf("Output[%d] = %8d\n", i, A[shift + j]); - } + int32_t i, j, k, shift; + for (i = 0; i < n * m; i++) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + printf("Output[%d] = %8d\n", i, A[shift + j]); + } } #endif -void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) { +void Transpose(int32_t 
*matrix, int32_t *t_matrix, int32_t n, int32_t m) { int32_t i, j; for (i = 0; i < n; i++) { - for (j = 0; j < m; j++) { - t_matrix[j * n + i] = matrix[i * m + j]; - } + for (j = 0; j < m; j++) { + t_matrix[j * n + i] = matrix[i * m + j]; + } } } -void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, int32_t n, int32_t m, int32_t o) { +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, + int32_t n, int32_t m, int32_t o) { int32_t i, j, k; for (i = 0; i < n; i++) { - for (j = 0; j < o; j++) { - matrix_product[i * o + j] = 0; - for (k = 0; k < m; k++) { - matrix_product[i * o + j] += FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]); + for (j = 0; j < o; j++) { + matrix_product[i * o + j] = 0; + for (k = 0; k < m; k++) { + matrix_product[i * o + j] += + FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]); } } } } -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, int32_t a, int32_t b, int32_t c, uint32_t core_id) { - if(core_id == 0) { - for(uint32_t j = 0; j < num_rows; j++) { - for(uint32_t i = 0; i < num_columns; i++) { - matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c; +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + int32_t a, int32_t b, int32_t c, uint32_t core_id) { + if (core_id == 0) { + for (uint32_t j = 0; j < num_rows; j++) { + for (uint32_t i = 0; i < num_columns; i++) { + matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c; } } } } - -void init_matrix_zeros (int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id) { - if(core_id == 0) { - for(uint32_t i = 0; i < num_columns; i++) { - for(uint32_t j = 0; j < num_rows; j++) { - matrix[j * num_columns + i] = 0; +void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + uint32_t core_id) { + if (core_id == 0) { + for (uint32_t i = 0; i < num_columns; i++) { + for (uint32_t j = 0; j < num_rows; j++) { + matrix[j * num_columns + i] 
= 0; } } } diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index e0cb8741b..ebe4eca06 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -22,149 +22,146 @@ // #define FOLDED #include "initialization.h" -#include "mempool_mat_inv_q32s.h" #include "mempool_mat_inv_q32p.h" -#include "mempool_mat_inv_q32p_memsized.h" #include "mempool_mat_inv_q32p_folded.h" +#include "mempool_mat_inv_q32p_memsized.h" +#include "mempool_mat_inv_q32s.h" #ifdef FOLDED -int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)] __attribute__((aligned(N_BANKS), section(".l1"))); -uint32_t flag __attribute__((section(".l1"))); +int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)] + __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)] + __attribute__((aligned(N_BANKS), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); #else -int32_t matrix[N * M] __attribute__((aligned(N), section(".l1"))); -int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); -uint32_t flag __attribute__((section(".l1"))); +int32_t matrix[N * M] __attribute__((aligned(N), section(".l1"))); +int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); #endif // Driver program -void single_core() -{ - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, M, M, core_id); - mempool_barrier(num_cores); - - if(core_id == 0) { - mempool_start_benchmark(); - mempool_GJinv_q32s(matrix, inv, M); - 
mempool_stop_benchmark(); - } - mempool_barrier(num_cores); - #ifdef VERBOSE - if (core_id == 0) - display(inv, N, M); - #endif - mempool_barrier(num_cores); -} +void single_core() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); -void multi_core() -{ - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, M, M, core_id); - if (core_id == 0) { - flag = 0U; - } - mempool_barrier(num_cores); - - if (core_id < MIN(NUM_CORES, N / 4)) { - mempool_start_benchmark(); - mempool_GJinv_q32p(matrix, inv, M, &flag); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); - #ifdef VERBOSE - if (core_id == 0) - display(inv, M, N); - #endif - mempool_barrier(num_cores); + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, M, M, core_id); + mempool_barrier(num_cores); + + if (core_id == 0) { + mempool_start_benchmark(); + mempool_GJinv_q32s(matrix, inv, M); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); +#ifdef VERBOSE + if (core_id == 0) + display(inv, N, M); +#endif + mempool_barrier(num_cores); } -void multi_core_memsized() -{ +void multi_core() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, N, M, core_id); - if (core_id == 0) { - flag = 0U; - } - mempool_barrier(num_cores); + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, M, M, 
core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + if (core_id < MIN(NUM_CORES, N / 4)) { mempool_start_benchmark(); - mempool_GJinv_q32p_memsized(matrix, inv, M, &flag); + mempool_GJinv_q32p(matrix, inv, M, &flag); mempool_stop_benchmark(); + } + mempool_barrier(num_cores); +#ifdef VERBOSE + if (core_id == 0) + display(inv, M, N); +#endif + mempool_barrier(num_cores); +} + +void multi_core_memsized() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, N, M, core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + + mempool_start_benchmark(); + mempool_GJinv_q32p_memsized(matrix, inv, M, &flag); + mempool_stop_benchmark(); - mempool_barrier(num_cores); - #ifdef VERBOSE - if (core_id == 0) - display(inv, M, N); - #endif - mempool_barrier(num_cores); + mempool_barrier(num_cores); +#ifdef VERBOSE + if (core_id == 0) + display(inv, M, N); +#endif + mempool_barrier(num_cores); } #ifdef FOLDED -void multi_core_folded() -{ - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t nPE = N_USED_BANKS >> 2U; - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); - init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); - if (core_id == 0) { - flag = 0U; - __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED); - } - mempool_barrier(num_cores); - +void multi_core_folded() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t nPE = N_USED_BANKS >> 2U; + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + + init_matrix(matrix, N, M, -156, 427, 
-219, core_id); + init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + if (core_id == 0) { + flag = 0U; + __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED); + } + mempool_barrier(num_cores); + + mempool_start_benchmark(); + fold_matrix(matrix, folded_matrix, N); + mempool_stop_benchmark(); + if (core_id < nPE) { mempool_start_benchmark(); - fold_matrix(matrix, folded_matrix, N); + mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE); mempool_stop_benchmark(); - if(core_id < nPE) { - mempool_start_benchmark(); - mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); - #ifdef VERBOSE - if (core_id == 0) - display_folded(inv, M, N); - #endif - mempool_barrier(num_cores); - + } + mempool_barrier(num_cores); +#ifdef VERBOSE + if (core_id == 0) + display_folded(inv, M, N); +#endif + mempool_barrier(num_cores); } #endif int main() { - #if defined(SINGLE) - single_core(); - #elif defined(PARALLEL) - multi_core(); - #elif defined(MEMSIZED) - multi_core_memsized(); - #elif defined(FOLDED) - multi_core_folded(); - #endif - return 0; +#if defined(SINGLE) + single_core(); +#elif defined(PARALLEL) + multi_core(); +#elif defined(MEMSIZED) + multi_core_memsized(); +#elif defined(FOLDED) + multi_core_folded(); +#endif + return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h index 952d06fc4..09e2b449f 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h @@ -6,332 +6,335 @@ /* GAUSS JORDAN INVERSION */ -int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); +int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag); -int mempool_GJinv_q32p(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { +int mempool_GJinv_q32p(int32_t *pSrc, 
int32_t *pDst, uint32_t n, + uint32_t *flag) { - int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; - uint32_t core_id = mempool_get_core_id(); - uint32_t i, j, loopCnt, k, l; /* loop counters */ - uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ + uint32_t core_id = mempool_get_core_id(); + uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matirces must be square. 
*/ - /* CREATE THE IDENTITY MATRIX */ + /* CREATE THE IDENTITY MATRIX */ - pDstT1 = pDst; - for (k = core_id * 4; k < m; k += 4 * NUM_CORES) { - for (j = 0; j < m; j++) { - pDstT1[k * m + j] = (uint32_t) (k == j); - pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); - pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); - pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); - } + pDstT1 = pDst; + for (k = core_id * 4; k < m; k += 4 * NUM_CORES) { + for (j = 0; j < m; j++) { + pDstT1[k * m + j] = (uint32_t)(k == j); + pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j); } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - /* Loop over the number of columns of the input matrix. */ - loopCnt = n; - /* Index modifier to navigate through the columns */ - l = 0U; + /* Loop over the number of columns of the input matrix. 
*/ + loopCnt = n; + /* Index modifier to navigate through the columns */ + l = 0U; - while (loopCnt > 0U) { + while (loopCnt > 0U) { - pSrcT1 = pSrc + (l * n); - pDstT1 = pDst + (l * n); - in = *pSrcT1; + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + in = *pSrcT1; - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - pSrcT2 = pSrc + (n * k); - pDstT2 = pDst + (n * k); - /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while 
(j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - pPivotRowIn = pSrc + (l * n); - pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 
= pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; + ///* Loop over columns to the right of pivot */ + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + // j += NUM_CORES * 4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - ///* Loop over columns to the right of pivot */ - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + /* Loop over rows */ + for (k = core_id * 4; k < m; k += NUM_CORES * 4) { + i = 0U; + while (i < 4) { + if ((i + k) != l) { + pSrcT1 = pSrc + (i + k) * n; + pSrcT2 = pDst + (i + k) * n; + /* Element of the reference row */ + in = *pSrcT1; + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over columns to the right of pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; in3 = pSrcT1[j + 2]; in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - 
pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; - // j += NUM_CORES * 4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; - } - } - /* Loop over columns */ - for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + /* Loop over columns */ + j = 0; + while (j < 4 * (n >> 2U)) { in1 = pSrcT2[j]; in2 = pSrcT2[j + 1]; in3 = pSrcT2[j + 2]; in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); - j++; - } - } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - /* Loop over rows */ - for (k = core_id * 4; k < m; k += NUM_CORES * 4) { - i = 0U; - while (i < 4) { - if ((i + k) != l) { - pSrcT1 = pSrc + (i + k) * n; - pSrcT2 = pDst + (i + k) * n; - /* Element of the reference row */ - in = *pSrcT1; - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over columns to the right of pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = in2 - 
FIX_MUL(in, out2); - pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - /* Loop over columns */ - j = 0; - while (j < 4 * (n >> 2U)) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n) { - in1 = pSrcT2[j]; - out1 = pPRT_pDst[j]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - i++; - } + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + i++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); -// /* REPLACE ROWS */ -// pSrcT1 = pSrc; -// pSrcT2 = pDst; -// /* Loop over rows */ -// for (k = 0; k < m; k++) { -// if (k != l) { -// pSrcT1 = pSrc + k * n; -// pSrcT2 = pDst + k * n; -// /* Element of the reference row */ -// in = *pSrcT1; -// pPRT_in = pPivotRowIn; -// pPRT_pDst = pPivotRowDst; -// /* Loop over columns to the right of pivot */ -// j = core_id * 4; -// // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n - l) >> 2U); -// while (j < 4 * ((n - l) >> 2U)) { -// in1 = pSrcT1[j]; -// in2 = pSrcT1[j + 1]; -// in3 = pSrcT1[j + 2]; -// in4 = pSrcT1[j + 3]; -// out1 = pPRT_in[j]; -// out2 = pPRT_in[j + 1]; -// out3 = pPRT_in[j + 2]; -// out4 = pPRT_in[j + 3]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4 * NUM_CORES; -// } -// if (core_id == (n >> 2U) - 1) { -// j = 4 * ((n - l) >> 2U); -// while (j < n - l) { -// in1 = pSrcT1[j]; -// out1 = pPRT_in[j]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// } -// /* Loop over columns */ -// j = core_id * 4; -// while (j < 4 * (n >> 2U)) { -// in1 = pSrcT2[j]; -// in2 = pSrcT2[j + 1]; -// in3 = pSrcT2[j + 2]; -// in4 = pSrcT2[j + 3]; -// out1 = pPRT_pDst[j]; -// out2 = pPRT_pDst[j + 1]; -// out3 = pPRT_pDst[j + 2]; -// out4 = pPRT_pDst[j + 3]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4 * NUM_CORES; -// } -// if (core_id == (n >> 2U) - 1) { -// j = 4 * (n >> 2U); -// while (j < n) { -// in1 = pSrcT2[j]; -// out1 = pPRT_pDst[j]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// } -// mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); -// } -// } -// mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + // /* REPLACE ROWS */ + // pSrcT1 = pSrc; + // pSrcT2 = pDst; + // /* Loop over rows */ + // for (k = 0; k < m; k++) { + // if (k != l) { + // pSrcT1 = pSrc + k * n; + // pSrcT2 = pDst + k * n; + // /* Element of the reference row */ + // in = *pSrcT1; + // pPRT_in = pPivotRowIn; + // pPRT_pDst = pPivotRowDst; + // /* Loop over columns to the right of pivot */ + // j = core_id * 4; + // // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n + // - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = pPRT_in[j]; + // out2 = pPRT_in[j + 1]; + // out3 = pPRT_in[j + 2]; + // out4 = pPRT_in[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4 * NUM_CORES; + // } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // in1 = pSrcT1[j]; + // out1 = pPRT_in[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // /* Loop over columns */ + // j = core_id * 4; + // while (j < 4 * (n >> 2U)) { + // in1 = pSrcT2[j]; + // in2 = pSrcT2[j + 1]; + // in3 = pSrcT2[j + 2]; + // in4 = pSrcT2[j + 3]; + // out1 = pPRT_pDst[j]; + // out2 = pPRT_pDst[j + 1]; + // out3 = pPRT_pDst[j + 2]; + // out4 = pPRT_pDst[j + 3]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4 * NUM_CORES; + // } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pSrcT2[j]; + // out1 = pPRT_pDst[j]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / + // 4)); + // } + // } + // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); - pSrc++; /* Increment the input pointer */ - loopCnt--; /* Decrement the loop counter */ - l++; /* Increment the index modifier */ - } + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ + } -// if ((flag != 1U) && (x == 0)) { -// for (i = 0; i < m * n; i++) { -// if (pSrc[i] != 0) -// break; -// } -// if (i == m * n) -// return 1; -// } - return 0; + 
// if ((flag != 1U) && (x == 0)) { + // for (i = 0; i < m * n; i++) { + // if (pSrc[i] != 0) + // break; + // } + // if (i == m * n) + // return 1; + // } + return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h index 5dc0aefc8..6064a1faf 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h @@ -6,282 +6,285 @@ /* GAUSS JORDAN INVERSION */ -int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE); -void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n); +int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag, uint32_t nPE); +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); - -void fold_matrix(int32_t* pSrc, int32_t* pDst, uint32_t n) { - uint32_t core_id = mempool_get_core_id(); - uint32_t i, j, k, shift; - for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) { - k = i / n; - j = i % n; - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pDst[shift + j] = pSrc[i]; - pDst[shift + j + 1] = pSrc[i + 1]; - pDst[shift + j + 2] = pSrc[i + 2]; - pDst[shift + j + 3] = pSrc[i + 3]; - } - mempool_log_barrier(2, core_id); +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { + uint32_t core_id = mempool_get_core_id(); + uint32_t i, j, k, shift; + for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDst[shift + j] = pSrc[i]; + pDst[shift + j + 1] = pSrc[i + 1]; + pDst[shift + j + 2] = pSrc[i + 2]; + pDst[shift + j + 3] = pSrc[i + 3]; + } + mempool_log_barrier(2, core_id); } -int mempool_GJinv_q32p_folded(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag, uint32_t nPE) { +int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag, uint32_t nPE) { - int32_t volatile *pSrcT1, 
*pSrcT2; /* Temporary input data matrix pointer */ - int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id; - uint32_t shift = 0; - uint32_t i, j, k, l; /* loop counters */ - uint32_t m = n; /* M is the number of rows. However, the matrices must be square. */ + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t shift = 0; + uint32_t i, j, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matrices must be square. 
*/ - /* CREATE THE IDENTITY MATRIX */ - pDstT1 = pDst; - for (i = core_id * 4; i < n * m; i += nPE * 4) { - k = i / n; - j = i % n; - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pDstT1[shift + j] = (uint32_t) (k == j); - pDstT1[shift + j + 1] = (uint32_t) (k == (j + 1)); - pDstT1[shift + j + 2] = (uint32_t) (k == (j + 2)); - pDstT1[shift + j + 3] = (uint32_t) (k == (j + 3)); - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); + /* CREATE THE IDENTITY MATRIX */ + pDstT1 = pDst; + for (i = core_id * 4; i < n * m; i += nPE * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDstT1[shift + j] = (uint32_t)(k == j); + pDstT1[shift + j + 1] = (uint32_t)(k == (j + 1)); + pDstT1[shift + j + 2] = (uint32_t)(k == (j + 2)); + pDstT1[shift + j + 3] = (uint32_t)(k == (j + 3)); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); - /* Index modifier to navigate through the columns */ - l = 0U; - while (l < n) { + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { - shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; - pSrcT1 = pSrc + shift; - pDstT1 = pDst + shift; - in = *pSrcT1; + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pDstT1 = pDst + shift; + in = *pSrcT1; - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (absolute_core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pSrcT2 = pSrc + shift; - pDstT2 = pDst + shift; - /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - 
pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT2 = pSrc + shift; + pDstT2 = pDst + shift; + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 
2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } } - mempool_log_partial_barrier(2, absolute_core_id, nPE); + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; - pPivotRowIn = pSrc + shift; - pPivotRowDst = pDst + shift; - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pPivotRowIn = pSrc + shift; + pPivotRowDst = pDst + shift; + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; - /* Loop over columns to the right of pivot */ - core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); - core_id = core_id > nPE ? 
core_id + nPE : core_id; - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + + /* Loop over columns */ + core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? 
core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { + /* Only the columns to the right of the pivot are to be processed */ + if (k != l) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pSrcT2 = pDst + shift; + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); } if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; - } + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } } - - /* Loop over columns */ - core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); - core_id = core_id > nPE ? 
core_id + nPE : core_id; - for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; + core_id = absolute_core_id % (n >> 2U); + /* Loop over the columns */ + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); } if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); - j++; - } - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { - /* Only the columns to the right of the pivot are to be processed */ - if (k != l) { - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pSrcT1 = pSrc + shift; - pSrcT2 = pDst + shift; - /* Element of the reference row */ - in = *pSrcT1; - /* Reference row pointers */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - core_id = core_id - (l >> 2U); - j = core_id * 4; - while (j < 4 * ((n - l) >> 2U)) { - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT1[j + 2] = in3 - 
FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - core_id = absolute_core_id % (n >> 2U); - /* Loop over the columns */ - j = core_id * 4; - while (j < 4 * (n >> 2U)) { - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - out1 = pPRT_pDst[j]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - } + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - pSrc++; /* Increment the input pointer */ - l++; /* Increment the index modifier */ + } } mempool_log_partial_barrier(2, absolute_core_id, nPE); - return 0; + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h index 6ec20a91b..b697f9d24 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h @@ -8,616 +8,621 @@ uint32_t volatile pivot_barrier __attribute__((section(".l1"))); -int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag); +int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag); 
-int mempool_GJinv_q32p_memsized(int32_t * pSrc, int32_t * pDst, uint32_t n, uint32_t *flag) { +int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag) { - int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id; - uint32_t i, j, k, l; /* loop counters */ - uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t i, j, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matirces must be square. 
*/ - /* CREATE THE IDENTITY MATRIX */ + /* CREATE THE IDENTITY MATRIX */ - pDstT1 = pDst; - for (k = core_id * 4; k < m; k += NUM_CORES * 4) { - for (j = 0; j < n; j++) { - pDstT1[k * n + j] = (uint32_t) (k == j); - pDstT1[(k + 1) * n + j] = (uint32_t) ((k + 1) == j); - pDstT1[(k + 2) * n + j] = (uint32_t) ((k + 2) == j); - pDstT1[(k + 3) * n + j] = (uint32_t) ((k + 3) == j); - } + pDstT1 = pDst; + for (k = core_id * 4; k < m; k += NUM_CORES * 4) { + for (j = 0; j < n; j++) { + pDstT1[k * n + j] = (uint32_t)(k == j); + pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j); + pDstT1[(k + 2) * n + j] = (uint32_t)((k + 2) == j); + pDstT1[(k + 3) * n + j] = (uint32_t)((k + 3) == j); } -// pDstT1 = pDst; -// for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) { -// k = i / n; -// j = i % n; -// pDstT1[k * n + j] = (uint32_t) (k == j); -// pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1)); -// pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2)); -// pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3)); -// } -// mempool_log_barrier(2, absolute_core_id); + } + // pDstT1 = pDst; + // for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) { + // k = i / n; + // j = i % n; + // pDstT1[k * n + j] = (uint32_t) (k == j); + // pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1)); + // pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2)); + // pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3)); + // } + // mempool_log_barrier(2, absolute_core_id); - /* Index modifier to navigate through the columns */ - l = 0U; - while (l < n) { + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { - pSrcT1 = pSrc + (l * n); - pDstT1 = pDst + (l * n); - in = *pSrcT1; + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + in = *pSrcT1; - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (absolute_core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - pSrcT2 = pSrc + (n * k); - pDstT2 = pDst + (n * k); - /* EXCHANGE */ - 
if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) 
&& (in == 0U)) { - return 1; + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; } - } - mempool_log_barrier(2, absolute_core_id); - - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - pPivotRowIn = pSrc + (l * n); - pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; - /* Loop over columns to the right of pivot */ - core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U); - core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; - //for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // out4 = FIX_DIV(in4, in); - // pSrcT1[j] = out1; - // pSrcT1[j + 1] = out2; - // pSrcT1[j + 2] = out3; - // pSrcT1[j + 3] = out4; - //} - //if (core_id == 0) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // in1 = pSrcT1[j]; - // pSrcT1[j] = FIX_DIV(in1, in); - // j++; - // } - //} - if(core_id == 0) { + /* Loop over colums */ j = 0; - while (j < 4 - l % 4) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; } - } else { - j = core_id * 4 - l % 4; - if (j < (n - l)) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = 
FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_barrier(2, absolute_core_id); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U); + core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; + // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // out4 = FIX_DIV(in4, in); + // pSrcT1[j] = out1; + // pSrcT1[j + 1] = out2; + // pSrcT1[j + 2] = out3; + // pSrcT1[j + 3] = out4; + //} + // if (core_id == 0) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // in1 = pSrcT1[j]; + // pSrcT1[j] = FIX_DIV(in1, in); + // j++; + // } + //} + if (core_id == 0) { + j = 0; + while (j < 4 - l % 4) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } else { + j = core_id * 4 - l % 4; + if (j < (n - l)) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + } + /* Loop over columns */ + core_id = 
absolute_core_id - (((l * n) % N_BANKS) >> 2U); + core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pSrcT2[j]; + // pSrcT2[j] = FIX_DIV(in1, in); + // j++; + // } + //} + mempool_log_barrier(2, absolute_core_id); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { + /* Only the columns to the right of the pivot are to be processed */ + if (k != l) { + pSrcT1 = pSrc + k * n; + pSrcT2 = pDst + k * n; + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } } - /* Loop over columns */ - core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U); - core_id = core_id > NUM_CORES ? 
core_id + NUM_CORES : core_id; - for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); } - //if (core_id == (n >> 2U) - 1) { + // if (core_id == (n >> 2U) - 1) { // j = 4 * (n >> 2U); // while (j < n) { // in1 = pSrcT2[j]; - // pSrcT2[j] = FIX_DIV(in1, in); + // out1 = pPRT_pDst[j]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); // j++; // } //} - mempool_log_barrier(2, absolute_core_id); - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { - /* Only the columns to the right of the pivot are to be processed */ - if (k != l) { - pSrcT1 = pSrc + k * n; - pSrcT2 = pDst + k * n; - /* Element of the reference row */ - in = *pSrcT1; - /* Reference row pointers */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - core_id = core_id - (l >> 2U); - j = core_id * 4; - while (j < 4 * ((n - l) >> 2U)) { - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = 
in2 - FIX_MUL(in, out2); - pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - j = core_id * 4; - while (j < 4 * (n >> 2U)) { - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - //if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pSrcT2[j]; - // out1 = pPRT_pDst[j]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - //} - //uint32_t core_id_in; - //uint32_t core_id_Dst; - //int32_t p1_in, p2_in, p3_in, p4_in; - //int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst; - //core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U); - //core_id_Dst = absolute_core_id % (n >> 2U); - //j = core_id_in == 0 ? 
0 : (core_id_in * 4 - l % 4); - //i = core_id_Dst * 4; - //p1_in = pPRT_in[j]; - //p2_in = pPRT_in[j + 1]; - //p3_in = pPRT_in[j + 2]; - //p4_in = pPRT_in[j + 3]; - //p1_Dst = pPRT_pDst[i]; - //p2_Dst = pPRT_pDst[i + 1]; - //p3_Dst = pPRT_pDst[i + 2]; - //p4_Dst = pPRT_pDst[i + 3]; - //if(core_id_in == 0) { - // switch (4 - l % 4) { - // case (1): - // in1 = pSrcT1[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // break; - // case (2): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // break; - // case (3): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // break; - // case (4): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); - // break; - // } - //} else { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); - //} - //in1 = pSrcT2[i]; - //in2 = pSrcT2[i + 1]; - //in3 = pSrcT2[i + 2]; - //in4 = pSrcT2[i + 3]; - //pSrcT2[i] = in1 - FIX_MUL(in, p1_Dst); - //pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst); - //pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst); - //pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst); - } - } - mempool_log_barrier(2, absolute_core_id); + // uint32_t core_id_in; + // uint32_t core_id_Dst; + // int32_t p1_in, p2_in, p3_in, p4_in; + // int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst; + // core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U); + // core_id_Dst 
= absolute_core_id % (n >> 2U); + // j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4); + // i = core_id_Dst * 4; + // p1_in = pPRT_in[j]; + // p2_in = pPRT_in[j + 1]; + // p3_in = pPRT_in[j + 2]; + // p4_in = pPRT_in[j + 3]; + // p1_Dst = pPRT_pDst[i]; + // p2_Dst = pPRT_pDst[i + 1]; + // p3_Dst = pPRT_pDst[i + 2]; + // p4_Dst = pPRT_pDst[i + 3]; + // if(core_id_in == 0) { + // switch (4 - l % 4) { + // case (1): + // in1 = pSrcT1[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // break; + // case (2): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // break; + // case (3): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // break; + // case (4): + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); + // break; + // } + //} else { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); + //} + // in1 = pSrcT2[i]; + // in2 = pSrcT2[i + 1]; + // in3 = pSrcT2[i + 2]; + // in4 = pSrcT2[i + 3]; + // pSrcT2[i] = in1 - FIX_MUL(in, p1_Dst); + // pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst); + // pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst); + // pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst); + } + } + mempool_log_barrier(2, absolute_core_id); -// /* REPLACE ROWS */ -// pSrcT1 = pSrc; -// pSrcT2 = pDst; -// /* Reference row pointers */ -// pPRT_in = pSrc + (l * n); -// pPRT_pDst = 
pDst + (l * n); -// int32_t pivot = *pPRT_in; -// uint32_t nPE = (n >> 2U); -// uint32_t check = 0; -// if (absolute_core_id >= m * nPE) -// mempool_wfi(); -// for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) { -// /* Only the columns to the right of the pivot are to be processed */ -// if (k != l) { -// pSrcT1 = pSrc + k * n; -// pSrcT2 = pDst + k * n; -// /* Element of the reference row */ -// in = *pSrcT1; -// /* Loop over the columns */ -// core_id = absolute_core_id % nPE; -// core_id = core_id - (l >> 2U); -// j = core_id * 4; -// while (j < 4 * ((n - l) >> 2U)) { -// out1 = pPRT_in[j]; -// out2 = pPRT_in[j + 1]; -// out3 = pPRT_in[j + 2]; -// out4 = pPRT_in[j + 3]; -// out1 = FIX_DIV(out1, pivot); -// out2 = FIX_DIV(out2, pivot); -// out3 = FIX_DIV(out3, pivot); -// out4 = FIX_DIV(out4, pivot); -// in1 = pSrcT1[j]; -// in2 = pSrcT1[j + 1]; -// in3 = pSrcT1[j + 2]; -// in4 = pSrcT1[j + 3]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4 * (n >> 2U); -// } -// if (core_id == 0) { -// j = 4 * ((n - l) >> 2U); -// while (j < n - l) { -// out1 = pPRT_in[j]; -// out1 = FIX_DIV(out1, pivot); -// in1 = pSrcT1[j]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// } -// /* Loop over the columns */ -// core_id = absolute_core_id % nPE; -// j = core_id * 4; -// while (j < 4 * (n >> 2U)) { -// out1 = pPRT_pDst[j]; -// out2 = pPRT_pDst[j + 1]; -// out3 = pPRT_pDst[j + 2]; -// out4 = pPRT_pDst[j + 3]; -// out1 = FIX_DIV(out1, pivot); -// out2 = FIX_DIV(out2, pivot); -// out3 = FIX_DIV(out3, pivot); -// out4 = FIX_DIV(out4, pivot); -// in1 = pSrcT2[j]; -// in2 = pSrcT2[j + 1]; -// in3 = pSrcT2[j + 2]; -// in4 = pSrcT2[j + 3]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4 * 
nPE; -// } -// __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED); -// mempool_wfi(); -// } else { -// do { -// check = __atomic_fetch_add(&pivot_barrier, 0, __ATOMIC_RELAXED); -// mempool_wait(20); -// } while (check < ((m - 1) * nPE)); -// /* Loop over the columns */ -// core_id = absolute_core_id % (n >> 2U); -// core_id = core_id - (l >> 2U); -// j = core_id * 4; -// while (j < 4 * ((n - l) >> 2U)) { -// in1 = pPRT_in[j]; -// in2 = pPRT_in[j + 1]; -// in3 = pPRT_in[j + 2]; -// in4 = pPRT_in[j + 3]; -// out1 = FIX_DIV(in1, pivot); -// out2 = FIX_DIV(in2, pivot); -// out3 = FIX_DIV(in3, pivot); -// out4 = FIX_DIV(in4, pivot); -// pPRT_in[j] = out1; -// pPRT_in[j + 1] = out2; -// pPRT_in[j + 2] = out3; -// pPRT_in[j + 3] = out4; -// j += 4 * (n >> 2U); -// } -// if (core_id == 0) { -// j = 4 * ((n - l) >> 2U); -// while (j < n - l) { -// in1 = pPRT_in[j]; -// pPRT_in[j] = FIX_DIV(in1, pivot); -// j++; -// } -// } -// /* Loop over the columns */ -// core_id = absolute_core_id % (n >> 2U); -// j = core_id * 4; -// while (j < 4 * (n >> 2U)) { -// in1 = pPRT_pDst[j]; -// in2 = pPRT_pDst[j + 1]; -// in3 = pPRT_pDst[j + 2]; -// in4 = pPRT_pDst[j + 3]; -// out1 = FIX_DIV(in1, pivot); -// out2 = FIX_DIV(in2, pivot); -// out3 = FIX_DIV(in3, pivot); -// out4 = FIX_DIV(in4, pivot); -// pPRT_pDst[j] = out1; -// pPRT_pDst[j + 1] = out2; -// pPRT_pDst[j + 2] = out3; -// pPRT_pDst[j + 3] = out4; -// j += 4 * (n >> 2U); -// } -// if (core_id == (n >> 2U) - 1) { -// j = 4 * (n >> 2U); -// while (j < n) { -// in1 = pPRT_pDst[j]; -// pPRT_pDst[j] = FIX_DIV(in1, pivot); -// j++; -// } -// } -// if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED)) { -// __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED); -// __sync_synchronize(); -// wake_up_all(); -// } -// mempool_wfi(); -// } -// } + // /* REPLACE ROWS */ + // pSrcT1 = pSrc; + // pSrcT2 = pDst; + // /* Reference row pointers */ + // pPRT_in = pSrc + (l * n); + // pPRT_pDst = pDst + (l * n); + // 
int32_t pivot = *pPRT_in; + // uint32_t nPE = (n >> 2U); + // uint32_t check = 0; + // if (absolute_core_id >= m * nPE) + // mempool_wfi(); + // for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) { + // /* Only the columns to the right of the pivot are to be + // processed */ if (k != l) { + // pSrcT1 = pSrc + k * n; + // pSrcT2 = pDst + k * n; + // /* Element of the reference row */ + // in = *pSrcT1; + // /* Loop over the columns */ + // core_id = absolute_core_id % nPE; + // core_id = core_id - (l >> 2U); + // j = core_id * 4; + // while (j < 4 * ((n - l) >> 2U)) { + // out1 = pPRT_in[j]; + // out2 = pPRT_in[j + 1]; + // out3 = pPRT_in[j + 2]; + // out4 = pPRT_in[j + 3]; + // out1 = FIX_DIV(out1, pivot); + // out2 = FIX_DIV(out2, pivot); + // out3 = FIX_DIV(out3, pivot); + // out4 = FIX_DIV(out4, pivot); + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4 * (n >> 2U); + // } + // if (core_id == 0) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // out1 = pPRT_in[j]; + // out1 = FIX_DIV(out1, pivot); + // in1 = pSrcT1[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // /* Loop over the columns */ + // core_id = absolute_core_id % nPE; + // j = core_id * 4; + // while (j < 4 * (n >> 2U)) { + // out1 = pPRT_pDst[j]; + // out2 = pPRT_pDst[j + 1]; + // out3 = pPRT_pDst[j + 2]; + // out4 = pPRT_pDst[j + 3]; + // out1 = FIX_DIV(out1, pivot); + // out2 = FIX_DIV(out2, pivot); + // out3 = FIX_DIV(out3, pivot); + // out4 = FIX_DIV(out4, pivot); + // in1 = pSrcT2[j]; + // in2 = pSrcT2[j + 1]; + // in3 = pSrcT2[j + 2]; + // in4 = pSrcT2[j + 3]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT2[j + 3] 
= in4 - FIX_MUL(in, out4); + // j += 4 * nPE; + // } + // __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED); + // mempool_wfi(); + // } else { + // do { + // check = __atomic_fetch_add(&pivot_barrier, 0, + // __ATOMIC_RELAXED); mempool_wait(20); + // } while (check < ((m - 1) * nPE)); + // /* Loop over the columns */ + // core_id = absolute_core_id % (n >> 2U); + // core_id = core_id - (l >> 2U); + // j = core_id * 4; + // while (j < 4 * ((n - l) >> 2U)) { + // in1 = pPRT_in[j]; + // in2 = pPRT_in[j + 1]; + // in3 = pPRT_in[j + 2]; + // in4 = pPRT_in[j + 3]; + // out1 = FIX_DIV(in1, pivot); + // out2 = FIX_DIV(in2, pivot); + // out3 = FIX_DIV(in3, pivot); + // out4 = FIX_DIV(in4, pivot); + // pPRT_in[j] = out1; + // pPRT_in[j + 1] = out2; + // pPRT_in[j + 2] = out3; + // pPRT_in[j + 3] = out4; + // j += 4 * (n >> 2U); + // } + // if (core_id == 0) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // in1 = pPRT_in[j]; + // pPRT_in[j] = FIX_DIV(in1, pivot); + // j++; + // } + // } + // /* Loop over the columns */ + // core_id = absolute_core_id % (n >> 2U); + // j = core_id * 4; + // while (j < 4 * (n >> 2U)) { + // in1 = pPRT_pDst[j]; + // in2 = pPRT_pDst[j + 1]; + // in3 = pPRT_pDst[j + 2]; + // in4 = pPRT_pDst[j + 3]; + // out1 = FIX_DIV(in1, pivot); + // out2 = FIX_DIV(in2, pivot); + // out3 = FIX_DIV(in3, pivot); + // out4 = FIX_DIV(in4, pivot); + // pPRT_pDst[j] = out1; + // pPRT_pDst[j + 1] = out2; + // pPRT_pDst[j + 2] = out3; + // pPRT_pDst[j + 3] = out4; + // j += 4 * (n >> 2U); + // } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pPRT_pDst[j]; + // pPRT_pDst[j] = FIX_DIV(in1, pivot); + // j++; + // } + // } + // if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, + // __ATOMIC_RELAXED)) { + // __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED); + // __sync_synchronize(); + // wake_up_all(); + // } + // mempool_wfi(); + // } + // } -// /* REPLACE ROWS */ -// pSrcT1 = pSrc; -// pSrcT2 = 
pDst; -// for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) { -// k = i / n; -// if (k != l) { -// in = *(pSrc + k * n); -// j = i - (k * n); -// if (j >= 4 * (l >> 2U)) { -// if (j == 4 * (l >> 2U)) { -// pSrcT1 = pSrc + k * n; -// pPRT_in = pPivotRowIn; -// uint32_t bound = j + 4 - l; -// j = 0; -// while (j < bound) { -// in1 = *pSrcT1; -// out1 = *pPRT_in++; -// *pSrcT1++ = in1 - FIX_MUL(in, out1); -// j++; -// } -// } else { -// pSrcT1 = pSrc + (i - l); -// pPRT_in = pPivotRowIn + (j - l); -// in1 = *pSrcT1; -// in2 = *(pSrcT1 + 1); -// in3 = *(pSrcT1 + 2); -// in4 = *(pSrcT1 + 3); -// out1 = *pPRT_in++; -// out2 = *pPRT_in++; -// out3 = *pPRT_in++; -// out4 = *pPRT_in++; -// *pSrcT1++ = in1 - FIX_MUL(in, out1); -// *pSrcT1++ = in2 - FIX_MUL(in, out2); -// *pSrcT1++ = in3 - FIX_MUL(in, out3); -// *pSrcT1++ = in4 - FIX_MUL(in, out4); -// } -// } -// pSrcT2 = pDst + i; -// pPRT_pDst = pPivotRowDst + j; -// in1 = *pSrcT2; -// in2 = *(pSrcT2 + 1); -// in3 = *(pSrcT2 + 2); -// in4 = *(pSrcT2 + 3); -// out1 = *pPRT_pDst++; -// out2 = *pPRT_pDst++; -// out3 = *pPRT_pDst++; -// out4 = *pPRT_pDst++; -// *pSrcT2++ = in1 - FIX_MUL(in, out1); -// *pSrcT2++ = in2 - FIX_MUL(in, out2); -// *pSrcT2++ = in3 - FIX_MUL(in, out3); -// *pSrcT2++ = in4 - FIX_MUL(in, out4); -// } -// } -// mempool_log_barrier(2, absolute_core_id); -// /* REPLACE ROWS */ -// pSrcT1 = pSrc; -// pSrcT2 = pDst; -// core_id = absolute_core_id; -// for (k = core_id; k < m; k += NUM_CORES) { -// /* Only the columns to the right of the pivot are to be processed */ -// if (k != l) { -// pSrcT1 = pSrc + k * n; -// pSrcT2 = pDst + k * n; -// /* Element of the reference row */ -// in = *pSrcT1; -// /* Reference row pointers */ -// pPRT_in = pPivotRowIn; -// pPRT_pDst = pPivotRowDst; -// /* Loop over the columns */ -// j = 0; -// while (j < 4 * ((n - l) >> 2U)) { -// in1 = pSrcT1[j]; -// in2 = pSrcT1[j + 1]; -// in3 = pSrcT1[j + 2]; -// in4 = pSrcT1[j + 3]; -// out1 = pPRT_in[j]; -// out2 = 
pPRT_in[j + 1]; -// out3 = pPRT_in[j + 2]; -// out4 = pPRT_in[j + 3]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4; -// } -// while (j < n - l) { -// in1 = pSrcT1[j]; -// out1 = pPRT_in[j]; -// pSrcT1[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// /* Loop over the columns */ -// j = 0; -// while (j < 4 * (n >> 2U)) { -// in1 = pSrcT2[j]; -// in2 = pSrcT2[j + 1]; -// in3 = pSrcT2[j + 2]; -// in4 = pSrcT2[j + 3]; -// out1 = pPRT_pDst[j]; -// out2 = pPRT_pDst[j + 1]; -// out3 = pPRT_pDst[j + 2]; -// out4 = pPRT_pDst[j + 3]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); -// pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); -// pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); -// j += 4; -// } -// while (j < n) { -// in1 = pSrcT2[j]; -// out1 = pPRT_pDst[j]; -// pSrcT2[j] = in1 - FIX_MUL(in, out1); -// j++; -// } -// } -// } -// mempool_log_barrier(2, absolute_core_id); + // /* REPLACE ROWS */ + // pSrcT1 = pSrc; + // pSrcT2 = pDst; + // for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) { + // k = i / n; + // if (k != l) { + // in = *(pSrc + k * n); + // j = i - (k * n); + // if (j >= 4 * (l >> 2U)) { + // if (j == 4 * (l >> 2U)) { + // pSrcT1 = pSrc + k * n; + // pPRT_in = pPivotRowIn; + // uint32_t bound = j + 4 - l; + // j = 0; + // while (j < bound) { + // in1 = *pSrcT1; + // out1 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // j++; + // } + // } else { + // pSrcT1 = pSrc + (i - l); + // pPRT_in = pPivotRowIn + (j - l); + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // in4 = *(pSrcT1 + 3); + // out1 = *pPRT_in++; + // out2 = *pPRT_in++; + // out3 = *pPRT_in++; + // out4 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // *pSrcT1++ = in2 - FIX_MUL(in, out2); + // *pSrcT1++ = in3 - FIX_MUL(in, out3); + // *pSrcT1++ = in4 - FIX_MUL(in, out4); 
+ // } + // } + // pSrcT2 = pDst + i; + // pPRT_pDst = pPivotRowDst + j; + // in1 = *pSrcT2; + // in2 = *(pSrcT2 + 1); + // in3 = *(pSrcT2 + 2); + // in4 = *(pSrcT2 + 3); + // out1 = *pPRT_pDst++; + // out2 = *pPRT_pDst++; + // out3 = *pPRT_pDst++; + // out4 = *pPRT_pDst++; + // *pSrcT2++ = in1 - FIX_MUL(in, out1); + // *pSrcT2++ = in2 - FIX_MUL(in, out2); + // *pSrcT2++ = in3 - FIX_MUL(in, out3); + // *pSrcT2++ = in4 - FIX_MUL(in, out4); + // } + // } + // mempool_log_barrier(2, absolute_core_id); + // /* REPLACE ROWS */ + // pSrcT1 = pSrc; + // pSrcT2 = pDst; + // core_id = absolute_core_id; + // for (k = core_id; k < m; k += NUM_CORES) { + // /* Only the columns to the right of the pivot are to be + // processed */ if (k != l) { + // pSrcT1 = pSrc + k * n; + // pSrcT2 = pDst + k * n; + // /* Element of the reference row */ + // in = *pSrcT1; + // /* Reference row pointers */ + // pPRT_in = pPivotRowIn; + // pPRT_pDst = pPivotRowDst; + // /* Loop over the columns */ + // j = 0; + // while (j < 4 * ((n - l) >> 2U)) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = pPRT_in[j]; + // out2 = pPRT_in[j + 1]; + // out3 = pPRT_in[j + 2]; + // out4 = pPRT_in[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4; + // } + // while (j < n - l) { + // in1 = pSrcT1[j]; + // out1 = pPRT_in[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // /* Loop over the columns */ + // j = 0; + // while (j < 4 * (n >> 2U)) { + // in1 = pSrcT2[j]; + // in2 = pSrcT2[j + 1]; + // in3 = pSrcT2[j + 2]; + // in4 = pSrcT2[j + 3]; + // out1 = pPRT_pDst[j]; + // out2 = pPRT_pDst[j + 1]; + // out3 = pPRT_pDst[j + 2]; + // out4 = pPRT_pDst[j + 3]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); 
+ // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4; + // } + // while (j < n) { + // in1 = pSrcT2[j]; + // out1 = pPRT_pDst[j]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // } + // mempool_log_barrier(2, absolute_core_id); - pSrc++; /* Increment the input pointer */ - l++; /* Increment the index modifier */ - } - mempool_log_barrier(2, absolute_core_id); + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + mempool_log_barrier(2, absolute_core_id); - return 0; + return 0; } diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/apps/mat_inv/mempool_mat_inv_q32s.h index 21aadbe39..a20b918e0 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32s.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32s.h @@ -8,302 +8,303 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n); -int mempool_GJinv_q32s(int32_t * pSrc, int32_t * pDst, uint32_t n) { +int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { - int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */ + int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; - uint32_t m = n; /* M is the number of rows. However, the matrices must be square. 
*/ - uint32_t i, j, k, l; /* loop counters */ - uint32_t flag = 0U; /* Flag to check if the matrix is singular */ + uint32_t m = + n; /* M is the number of rows. However, the matrices must be square. */ + uint32_t i, j, k, l; /* loop counters */ + uint32_t flag = 0U; /* Flag to check if the matrix is singular */ - pDstT1 = pDst; /* Working pointer for destination matrix */ - /* CREATE THE IDENTITY MATRIX */ - for (k = 0; k < m; k += 4) { - for (j = 0; j < n; j++) { - pDstT1[k * m + j] = (uint32_t) (k == j); - pDstT1[(k + 1) * m + j] = (uint32_t) ((k + 1) == j); - pDstT1[(k + 2) * m + j] = (uint32_t) ((k + 2) == j); - pDstT1[(k + 3) * m + j] = (uint32_t) ((k + 3) == j); - } + pDstT1 = pDst; /* Working pointer for destination matrix */ + /* CREATE THE IDENTITY MATRIX */ + for (k = 0; k < m; k += 4) { + for (j = 0; j < n; j++) { + pDstT1[k * m + j] = (uint32_t)(k == j); + pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j); } + } - /* Index modifier to navigate through the columns */ - l = 0U; - while (l < n) { + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { - pSrcT1 = pSrc + (l * n); - pDstT1 = pDst + (l * n); - k = 1U; - in = *pSrcT1; + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + k = 1U; + in = *pSrcT1; - /* CHECK IF PIVOT ELEMENT IS ZERO */ + /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (in == 0) { - /* Loop over the rows present below */ - for (i = (l + 1U); i < m; i++) { - pSrcT2 = pSrc + (n * i); - pDstT2 = pDstT1 + (n * k); + if (in == 0) { + /* Loop over the rows present below */ + for (i = (l + 1U); i < m; i++) { + pSrcT2 = pSrc + (n * i); + pDstT2 = pDstT1 + (n * k); - /* EXCHANGE */ + /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < (n - l) - (n - l) % 4) { - Xchg1 = *(pSrcT2); - Xchg2 = *(pSrcT2 + 1); - Xchg3 = *(pSrcT2 + 2); - Xchg4 = *(pSrcT2 + 3); 
- out1 = *(pSrcT1); - out2 = *(pSrcT1 + 1); - out3 = *(pSrcT1 + 2); - out4 = *(pSrcT1 + 3); - *pSrcT2++ = out1; - *pSrcT2++ = out2; - *pSrcT2++ = out3; - *pSrcT2++ = out4; - *pSrcT1++ = Xchg1; - *pSrcT1++ = Xchg2; - *pSrcT1++ = Xchg3; - *pSrcT1++ = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = *pSrcT2; - *pSrcT2++ = *pSrcT1; - *pSrcT1++ = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < n - n % 4) { - Xchg1 = *(pDstT2); - Xchg2 = *(pDstT2 + 1); - Xchg3 = *(pDstT2 + 2); - Xchg4 = *(pDstT2 + 3); - out1 = *(pDstT1); - out2 = *(pDstT1 + 1); - out3 = *(pDstT1 + 2); - out4 = *(pDstT1 + 3); - *pDstT2++ = out1; - *pDstT2++ = out2; - *pDstT2++ = out3; - *pDstT2++ = out4; - *pDstT1++ = Xchg1; - *pDstT1++ = Xchg2; - *pDstT1++ = Xchg3; - *pDstT1++ = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = *pDstT2; - *pDstT2++ = *pDstT1; - *pDstT1++ = Xchg1; - j++; - } - flag = 1U; - break; - } - k++; - } - } - /* Return when the matrix is singular */ - if ((flag == 0U) && (in == 0)) { - return 1; + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < (n - l) - (n - l) % 4) { + Xchg1 = *(pSrcT2); + Xchg2 = *(pSrcT2 + 1); + Xchg3 = *(pSrcT2 + 2); + Xchg4 = *(pSrcT2 + 3); + out1 = *(pSrcT1); + out2 = *(pSrcT1 + 1); + out3 = *(pSrcT1 + 2); + out4 = *(pSrcT1 + 3); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + *pSrcT1++ = Xchg1; + *pSrcT1++ = Xchg2; + *pSrcT1++ = Xchg3; + *pSrcT1++ = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = *pSrcT2; + *pSrcT2++ = *pSrcT1; + *pSrcT1++ = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < n - n % 4) { + Xchg1 = *(pDstT2); + Xchg2 = *(pDstT2 + 1); + Xchg3 = *(pDstT2 + 2); + Xchg4 = *(pDstT2 + 3); + out1 = *(pDstT1); + out2 = *(pDstT1 + 1); + out3 = *(pDstT1 + 2); + out4 = *(pDstT1 + 3); + *pDstT2++ = out1; + *pDstT2++ = out2; + *pDstT2++ = out3; + *pDstT2++ = out4; + *pDstT1++ = Xchg1; + *pDstT1++ = Xchg2; + *pDstT1++ = Xchg3; + *pDstT1++ 
= Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = *pDstT2; + *pDstT2++ = *pDstT1; + *pDstT1++ = Xchg1; + j++; + } + flag = 1U; + break; } + k++; + } + } + /* Return when the matrix is singular */ + if ((flag == 0U) && (in == 0)) { + return 1; + } - /* DIVIDE BY THE PIVOT */ + /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - pPivotRowIn = pSrc + (l * n); - pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; - /* Loop over number of columns to the right of the pilot element */ + /* Loop over number of columns to the right of the pilot element */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + in1 = *pSrcT1; + in2 = *(pSrcT1 + 1); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT1++ = out1; + *pSrcT1++ = out2; + *pSrcT1++ = out3; + *pSrcT1++ = out4; + j += 4; + } + while (j < n - l) { + in1 = *pSrcT1; + *pSrcT1++ = FIX_DIV(in1, in); + j++; + } + // switch ((n - l) % 4) { + // case 3: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // in3 = *(pSrcT1 + 2); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // out3 = FIX_DIV(in3, in); + // *pSrcT1++ = out1; + // *pSrcT1++ = out2; + // *pSrcT1++ = out3; + // break; + // case 2: + // in1 = *pSrcT1; + // in2 = *(pSrcT1 + 1); + // out1 = FIX_DIV(in1, in); + // out2 = FIX_DIV(in2, in); + // *pSrcT1++ = out1; + // *pSrcT1++ = out2; + // break; + // case 1: + // in1 = *pSrcT1; + // out1 = FIX_DIV(in1, in); + // *pSrcT1++ = out1; + // break; + //} + 
/* Loop over number of columns of the destination matrix */ + j = 0; + while (j < 4 * (n >> 2U)) { + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + j += 4; + } + while (j < n) { + in1 = *pSrcT2; + *pSrcT2++ = FIX_DIV(in1, in); + j++; + } + + /* REPLACE ROWS */ + + pSrcT1 = pSrc; + pSrcT2 = pDst; + i = 0U; /* pivot index */ + k = m; /* row index */ + while (k > 0U) { + /* Only the columns to the right of the pivot are to be processed */ + if (i == l) { + pSrcT1 += n - l; + pSrcT2 += n; + } else { + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; j = 0; while (j < 4 * ((n - l) >> 2U)) { - in1 = *pSrcT1; - in2 = *(pSrcT1 + 1); - in3 = *(pSrcT1 + 2); - in4 = *(pSrcT1 + 3); - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - *pSrcT1++ = out1; - *pSrcT1++ = out2; - *pSrcT1++ = out3; - *pSrcT1++ = out4; - j += 4; + in1 = *pSrcT1; + in2 = *(pSrcT1 + 1); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); + out1 = *pPRT_in++; + out2 = *pPRT_in++; + out3 = *pPRT_in++; + out4 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + *pSrcT1++ = in2 - FIX_MUL(in, out2); + *pSrcT1++ = in3 - FIX_MUL(in, out3); + *pSrcT1++ = in4 - FIX_MUL(in, out4); + j += 4; } while (j < n - l) { - in1 = *pSrcT1; - *pSrcT1++ = FIX_DIV(in1, in); - j++; + in1 = *pSrcT1; + out1 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + j++; } - //switch ((n - l) % 4) { + // switch ((n - l) % 4) { // case 3: // in1 = *pSrcT1; // in2 = *(pSrcT1 + 1); // in3 = *(pSrcT1 + 2); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // *pSrcT1++ = out1; - // *pSrcT1++ = out2; - // *pSrcT1++ = out3; + // out1 = 
*pPRT_in++; + // out2 = *pPRT_in++; + // out3 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // *pSrcT1++ = in2 - FIX_MUL(in, out2); + // *pSrcT1++ = in3 - FIX_MUL(in, out3); // break; // case 2: // in1 = *pSrcT1; // in2 = *(pSrcT1 + 1); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // *pSrcT1++ = out1; - // *pSrcT1++ = out2; + // out1 = *pPRT_in++; + // out2 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); + // *pSrcT1++ = in2 - FIX_MUL(in, out2); // break; // case 1: // in1 = *pSrcT1; - // out1 = FIX_DIV(in1, in); - // *pSrcT1++ = out1; + // out1 = *pPRT_in++; + // *pSrcT1++ = in1 - FIX_MUL(in, out1); // break; //} - /* Loop over number of columns of the destination matrix */ + /* Loop over the number of columns to + replace the elements in the destination matrix */ j = 0; while (j < 4 * (n >> 2U)) { - in1 = *pSrcT2; - in2 = *(pSrcT2 + 1); - in3 = *(pSrcT2 + 2); - in4 = *(pSrcT2 + 3); - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - *pSrcT2++ = out1; - *pSrcT2++ = out2; - *pSrcT2++ = out3; - *pSrcT2++ = out4; - j += 4; + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = *pPRT_pDst++; + out2 = *pPRT_pDst++; + out3 = *pPRT_pDst++; + out4 = *pPRT_pDst++; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + *pSrcT2++ = in2 - FIX_MUL(in, out2); + *pSrcT2++ = in3 - FIX_MUL(in, out3); + *pSrcT2++ = in4 - FIX_MUL(in, out4); + j += 4; } while (j < n) { - in1 = *pSrcT2; - *pSrcT2++ = FIX_DIV(in1, in); - j++; - } - - /* REPLACE ROWS */ - - pSrcT1 = pSrc; - pSrcT2 = pDst; - i = 0U; /* pivot index */ - k = m; /* row index */ - while (k > 0U) { - /* Only the columns to the right of the pivot are to be processed */ - if (i == l) { - pSrcT1 += n - l; - pSrcT2 += n; - } else { - /* Element of the reference row */ - in = *pSrcT1; - /* Reference row pointers */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - j = 0; - while (j < 4 * ((n - l) >> 2U)) { 
- in1 = *pSrcT1; - in2 = *(pSrcT1 + 1); - in3 = *(pSrcT1 + 2); - in4 = *(pSrcT1 + 3); - out1 = *pPRT_in++; - out2 = *pPRT_in++; - out3 = *pPRT_in++; - out4 = *pPRT_in++; - *pSrcT1++ = in1 - FIX_MUL(in, out1); - *pSrcT1++ = in2 - FIX_MUL(in, out2); - *pSrcT1++ = in3 - FIX_MUL(in, out3); - *pSrcT1++ = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n - l) { - in1 = *pSrcT1; - out1 = *pPRT_in++; - *pSrcT1++ = in1 - FIX_MUL(in, out1); - j++; - } - //switch ((n - l) % 4) { - // case 3: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // in3 = *(pSrcT1 + 2); - // out1 = *pPRT_in++; - // out2 = *pPRT_in++; - // out3 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // *pSrcT1++ = in2 - FIX_MUL(in, out2); - // *pSrcT1++ = in3 - FIX_MUL(in, out3); - // break; - // case 2: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // out1 = *pPRT_in++; - // out2 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // *pSrcT1++ = in2 - FIX_MUL(in, out2); - // break; - // case 1: - // in1 = *pSrcT1; - // out1 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // break; - //} - /* Loop over the number of columns to - replace the elements in the destination matrix */ - j = 0; - while (j < 4 * (n >> 2U)) { - in1 = *pSrcT2; - in2 = *(pSrcT2 + 1); - in3 = *(pSrcT2 + 2); - in4 = *(pSrcT2 + 3); - out1 = *pPRT_pDst++; - out2 = *pPRT_pDst++; - out3 = *pPRT_pDst++; - out4 = *pPRT_pDst++; - *pSrcT2++ = in1 - FIX_MUL(in, out1); - *pSrcT2++ = in2 - FIX_MUL(in, out2); - *pSrcT2++ = in3 - FIX_MUL(in, out3); - *pSrcT2++ = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n) { - in1 = *pSrcT2; - out1 = *pPRT_pDst; - *pSrcT2++ = in1 - FIX_MUL(in, out1); - j++; - } - } - /* Increment temporary input pointer */ - pSrcT1 = pSrcT1 + l; - /* Decrement loop counter */ - k--; - /* Increment pivot index */ - i++; + in1 = *pSrcT2; + out1 = *pPRT_pDst; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + j++; } - - pSrc++; /* Increment the input pointer */ - l++; /* Increment the index modifier */ + } 
+ /* Increment temporary input pointer */ + pSrcT1 = pSrcT1 + l; + /* Decrement loop counter */ + k--; + /* Increment pivot index */ + i++; } - return 0; + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + + return 0; } - diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c index 18e35f510..8a217c0cd 100644 --- a/software/apps/svd/main.c +++ b/software/apps/svd/main.c @@ -1,3 +1,9 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + #include "encoding.h" #include "printf.h" #include "runtime.h" @@ -6,7 +12,6 @@ #include "nrutil.h" #include "svd.c" - // Define Matrix dimensions: #define M 4 #define N 32 @@ -42,8 +47,8 @@ void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, } } -void init_vector(int32_t *vector, uint32_t num_el, - int32_t a, int32_t b, uint32_t core_id) { +void init_vector(int32_t *vector, uint32_t num_el, int32_t a, int32_t b, + uint32_t core_id) { uint32_t const split = 8; // How many blocks to split the vector into uint32_t const reminder = num_el % split; uint32_t i, j; diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h index 27b55fec2..a137444ab 100644 --- a/software/apps/svd/nrutil.h +++ b/software/apps/svd/nrutil.h @@ -1,3 +1,9 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + //#include //#include //#include @@ -9,57 +15,69 @@ #define FREE_ARG char * static int32_t sqrarg; -#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg *sqrarg) +#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg * sqrarg) static int32_t dsqrarg; -#define DSQR(a) ((dsqrarg = (a)) == 0 ? 0 : dsqrarg *dsqrarg) +#define DSQR(a) ((dsqrarg = (a)) == 0 ? 
0 : dsqrarg * dsqrarg) static int32_t dmaxarg1, dmaxarg2; -#define DMAX(a, b) (dmaxarg1 = (a), dmaxarg2 = (b), (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2)) +#define DMAX(a, b) \ + (dmaxarg1 = (a), dmaxarg2 = (b), \ + (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2)) static int32_t dminarg1, dminarg2; -#define DMIN(a, b) (dminarg1 = (a), dminarg2 = (b), (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2)) +#define DMIN(a, b) \ + (dminarg1 = (a), dminarg2 = (b), \ + (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2)) static int32_t maxarg1, maxarg2; -#define FMAX(a, b) (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) +#define FMAX(a, b) \ + (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) static int32_t minarg1, minarg2; -#define FMIN(a, b) (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2)) +#define FMIN(a, b) \ + (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2)) static long lmaxarg1, lmaxarg2; -#define LMAX(a, b) (lmaxarg1 = (a), lmaxarg2 = (b), (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2)) +#define LMAX(a, b) \ + (lmaxarg1 = (a), lmaxarg2 = (b), \ + (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2)) static long lminarg1, lminarg2; -#define LMIN(a, b) (lminarg1 = (a), lminarg2 = (b), (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2)) +#define LMIN(a, b) \ + (lminarg1 = (a), lminarg2 = (b), \ + (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2)) static int32_t imaxarg1, imaxarg2; -#define IMAX(a, b) (imaxarg1 = (a), imaxarg2 = (b), (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2)) +#define IMAX(a, b) \ + (imaxarg1 = (a), imaxarg2 = (b), \ + (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2)) static int32_t iminarg1, iminarg2; -#define IMIN(a, b) (iminarg1 = (a), iminarg2 = (b), (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2)) +#define IMIN(a, b) \ + (iminarg1 = (a), iminarg2 = (b), \ + (iminarg1) < (iminarg2) ? 
(iminarg1) : (iminarg2)) #define ABS(a) (a < 0 ? -a : a) #define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a)) -int32_t sqrt_q32 ( const int32_t number, - const uint32_t fracBits); +int32_t sqrt_q32(const int32_t number, const uint32_t fracBits); #define sqrt2 0b1011010100000100 -int32_t sqrt_q32 ( const int32_t number, - const uint32_t fracBits) { +int32_t sqrt_q32(const int32_t number, const uint32_t fracBits) { - int32_t root = 0; - int32_t start = 0; - int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF) - int32_t mid; + int32_t root = 0; + int32_t start = 0; + int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF) + int32_t mid; - if (number > 0) { - while (start <= end) { - mid = (start + end) >> 1; - if (((mid * mid) >> fracBits) == number) { - root = mid; - break; - } - if (((mid * mid) >> fracBits) < number) { - start = mid + 1; - root = mid; - } else { - end = mid - 1; - } + if (number > 0) { + while (start <= end) { + mid = (start + end) >> 1; + if (((mid * mid) >> fracBits) == number) { + root = mid; + break; + } + if (((mid * mid) >> fracBits) < number) { + start = mid + 1; + root = mid; + } else { + end = mid - 1; } } + } - return root; + return root; } #endif diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c index a53c2695b..fa2fcbd0c 100644 --- a/software/apps/svd/svd.c +++ b/software/apps/svd/svd.c @@ -1,237 +1,242 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + int32_t pythag(int32_t a, int32_t b); void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v); - int32_t pythag(int32_t a, int32_t b) { - int32_t absa = ABS(a); - int32_t absb = ABS(b); - if (absa > absb) { - return absa * sqrt_q32(1 + SQR(absb / absa), 4); - } else { - return (absb == 0 ? 
0 : absb * sqrt_q32(1 + SQR(absa / absb), 4)); - } + int32_t absa = ABS(a); + int32_t absb = ABS(b); + if (absa > absb) { + return absa * sqrt_q32(1 + SQR(absb / absa), 4); + } else { + return (absb == 0 ? 0 : absb * sqrt_q32(1 + SQR(absa / absb), 4)); + } } void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) { - int32_t flag, i, its, j, jj, k, l, nm; - int32_t anorm, c, f, g, h, s, scale, x, y, z; - int32_t rv1[n]; + int32_t flag, i, its, j, jj, k, l, nm; + int32_t anorm, c, f, g, h, s, scale, x, y, z; + int32_t rv1[n]; - //printf("PROVA\n"); + // printf("PROVA\n"); - g = scale = anorm = 0.0; - for (i = 1; i <= n; i++) { - l = i + 1; - rv1[i] = scale * g; - g = s = scale = 0.0; - if (i <= m) { - for (k = i; k <= m; k++) { - scale += ABS(a[k * m + i]); - } - if (scale) { - for (k = i; k <= m; k++) { - a[k * m + i] /= scale; - s += a[k * m + i] * a[k * m + i]; - } - f = a[i * m + i]; - g = -SIGN(sqrt_q32(s,4), f); - h = f * g - s; - a[i * m + i] = f - g; - for (j = l; j <= n; j++) { - for (s = 0.0, k = i; k <= m; k++) { - s += a[k * m + i] * a[k * m + i]; - } - f = s / h; - for (k = i; k <= m; k++) { - a[k * m + i] += f * a[k * m + i]; - } - } - for (k = i; k <= m; k++) { - a[k * m + i] *= scale; - } - } + g = scale = anorm = 0.0; + for (i = 1; i <= n; i++) { + l = i + 1; + rv1[i] = scale * g; + g = s = scale = 0.0; + if (i <= m) { + for (k = i; k <= m; k++) { + scale += ABS(a[k * m + i]); + } + if (scale) { + for (k = i; k <= m; k++) { + a[k * m + i] /= scale; + s += a[k * m + i] * a[k * m + i]; + } + f = a[i * m + i]; + g = -SIGN(sqrt_q32(s, 4), f); + h = f * g - s; + a[i * m + i] = f - g; + for (j = l; j <= n; j++) { + for (s = 0.0, k = i; k <= m; k++) { + s += a[k * m + i] * a[k * m + i]; + } + f = s / h; + for (k = i; k <= m; k++) { + a[k * m + i] += f * a[k * m + i]; + } } - w[i] = scale * g; - g = s = scale = 0.0; - if (i <= m && i != n) { - for (k = l; k <= n; k++) { - scale += ABS(a[k * m + i]); - } - if (scale) { - for (k = l; k <= n; 
k++) { - a[k * m + i] /= scale; - s += a[i * m + k] * a[i * m + k]; - } - f = a[i * m + l]; - g = -SIGN(sqrt_q32(s,4), f); - h = f * g - s; - a[i * m + l] = f - g; - for (k = l; k <= n; k++) { - rv1[k] = a[i * m + k] / h; - } - for (j = l; j <= m; j++) { - for (s = 0, k = l; k <= n; k++) { - s += a[j * m + k] * a[i * m + k]; - } - for (k = l; k <= n; k++) { - a[j * m + k] += s * rv1[k]; - } - } - for (k = l; k <= n; k++) { - a[i * m + k] *= scale; - } - } + for (k = i; k <= m; k++) { + a[k * m + i] *= scale; } - anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i]))); + } } + w[i] = scale * g; + g = s = scale = 0.0; + if (i <= m && i != n) { + for (k = l; k <= n; k++) { + scale += ABS(a[k * m + i]); + } + if (scale) { + for (k = l; k <= n; k++) { + a[k * m + i] /= scale; + s += a[i * m + k] * a[i * m + k]; + } + f = a[i * m + l]; + g = -SIGN(sqrt_q32(s, 4), f); + h = f * g - s; + a[i * m + l] = f - g; + for (k = l; k <= n; k++) { + rv1[k] = a[i * m + k] / h; + } + for (j = l; j <= m; j++) { + for (s = 0, k = l; k <= n; k++) { + s += a[j * m + k] * a[i * m + k]; + } + for (k = l; k <= n; k++) { + a[j * m + k] += s * rv1[k]; + } + } + for (k = l; k <= n; k++) { + a[i * m + k] *= scale; + } + } + } + anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i]))); + } - for (i = n; i >= 1; i--) { - if (i < n) { - if (g) { - for (j = l; j <= n; j++) { - v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g; - } - for (j = l; j <= n; j++) { - for (s = 0, k = l; k <= n; k++) { - s += a[i * m + k] * v[k * m + j]; - } - for (k = l; k <= n; k++) { - v[k * m + j] += s * v[k * m + i]; - } - } - } - for (j = l; j <= n; j++) { - v[i * m + j] = v[j * m + i] = 0; - } + for (i = n; i >= 1; i--) { + if (i < n) { + if (g) { + for (j = l; j <= n; j++) { + v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g; + } + for (j = l; j <= n; j++) { + for (s = 0, k = l; k <= n; k++) { + s += a[i * m + k] * v[k * m + j]; + } + for (k = l; k <= n; k++) { + v[k * m + j] += s * v[k * m + i]; + } } - v[i * m + i] = 1; - g = rv1[i]; 
- l = i; + } + for (j = l; j <= n; j++) { + v[i * m + j] = v[j * m + i] = 0; + } } + v[i * m + i] = 1; + g = rv1[i]; + l = i; + } -// for (i = IMIN(m, n); i >= 1; i--) { -// l = i + 1; -// g = w[i]; -// for (j = l; j <= n; j++) { -// a[i][j] = 0; -// } -// if (g) { -// g = 1.0 / g; -// for (j = l; j <= n; j++) { -// for (s = 0.0, k = l; k <= m; k++) { -// s += a[k][i] * a[k][j]; -// } -// f = (s / a[i][i]) * g; -// for (k = i; k <= m; k++) { -// a[k][j] += f * a[k][i]; -// } -// } -// for (j = i; j <= m; j++) { -// a[j][i] *= g; -// } -// } else { for (j = i; j <= m; j++) { -// a[j][i] = 0.0; -// } -// } -// ++a[i][i]; -// } -// for (k = n; k >= 1; k--) { -// for (its = 1; its <= 30; its++) { -// flag = 1; -// for (l = k; l >= 1; l--) { -// nm = l - 1; -// if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) { -// flag = 0; -// break; -// } -// if ((int32_t) (ABS(w[nm]) + anorm) == anorm) { -// break; -// } -// } -// if (flag) { -// c = 0.0; -// s = 1.0; -// for (i = l; i <= k; i++) { -// f = s * rv1[i]; -// rv1[i] = c * rv1[i]; -// if ((int32_t) (ABS(f) + anorm) == anorm) { -// break; -// } -// g = w[i]; -// h = pythag(f, g); -// w[i] = h; -// h = 1.0 / h; -// c = g * h; -// s = -f * h; -// for (j = 1; j <= m; j++) { -// y = a[j][nm]; -// z = a[j][i]; -// a[j][nm] = y * c + z * s; -// a[j][i] = z * c - y * s; -// } -// } -// } -// z = w[k]; -// if (l == k) { -// if (z < 0.0) { -// w[k] = -z; -// for (j = 1; j <= n; j++) { -// v[j][k] = -v[j][k]; -// } -// } -// break; -// } -// if (its == 30) { -// exit(1); -// } -// x = w[l]; -// nm = k - 1; -// y = w[nm]; -// g = rv1[nm]; -// h = rv1[k]; -// f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); -// g = pythag(f, 1.0); -// f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; -// c = s = 1.0; -// for (j = l; j <= nm; j++) { -// i = j + 1; -// g = rv1[i]; -// y = w[i]; -// h = s * g; -// g = c * g; -// z = pythag(f, h); -// rv1[j] = z; -// c = f / z; -// s = h / z; -// f = x * c + g * s; -// g = g * c - 
x * s; -// h = y * s; -// y *= c; -// for (jj = 1; jj <= n; jj++) { -// x = v[jj][j]; -// z = v[jj][i]; -// v[jj][j] = x * c + z * s; -// v[jj][i] = z * c - x * s; -// } -// z = pythag(f, h); -// w[j] = z; -// if (z) { -// z = 1.0 / z; -// c = f * z; -// s = h * z; -// } -// f = c * g + s * y; -// x = c * y - s * g; -// for (jj = 1; jj <= m; jj++) { -// y = a[jj][j]; -// z = a[jj][i]; -// a[jj][j] = y * c + z * s; -// a[jj][i] = z * c - y * s; -// } -// } -// rv1[l] = 0.0; -// rv1[k] = f; -// w[k] = x; -// } -// } + // for (i = IMIN(m, n); i >= 1; i--) { + // l = i + 1; + // g = w[i]; + // for (j = l; j <= n; j++) { + // a[i][j] = 0; + // } + // if (g) { + // g = 1.0 / g; + // for (j = l; j <= n; j++) { + // for (s = 0.0, k = l; k <= m; k++) { + // s += a[k][i] * a[k][j]; + // } + // f = (s / a[i][i]) * g; + // for (k = i; k <= m; k++) { + // a[k][j] += f * a[k][i]; + // } + // } + // for (j = i; j <= m; j++) { + // a[j][i] *= g; + // } + // } else { for (j = i; j <= m; j++) { + // a[j][i] = 0.0; + // } + // } + // ++a[i][i]; + // } + // for (k = n; k >= 1; k--) { + // for (its = 1; its <= 30; its++) { + // flag = 1; + // for (l = k; l >= 1; l--) { + // nm = l - 1; + // if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) { + // flag = 0; + // break; + // } + // if ((int32_t) (ABS(w[nm]) + anorm) == anorm) { + // break; + // } + // } + // if (flag) { + // c = 0.0; + // s = 1.0; + // for (i = l; i <= k; i++) { + // f = s * rv1[i]; + // rv1[i] = c * rv1[i]; + // if ((int32_t) (ABS(f) + anorm) == anorm) { + // break; + // } + // g = w[i]; + // h = pythag(f, g); + // w[i] = h; + // h = 1.0 / h; + // c = g * h; + // s = -f * h; + // for (j = 1; j <= m; j++) { + // y = a[j][nm]; + // z = a[j][i]; + // a[j][nm] = y * c + z * s; + // a[j][i] = z * c - y * s; + // } + // } + // } + // z = w[k]; + // if (l == k) { + // if (z < 0.0) { + // w[k] = -z; + // for (j = 1; j <= n; j++) { + // v[j][k] = -v[j][k]; + // } + // } + // break; + // } + // if (its == 30) { + // exit(1); + // } + 
// x = w[l]; + // nm = k - 1; + // y = w[nm]; + // g = rv1[nm]; + // h = rv1[k]; + // f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + // g = pythag(f, 1.0); + // f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; + // c = s = 1.0; + // for (j = l; j <= nm; j++) { + // i = j + 1; + // g = rv1[i]; + // y = w[i]; + // h = s * g; + // g = c * g; + // z = pythag(f, h); + // rv1[j] = z; + // c = f / z; + // s = h / z; + // f = x * c + g * s; + // g = g * c - x * s; + // h = y * s; + // y *= c; + // for (jj = 1; jj <= n; jj++) { + // x = v[jj][j]; + // z = v[jj][i]; + // v[jj][j] = x * c + z * s; + // v[jj][i] = z * c - x * s; + // } + // z = pythag(f, h); + // w[j] = z; + // if (z) { + // z = 1.0 / z; + // c = f * z; + // s = h * z; + // } + // f = c * g + s * y; + // x = c * y - s * g; + // for (jj = 1; jj <= m; jj++) { + // y = a[jj][j]; + // z = a[jj][i]; + // a[jj][j] = y * c + z * s; + // a[jj][i] = z * c - y * s; + // } + // } + // rv1[l] = 0.0; + // rv1[k] = f; + // w[k] = x; + // } + // } } From ae56dc47eac231f0541d7b3e26c78c0de935aad5 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Mon, 31 Oct 2022 08:56:09 +0100 Subject: [PATCH 18/22] [software] Erase SVD folder --- software/apps/svd/SVD_Householder.txt | 781 -------------------------- software/apps/svd/main.c | 98 ---- software/apps/svd/nrutil.h | 83 --- software/apps/svd/svd.c | 242 -------- 4 files changed, 1204 deletions(-) delete mode 100644 software/apps/svd/SVD_Householder.txt delete mode 100644 software/apps/svd/main.c delete mode 100644 software/apps/svd/nrutil.h delete mode 100644 software/apps/svd/svd.c diff --git a/software/apps/svd/SVD_Householder.txt b/software/apps/svd/SVD_Householder.txt deleted file mode 100644 index 1631212de..000000000 --- a/software/apps/svd/SVD_Householder.txt +++ /dev/null @@ -1,781 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// File: singular_value_decomposition.c // -// Contents: // -// 
Singular_Value_Decomposition // -// Singular_Value_Decomposition_Solve // -// Singular_Value_Decomposition_Inverse // -//////////////////////////////////////////////////////////////////////////////// - -#include // required for memcpy() -#include // required for DBL_EPSILON -#include // required for fabs(), sqrt(); - -#define MAX_ITERATION_COUNT 30 // Maximum number of iterations - -// Internally Defined Routines -static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows, - int ncols, double* U, double* V, double* diagonal, double* superdiagonal ); -static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, - double* U, double* V, double* diagonal, double* superdiagonal ); -static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols, - double* singular_value, double* U, double* V); - -//////////////////////////////////////////////////////////////////////////////// -// int Singular_Value_Decomposition(double* A, int nrows, int ncols, // -// double* U, double* singular_values, double* V, double* dummy_array) // -// // -// Description: // -// This routine decomposes an m x n matrix A, with m >= n, into a product // -// of the three matrices U, D, and V', i.e. A = UDV', where U is an m x n // -// matrix whose columns are orthogonal, D is a n x n diagonal matrix, and // -// V is an n x n orthogonal matrix. V' denotes the transpose of V. If // -// m < n, then the procedure may be used for the matrix A'. The singular // -// values of A are the diagonal elements of the diagonal matrix D and // -// correspond to the positive square roots of the eigenvalues of the // -// matrix A'A. // -// // -// This procedure programmed here is based on the method of Golub and // -// Reinsch as given on pages 134 - 151 of the "Handbook for Automatic // -// Computation vol II - Linear Algebra" edited by Wilkinson and Reinsch // -// and published by Springer-Verlag, 1971. 
// -// // -// The Golub and Reinsch's method for decomposing the matrix A into the // -// product U, D, and V' is performed in three stages: // -// Stage 1: Decompose A into the product of three matrices U1, B, V1' // -// A = U1 B V1' where B is a bidiagonal matrix, and U1, and V1 are a // -// product of Householder transformations. // -// Stage 2: Use Given' transformations to reduce the bidiagonal matrix // -// B into the product of the three matrices U2, D, V2'. The singular // -// value decomposition is then UDV'where U = U2 U1 and V' = V1' V2'. // -// Stage 3: Sort the matrix D in decreasing order of the singular // -// values and interchange the columns of both U and V to reflect any // -// change in the order of the singular values. // -// // -// After performing the singular value decomposition for A, call // -// Singular_Value_Decomposition to solve the equation Ax = B or call // -// Singular_Value_Decomposition_Inverse to calculate the pseudo-inverse // -// of A. // -// // -// Arguments: // -// double* A // -// On input, the pointer to the first element of the matrix // -// A[nrows][ncols]. The matrix A is unchanged. // -// int nrows // -// The number of rows of the matrix A. // -// int ncols // -// The number of columns of the matrix A. // -// double* U // -// On input, a pointer to a matrix with the same number of rows and // -// columns as the matrix A. On output, the matrix with mutually // -// orthogonal columns which is the left-most factor in the singular // -// value decomposition of A. // -// double* singular_values // -// On input, a pointer to an array dimensioned to same as the number // -// of columns of the matrix A, ncols. On output, the singular values // -// of the matrix A sorted in decreasing order. This array corresponds // -// to the diagonal matrix in the singular value decomposition of A. 
// -// double* V // -// On input, a pointer to a square matrix with the same number of rows // -// and columns as the columns of the matrix A, i.e. V[ncols][ncols]. // -// On output, the orthogonal matrix whose transpose is the right-most // -// factor in the singular value decomposition of A. // -// double* dummy_array // -// On input, a pointer to an array dimensioned to same as the number // -// of columns of the matrix A, ncols. This array is used to store // -// the super-diagonal elements resulting from the Householder reduction// -// of the matrix A to bidiagonal form. And as an input to the Given's // -// procedure to reduce the bidiagonal form to diagonal form. // -// // -// Return Values: // -// 0 Success // -// -1 Failure - During the Given's reduction of the bidiagonal form to // -// diagonal form the procedure failed to terminate within // -// MAX_ITERATION_COUNT iterations. // -// // -// Example: // -// #define M // -// #define N // -// double A[M][N]; // -// double U[M][N]; // -// double V[N][N]; // -// double singular_values[N]; // -// double* dummy_array; // -// // -// (your code to initialize the matrix A) // -// dummy_array = (double*) malloc(N * sizeof(double)); // -// if (dummy_array == NULL) {printf(" No memory available\n"); exit(0); } // -// // -// err = Singular_Value_Decomposition((double*) A, M, N, (double*) U, // -// singular_values, (double*) V, dummy_array); // -// // -// free(dummy_array); // -// if (err < 0) printf(" Failed to converge\n"); // -// else { printf(" The singular value decomposition of A is \n"); // -// ... 
// -//////////////////////////////////////////////////////////////////////////////// -// // -int Singular_Value_Decomposition(double* A, int nrows, int ncols, double* U, - double* singular_values, double* V, double* dummy_array) -{ - Householders_Reduction_to_Bidiagonal_Form( A, nrows, ncols, U, V, - singular_values, dummy_array); - - if (Givens_Reduction_to_Diagonal_Form( nrows, ncols, U, V, - singular_values, dummy_array ) < 0) return -1; - - Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, U, V); - - return 0; -} - - -//////////////////////////////////////////////////////////////////////////////// -// static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows,// -// int ncols, double* U, double* V, double* diagonal, double* superdiagonal )// -// // -// Description: // -// This routine decomposes an m x n matrix A, with m >= n, into a product // -// of the three matrices U, B, and V', i.e. A = UBV', where U is an m x n // -// matrix whose columns are orthogonal, B is a n x n bidiagonal matrix, // -// and V is an n x n orthogonal matrix. V' denotes the transpose of V. // -// If m < n, then the procedure may be used for the matrix A'. The // -// // -// The matrix U is the product of Householder transformations which // -// annihilate the subdiagonal components of A while the matrix V is // -// the product of Householder transformations which annihilate the // -// components of A to the right of the superdiagonal. 
// -// // -// The Householder transformation which leaves invariant the first k-1 // -// elements of the k-th column and annihilates the all the elements below // -// the diagonal element is P = I - (2/u'u)uu', u is an nrows-dimensional // -// vector the first k-1 components of which are zero and the last // -// components agree with the current transformed matrix below the diagonal// -// diagonal, the remaining k-th element is the diagonal element - s, where// -// s = (+/-)sqrt(sum of squares of the elements below the diagonal), the // -// sign is chosen opposite that of the diagonal element. // -// // -// Arguments: // -// double* A // -// On input, the pointer to the first element of the matrix // -// A[nrows][ncols]. The matrix A is unchanged. // -// int nrows // -// The number of rows of the matrix A. // -// int ncols // -// The number of columns of the matrix A. // -// double* U // -// On input, a pointer to a matrix with the same number of rows and // -// columns as the matrix A. On output, the matrix with mutually // -// orthogonal columns which is the left-most factor in the bidiagonal // -// decomposition of A. // -// double* V // -// On input, a pointer to a square matrix with the same number of rows // -// and columns as the columns of the matrix A, i.e. V[ncols][ncols]. // -// On output, the orthogonal matrix whose transpose is the right-most // -// factor in the bidiagonal decomposition of A. // -// double* diagonal // -// On input, a pointer to an array dimensioned to same as the number // -// of columns of the matrix A, ncols. On output, the diagonal of the // -// bidiagonal matrix. // -// double* superdiagonal // -// On input, a pointer to an array dimensioned to same as the number // -// of columns of the matrix A, ncols. On output, the superdiagonal // -// of the bidiagonal matrix. // -// // -// Return Values: // -// The function is of type void and therefore does not return a value. 
// -// The matrices U, V, and the diagonal and superdiagonal are calculated // -// using the addresses passed in the argument list. // -// // -// Example: // -// #define M // -// #define N // -// double A[M][N]; // -// double U[M][N]; // -// double V[N][N]; // -// double diagonal[N]; // -// double superdiagonal[N]; // -// // -// (your code to initialize the matrix A - Note this routine is not // -// (accessible from outside i.e. it is declared static) // -// // -// Householders_Reduction_to_Bidiagonal_Form((double*) A, nrows, ncols, // -// (double*) U, (double*) V, diagonal, superdiagonal ) // -// // -// free(dummy_array); // -// ... // -//////////////////////////////////////////////////////////////////////////////// -// // -static void Householders_Reduction_to_Bidiagonal_Form(double* A, int nrows, - int ncols, double* U, double* V, double* diagonal, double* superdiagonal ) -{ - int i,j,k,ip1; - double s, s2, si, scale; - double dum; - double *pu, *pui, *pv, *pvi; - double half_norm_squared; - -// Copy A to U - - memcpy(U,A, sizeof(double) * nrows * ncols); - -// - - diagonal[0] = 0.0; - s = 0.0; - scale = 0.0; - for ( i = 0, pui = U, ip1 = 1; i < ncols; pui += ncols, i++, ip1++ ) { - superdiagonal[i] = scale * s; -// -// Perform Householder transform on columns. -// -// Calculate the normed squared of the i-th column vector starting at -// row i. -// - for (j = i, pu = pui, scale = 0.0; j < nrows; j++, pu += ncols) - scale += fabs( *(pu + i) ); - - if (scale > 0.0) { - for (j = i, pu = pui, s2 = 0.0; j < nrows; j++, pu += ncols) { - *(pu + i) /= scale; - s2 += *(pu + i) * *(pu + i); - } -// -// -// Chose sign of s which maximizes the norm -// - s = ( *(pui + i) < 0.0 ) ? sqrt(s2) : -sqrt(s2); -// -// Calculate -2/u'u -// - half_norm_squared = *(pui + i) * s - s2; -// -// Transform remaining columns by the Householder transform. 
-// - *(pui + i) -= s; - - for (j = ip1; j < ncols; j++) { - for (k = i, si = 0.0, pu = pui; k < nrows; k++, pu += ncols) - si += *(pu + i) * *(pu + j); - si /= half_norm_squared; - for (k = i, pu = pui; k < nrows; k++, pu += ncols) { - *(pu + j) += si * *(pu + i); - } - } - } - for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) *= scale; - diagonal[i] = s * scale; -// -// Perform Householder transform on rows. -// -// Calculate the normed squared of the i-th row vector starting at -// column i. -// - s = 0.0; - scale = 0.0; - if (i >= nrows || i == (ncols - 1) ) continue; - for (j = ip1; j < ncols; j++) scale += fabs ( *(pui + j) ); - if ( scale > 0.0 ) { - for (j = ip1, s2 = 0.0; j < ncols; j++) { - *(pui + j) /= scale; - s2 += *(pui + j) * *(pui + j); - } - s = ( *(pui + ip1) < 0.0 ) ? sqrt(s2) : -sqrt(s2); -// -// Calculate -2/u'u -// - half_norm_squared = *(pui + ip1) * s - s2; -// -// Transform the rows by the Householder transform. -// - *(pui + ip1) -= s; - for (k = ip1; k < ncols; k++) - superdiagonal[k] = *(pui + k) / half_norm_squared; - if ( i < (nrows - 1) ) { - for (j = ip1, pu = pui + ncols; j < nrows; j++, pu += ncols) { - for (k = ip1, si = 0.0; k < ncols; k++) - si += *(pui + k) * *(pu + k); - for (k = ip1; k < ncols; k++) { - *(pu + k) += si * superdiagonal[k]; - } - } - } - for (k = ip1; k < ncols; k++) *(pui + k) *= scale; - } - } - -// Update V - pui = U + ncols * (ncols - 2); - pvi = V + ncols * (ncols - 1); - *(pvi + ncols - 1) = 1.0; - s = superdiagonal[ncols - 1]; - pvi -= ncols; - for (i = ncols - 2, ip1 = ncols - 1; i >= 0; i--, pui -= ncols, - pvi -= ncols, ip1-- ) { - if ( s != 0.0 ) { - pv = pvi + ncols; - for (j = ip1; j < ncols; j++, pv += ncols) - *(pv + i) = ( *(pui + j) / *(pui + ip1) ) / s; - for (j = ip1; j < ncols; j++) { - si = 0.0; - for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols) - si += *(pui + k) * *(pv + j); - for (k = ip1, pv = pvi + ncols; k < ncols; k++, pv += ncols) - *(pv + j) += si * *(pv + 
i); - } - } - pv = pvi + ncols; - for ( j = ip1; j < ncols; j++, pv += ncols ) { - *(pvi + j) = 0.0; - *(pv + i) = 0.0; - } - *(pvi + i) = 1.0; - s = superdiagonal[i]; - } - -// Update U - - pui = U + ncols * (ncols - 1); - for (i = ncols - 1, ip1 = ncols; i >= 0; ip1 = i, i--, pui -= ncols ) { - s = diagonal[i]; - for ( j = ip1; j < ncols; j++) *(pui + j) = 0.0; - if ( s != 0.0 ) { - for (j = ip1; j < ncols; j++) { - si = 0.0; - pu = pui + ncols; - for (k = ip1; k < nrows; k++, pu += ncols) - si += *(pu + i) * *(pu + j); - si = (si / *(pui + i) ) / s; - for (k = i, pu = pui; k < nrows; k++, pu += ncols) - *(pu + j) += si * *(pu + i); - } - for (j = i, pu = pui; j < nrows; j++, pu += ncols){ - *(pu + i) /= s; - } - } - else - for (j = i, pu = pui; j < nrows; j++, pu += ncols) *(pu + i) = 0.0; - *(pui + i) += 1.0; - } -} - - -//////////////////////////////////////////////////////////////////////////////// -// static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, // -// double* U, double* V, double* diagonal, double* superdiagonal ) // -// // -// Description: // -// This routine decomposes a bidiagonal matrix given by the arrays // -// diagonal and superdiagonal into a product of three matrices U1, D and // -// V1', the matrix U1 premultiplies U and is returned in U, the matrix // -// V1 premultiplies V and is returned in V. The matrix D is a diagonal // -// matrix and replaces the array diagonal. // -// // -// The method used to annihilate the offdiagonal elements is a variant // -// of the QR transformation. The method consists of applying Givens // -// rotations to the right and the left of the current matrix until // -// the new off-diagonal elements are chased out of the matrix. // -// // -// The process is an iterative process which due to roundoff errors may // -// not converge within a predefined number of iterations. (This should // -// be unusual.) // -// // -// Arguments: // -// int nrows // -// The number of rows of the matrix U. 
// -// int ncols // -// The number of columns of the matrix U. // -// double* U // -// On input, a pointer to a matrix already initialized to a matrix // -// with mutually orthogonal columns. On output, the matrix with // -// mutually orthogonal columns. // -// double* V // -// On input, a pointer to a square matrix with the same number of rows // -// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. // -// The matrix V is assumed to be initialized to an orthogonal matrix. // -// On output, V is an orthogonal matrix. // -// double* diagonal // -// On input, a pointer to an array of dimension ncols which initially // -// contains the diagonal of the bidiagonal matrix. On output, the // -// it contains the diagonal of the diagonal matrix. // -// double* superdiagonal // -// On input, a pointer to an array of dimension ncols which initially // -// the first component is zero and the successive components form the // -// superdiagonal of the bidiagonal matrix. // -// // -// Return Values: // -// 0 Success // -// -1 Failure - The procedure failed to terminate within // -// MAX_ITERATION_COUNT iterations. // -// // -// Example: // -// #define M // -// #define N // -// double U[M][N]; // -// double V[N][N]; // -// double diagonal[N]; // -// double superdiagonal[N]; // -// int err; // -// // -// (your code to initialize the matrices U, V, diagonal, and ) // -// ( superdiagonal. - Note this routine is not accessible from outside) // -// ( i.e. it is declared static.) // -// // -// err = Givens_Reduction_to_Diagonal_Form( M,N,(double*)U,(double*)V, // -// diagonal, superdiagonal ); // -// if ( err < 0 ) printf("Failed to converge\n"); // -// else { ... } // -// ... 
// -//////////////////////////////////////////////////////////////////////////////// -// // -static int Givens_Reduction_to_Diagonal_Form( int nrows, int ncols, - double* U, double* V, double* diagonal, double* superdiagonal ) -{ - - double epsilon; - double c, s; - double f,g,h; - double x,y,z; - double *pu, *pv; - int i,j,k,m; - int rotation_test; - int iteration_count; - - for (i = 0, x = 0.0; i < ncols; i++) { - y = fabs(diagonal[i]) + fabs(superdiagonal[i]); - if ( x < y ) x = y; - } - epsilon = x * DBL_EPSILON; - for (k = ncols - 1; k >= 0; k--) { - iteration_count = 0; - while(1) { - rotation_test = 1; - for (m = k; m >= 0; m--) { - if (fabs(superdiagonal[m]) <= epsilon) {rotation_test = 0; break;} - if (fabs(diagonal[m-1]) <= epsilon) break; - } - if (rotation_test) { - c = 0.0; - s = 1.0; - for (i = m; i <= k; i++) { - f = s * superdiagonal[i]; - superdiagonal[i] *= c; - if (fabs(f) <= epsilon) break; - g = diagonal[i]; - h = sqrt(f*f + g*g); - diagonal[i] = h; - c = g / h; - s = -f / h; - for (j = 0, pu = U; j < nrows; j++, pu += ncols) { - y = *(pu + m - 1); - z = *(pu + i); - *(pu + m - 1 ) = y * c + z * s; - *(pu + i) = -y * s + z * c; - } - } - } - z = diagonal[k]; - if (m == k ) { - if ( z < 0.0 ) { - diagonal[k] = -z; - for ( j = 0, pv = V; j < ncols; j++, pv += ncols) - *(pv + k) = - *(pv + k); - } - break; - } - else { - if ( iteration_count >= MAX_ITERATION_COUNT ) return -1; - iteration_count++; - x = diagonal[m]; - y = diagonal[k-1]; - g = superdiagonal[k-1]; - h = superdiagonal[k]; - f = ( (y - z) * ( y + z ) + (g - h) * (g + h) )/(2.0 * h * y); - g = sqrt( f * f + 1.0 ); - if ( f < 0.0 ) g = -g; - f = ( (x - z) * (x + z) + h * (y / (f + g) - h) ) / x; -// Next QR Transformtion - c = 1.0; - s = 1.0; - for (i = m + 1; i <= k; i++) { - g = superdiagonal[i]; - y = diagonal[i]; - h = s * g; - g *= c; - z = sqrt( f * f + h * h ); - superdiagonal[i-1] = z; - c = f / z; - s = h / z; - f = x * c + g * s; - g = -x * s + g * c; - h = y * s; - y *= c; - 
for (j = 0, pv = V; j < ncols; j++, pv += ncols) { - x = *(pv + i - 1); - z = *(pv + i); - *(pv + i - 1) = x * c + z * s; - *(pv + i) = -x * s + z * c; - } - z = sqrt( f * f + h * h ); - diagonal[i - 1] = z; - if (z != 0.0) { - c = f / z; - s = h / z; - } - f = c * g + s * y; - x = -s * g + c * y; - for (j = 0, pu = U; j < nrows; j++, pu += ncols) { - y = *(pu + i - 1); - z = *(pu + i); - *(pu + i - 1) = c * y + s * z; - *(pu + i) = -s * y + c * z; - } - } - superdiagonal[m] = 0.0; - superdiagonal[k] = f; - diagonal[k] = x; - } - } - } - return 0; -} - - -//////////////////////////////////////////////////////////////////////////////// -// static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols, // -// double* singular_values, double* U, double* V) // -// // -// Description: // -// This routine sorts the singular values from largest to smallest // -// singular value and interchanges the columns of U and the columns of V // -// whenever a swap is made. I.e. if the i-th singular value is swapped // -// with the j-th singular value, then the i-th and j-th columns of U are // -// interchanged and the i-th and j-th columns of V are interchanged. // -// // -// Arguments: // -// int nrows // -// The number of rows of the matrix U. // -// int ncols // -// The number of columns of the matrix U. // -// double* singular_values // -// On input, a pointer to the array of singular values. On output, the// -// sorted array of singular values. // -// double* U // -// On input, a pointer to a matrix already initialized to a matrix // -// with mutually orthogonal columns. On output, the matrix with // -// mutually orthogonal possibly permuted columns. // -// double* V // -// On input, a pointer to a square matrix with the same number of rows // -// and columns as the columns of the matrix U, i.e. V[ncols][ncols]. // -// The matrix V is assumed to be initialized to an orthogonal matrix. 
// -// On output, V is an orthogonal matrix with possibly permuted columns.// -// // -// Return Values: // -// The function is of type void. // -// // -// Example: // -// #define M // -// #define N // -// double U[M][N]; // -// double V[N][N]; // -// double diagonal[N]; // -// // -// (your code to initialize the matrices U, V, and diagonal. ) // -// ( - Note this routine is not accessible from outside) // -// ( i.e. it is declared static.) // -// // -// Sort_by_Decreasing_Singular_Values(nrows, ncols, singular_values, // -// (double*) U, (double*) V); // -// ... // -//////////////////////////////////////////////////////////////////////////////// -// // -static void Sort_by_Decreasing_Singular_Values(int nrows, int ncols, - double* singular_values, double* U, double* V) -{ - int i,j,max_index; - double temp; - double *p1, *p2; - - for (i = 0; i < ncols - 1; i++) { - max_index = i; - for (j = i + 1; j < ncols; j++) - if (singular_values[j] > singular_values[max_index] ) - max_index = j; - if (max_index == i) continue; - temp = singular_values[i]; - singular_values[i] = singular_values[max_index]; - singular_values[max_index] = temp; - p1 = U + max_index; - p2 = U + i; - for (j = 0; j < nrows; j++, p1 += ncols, p2 += ncols) { - temp = *p1; - *p1 = *p2; - *p2 = temp; - } - p1 = V + max_index; - p2 = V + i; - for (j = 0; j < ncols; j++, p1 += ncols, p2 += ncols) { - temp = *p1; - *p1 = *p2; - *p2 = temp; - } - } -} - - -//////////////////////////////////////////////////////////////////////////////// -// void Singular_Value_Decomposition_Solve(double* U, double* D, double* V, // -// double tolerance, int nrows, int ncols, double *B, double* x) // -// // -// Description: // -// This routine solves the system of linear equations Ax=B where A =UDV', // -// is the singular value decomposition of A. Given UDV'x=B, then // -// x = V(1/D)U'B, where 1/D is the pseudo-inverse of D, i.e. if D[i] > 0 // -// then (1/D)[i] = 1/D[i] and if D[i] = 0, then (1/D)[i] = 0. 
Since // -// the singular values are subject to round-off error. A tolerance is // -// given so that if D[i] < tolerance, D[i] is treated as if it is 0. // -// The default tolerance is D[0] * DBL_EPSILON * ncols, if the user // -// specified tolerance is less than the default tolerance, the default // -// tolerance is used. // -// // -// Arguments: // -// double* U // -// A matrix with mutually orthonormal columns. // -// double* D // -// A diagonal matrix with decreasing non-negative diagonal elements. // -// i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i. // -// double* V // -// An orthogonal matrix. // -// double tolerance // -// An lower bound for non-zero singular values (provided tolerance > // -// ncols * DBL_EPSILON * D[0]). // -// int nrows // -// The number of rows of the matrix U and B. // -// int ncols // -// The number of columns of the matrix U. Also the number of rows and // -// columns of the matrices D and V. // -// double* B // -// A pointer to a vector dimensioned as nrows which is the right-hand // -// side of the equation Ax = B where A = UDV'. // -// double* x // -// A pointer to a vector dimensioned as ncols, which is the least // -// squares solution of the equation Ax = B where A = UDV'. // -// // -// Return Values: // -// The function is of type void. // -// // -// Example: // -// #define M // -// #define N // -// #define NB // -// double U[M][N]; // -// double V[N][N]; // -// double D[N]; // -// double B[M]; // -// double x[N]; // -// double tolerance; // -// // -// (your code to initialize the matrices U,D,V,B) // -// // -// Singular_Value_Decomposition_Solve((double*) U, D, (double*) V, // -// tolerance, M, N, B, x, bcols) // -// // -// printf(" The solution of Ax=B is \n"); // -// ... 
// -//////////////////////////////////////////////////////////////////////////////// -// // - -void Singular_Value_Decomposition_Solve(double* U, double* D, double* V, - double tolerance, int nrows, int ncols, double *B, double* x) -{ - int i,j,k; - double *pu, *pv; - double dum; - - dum = DBL_EPSILON * D[0] * (double) ncols; - if (tolerance < dum) tolerance = dum; - - for ( i = 0, pv = V; i < ncols; i++, pv += ncols) { - x[i] = 0.0; - for (j = 0; j < ncols; j++) - if (D[j] > tolerance ) { - for (k = 0, dum = 0.0, pu = U; k < nrows; k++, pu += ncols) - dum += *(pu + j) * B[k]; - x[i] += dum * *(pv + j) / D[j]; - } - } -} - - -//////////////////////////////////////////////////////////////////////////////// -// void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V,// -// double tolerance, int nrows, int ncols, double *Astar) // -// // -// Description: // -// This routine calculates the pseudo-inverse of the matrix A = UDV'. // -// where U, D, V constitute the singular value decomposition of A. // -// Let Astar be the pseudo-inverse then Astar = V(1/D)U', where 1/D is // -// the pseudo-inverse of D, i.e. if D[i] > 0 then (1/D)[i] = 1/D[i] and // -// if D[i] = 0, then (1/D)[i] = 0. Because the singular values are // -// subject to round-off error. A tolerance is given so that if // -// D[i] < tolerance, D[i] is treated as if it were 0. // -// The default tolerance is D[0] * DBL_EPSILON * ncols, assuming that the // -// diagonal matrix of singular values is sorted from largest to smallest, // -// if the user specified tolerance is less than the default tolerance, // -// then the default tolerance is used. // -// // -// Arguments: // -// double* U // -// A matrix with mutually orthonormal columns. // -// double* D // -// A diagonal matrix with decreasing non-negative diagonal elements. // -// i.e. D[i] > D[j] if i < j and D[i] >= 0 for all i. // -// double* V // -// An orthogonal matrix. 
// -// double tolerance // -// An lower bound for non-zero singular values (provided tolerance > // -// ncols * DBL_EPSILON * D[0]). // -// int nrows // -// The number of rows of the matrix U and B. // -// int ncols // -// The number of columns of the matrix U. Also the number of rows and // -// columns of the matrices D and V. // -// double* Astar // -// On input, a pointer to the first element of an ncols x nrows matrix.// -// On output, the pseudo-inverse of UDV'. // -// // -// Return Values: // -// The function is of type void. // -// // -// Example: // -// #define M // -// #define N // -// double U[M][N]; // -// double V[N][N]; // -// double D[N]; // -// double Astar[N][M]; // -// double tolerance; // -// // -// (your code to initialize the matrices U,D,V) // -// // -// Singular_Value_Decomposition_Inverse((double*) U, D, (double*) V, // -// tolerance, M, N, (double*) Astar); // -// // -// printf(" The pseudo-inverse of A = UDV' is \n"); // -// ... // -//////////////////////////////////////////////////////////////////////////////// -// // - -void Singular_Value_Decomposition_Inverse(double* U, double* D, double* V, - double tolerance, int nrows, int ncols, double *Astar) -{ - int i,j,k; - double *pu, *pv, *pa; - double dum; - - dum = DBL_EPSILON * D[0] * (double) ncols; - if (tolerance < dum) tolerance = dum; - for ( i = 0, pv = V, pa = Astar; i < ncols; i++, pv += ncols) - for ( j = 0, pu = U; j < nrows; j++, pa++) - for (k = 0, *pa = 0.0; k < ncols; k++, pu++) - if (D[k] > tolerance) *pa += *(pv + k) * *pu / D[k]; -} diff --git a/software/apps/svd/main.c b/software/apps/svd/main.c deleted file mode 100644 index 8a217c0cd..000000000 --- a/software/apps/svd/main.c +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "encoding.h" -#include "printf.h" -#include "runtime.h" -#include "synchronization.h" - -#include "nrutil.h" -#include "svd.c" - -// Define Matrix dimensions: -#define M 4 -#define N 32 - -int32_t matrix_U[M * N] __attribute__((section(".l1_prio"))); -int32_t matrix_V[M * N] __attribute__((section(".l1_prio"))); -int32_t matrix_W[N] __attribute__((section(".l1_prio"))); - -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} - -void init_vector(int32_t *vector, uint32_t num_el, int32_t a, int32_t b, - uint32_t core_id) { - uint32_t const split = 8; // How many blocks to split the vector into - uint32_t const reminder = num_el % split; - uint32_t i, j; - for (i = core_id * split; i < core_id * split + split; i++) { - j = i % split; - vector[i] = a * (int32_t)j + b; - } - while (i < reminder) { - j = i % split; - vector[i] = a * (int32_t)j + b; - } -} - -int volatile error __attribute__((section(".l1"))); - 
-int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - int32_t const U_a = 1; - int32_t const U_b = 1; - int32_t const U_c = -32; - int32_t const V_a = 2; - int32_t const V_b = 1; - int32_t const V_c = 16; - // Init matrix - init_matrix(matrix_U, M, N, U_a, U_b, U_c, core_id, num_cores); - init_matrix(matrix_V, M, N, V_a, V_b, V_c, core_id, num_cores); - init_vector(matrix_W, N, V_a, V_b, core_id); - mempool_barrier(num_cores); - - if (core_id == 0) { - // Test the Matri x SVD - svdcmp(matrix_U, M, N, matrix_W, matrix_V); - } - - // Wait until all cores have finished - mempool_barrier(num_cores); - - return error; -} diff --git a/software/apps/svd/nrutil.h b/software/apps/svd/nrutil.h deleted file mode 100644 index a137444ab..000000000 --- a/software/apps/svd/nrutil.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -//#include -//#include -//#include - -#ifndef NR_UTILS_H -#define NR_UTILS_H - -#define NR_END 1 -#define FREE_ARG char * - -static int32_t sqrarg; -#define SQR(a) ((sqrarg = (a)) == 0 ? 0 : sqrarg * sqrarg) -static int32_t dsqrarg; -#define DSQR(a) ((dsqrarg = (a)) == 0 ? 0 : dsqrarg * dsqrarg) -static int32_t dmaxarg1, dmaxarg2; -#define DMAX(a, b) \ - (dmaxarg1 = (a), dmaxarg2 = (b), \ - (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2)) -static int32_t dminarg1, dminarg2; -#define DMIN(a, b) \ - (dminarg1 = (a), dminarg2 = (b), \ - (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2)) -static int32_t maxarg1, maxarg2; -#define FMAX(a, b) \ - (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? 
(maxarg1) : (maxarg2)) -static int32_t minarg1, minarg2; -#define FMIN(a, b) \ - (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2)) -static long lmaxarg1, lmaxarg2; -#define LMAX(a, b) \ - (lmaxarg1 = (a), lmaxarg2 = (b), \ - (lmaxarg1) > (lmaxarg2) ? (lmaxarg1) : (lmaxarg2)) -static long lminarg1, lminarg2; -#define LMIN(a, b) \ - (lminarg1 = (a), lminarg2 = (b), \ - (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2)) -static int32_t imaxarg1, imaxarg2; -#define IMAX(a, b) \ - (imaxarg1 = (a), imaxarg2 = (b), \ - (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2)) -static int32_t iminarg1, iminarg2; -#define IMIN(a, b) \ - (iminarg1 = (a), iminarg2 = (b), \ - (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2)) -#define ABS(a) (a < 0 ? -a : a) -#define SIGN(a, b) ((b) >= 0 ? ABS(a) : -ABS(a)) - -int32_t sqrt_q32(const int32_t number, const uint32_t fracBits); - -#define sqrt2 0b1011010100000100 -int32_t sqrt_q32(const int32_t number, const uint32_t fracBits) { - - int32_t root = 0; - int32_t start = 0; - int32_t end = 46341; // smallest integer that is larger than sqrt(0x7FFFFFFF) - int32_t mid; - - if (number > 0) { - while (start <= end) { - mid = (start + end) >> 1; - if (((mid * mid) >> fracBits) == number) { - root = mid; - break; - } - if (((mid * mid) >> fracBits) < number) { - start = mid + 1; - root = mid; - } else { - end = mid - 1; - } - } - } - - return root; -} - -#endif diff --git a/software/apps/svd/svd.c b/software/apps/svd/svd.c deleted file mode 100644 index fa2fcbd0c..000000000 --- a/software/apps/svd/svd.c +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -int32_t pythag(int32_t a, int32_t b); -void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v); - -int32_t pythag(int32_t a, int32_t b) { - int32_t absa = ABS(a); - int32_t absb = ABS(b); - if (absa > absb) { - return absa * sqrt_q32(1 + SQR(absb / absa), 4); - } else { - return (absb == 0 ? 0 : absb * sqrt_q32(1 + SQR(absa / absb), 4)); - } -} - -void svdcmp(int32_t *a, int32_t m, int32_t n, int32_t *w, int32_t *v) { - int32_t flag, i, its, j, jj, k, l, nm; - int32_t anorm, c, f, g, h, s, scale, x, y, z; - int32_t rv1[n]; - - // printf("PROVA\n"); - - g = scale = anorm = 0.0; - for (i = 1; i <= n; i++) { - l = i + 1; - rv1[i] = scale * g; - g = s = scale = 0.0; - if (i <= m) { - for (k = i; k <= m; k++) { - scale += ABS(a[k * m + i]); - } - if (scale) { - for (k = i; k <= m; k++) { - a[k * m + i] /= scale; - s += a[k * m + i] * a[k * m + i]; - } - f = a[i * m + i]; - g = -SIGN(sqrt_q32(s, 4), f); - h = f * g - s; - a[i * m + i] = f - g; - for (j = l; j <= n; j++) { - for (s = 0.0, k = i; k <= m; k++) { - s += a[k * m + i] * a[k * m + i]; - } - f = s / h; - for (k = i; k <= m; k++) { - a[k * m + i] += f * a[k * m + i]; - } - } - for (k = i; k <= m; k++) { - a[k * m + i] *= scale; - } - } - } - w[i] = scale * g; - g = s = scale = 0.0; - if (i <= m && i != n) { - for (k = l; k <= n; k++) { - scale += ABS(a[k * m + i]); - } - if (scale) { - for (k = l; k <= n; k++) { - a[k * m + i] /= scale; - s += a[i * m + k] * a[i * m + k]; - } - f = a[i * m + l]; - g = -SIGN(sqrt_q32(s, 4), f); - h = f * g - s; - a[i * m + l] = f - g; - for (k = l; k <= n; k++) { - rv1[k] = a[i * m + k] / h; - } - for (j = l; j <= m; j++) { - for (s = 0, k = l; k <= n; k++) { - s += a[j * m + k] * a[i * m + k]; - } - for (k = l; k <= n; k++) { - a[j * m + k] += s * rv1[k]; - } - } - for (k = l; k <= n; k++) { - a[i * m + k] *= scale; - } - } - } - anorm = FMAX(anorm, (ABS(w[i]) + ABS(rv1[i]))); - } 
- - for (i = n; i >= 1; i--) { - if (i < n) { - if (g) { - for (j = l; j <= n; j++) { - v[j * m + i] = (a[i * m + j] / a[i * m + j]) / g; - } - for (j = l; j <= n; j++) { - for (s = 0, k = l; k <= n; k++) { - s += a[i * m + k] * v[k * m + j]; - } - for (k = l; k <= n; k++) { - v[k * m + j] += s * v[k * m + i]; - } - } - } - for (j = l; j <= n; j++) { - v[i * m + j] = v[j * m + i] = 0; - } - } - v[i * m + i] = 1; - g = rv1[i]; - l = i; - } - - // for (i = IMIN(m, n); i >= 1; i--) { - // l = i + 1; - // g = w[i]; - // for (j = l; j <= n; j++) { - // a[i][j] = 0; - // } - // if (g) { - // g = 1.0 / g; - // for (j = l; j <= n; j++) { - // for (s = 0.0, k = l; k <= m; k++) { - // s += a[k][i] * a[k][j]; - // } - // f = (s / a[i][i]) * g; - // for (k = i; k <= m; k++) { - // a[k][j] += f * a[k][i]; - // } - // } - // for (j = i; j <= m; j++) { - // a[j][i] *= g; - // } - // } else { for (j = i; j <= m; j++) { - // a[j][i] = 0.0; - // } - // } - // ++a[i][i]; - // } - // for (k = n; k >= 1; k--) { - // for (its = 1; its <= 30; its++) { - // flag = 1; - // for (l = k; l >= 1; l--) { - // nm = l - 1; - // if ((int32_t) (ABS(rv1[l]) + anorm) == anorm) { - // flag = 0; - // break; - // } - // if ((int32_t) (ABS(w[nm]) + anorm) == anorm) { - // break; - // } - // } - // if (flag) { - // c = 0.0; - // s = 1.0; - // for (i = l; i <= k; i++) { - // f = s * rv1[i]; - // rv1[i] = c * rv1[i]; - // if ((int32_t) (ABS(f) + anorm) == anorm) { - // break; - // } - // g = w[i]; - // h = pythag(f, g); - // w[i] = h; - // h = 1.0 / h; - // c = g * h; - // s = -f * h; - // for (j = 1; j <= m; j++) { - // y = a[j][nm]; - // z = a[j][i]; - // a[j][nm] = y * c + z * s; - // a[j][i] = z * c - y * s; - // } - // } - // } - // z = w[k]; - // if (l == k) { - // if (z < 0.0) { - // w[k] = -z; - // for (j = 1; j <= n; j++) { - // v[j][k] = -v[j][k]; - // } - // } - // break; - // } - // if (its == 30) { - // exit(1); - // } - // x = w[l]; - // nm = k - 1; - // y = w[nm]; - // g = rv1[nm]; - // h = 
rv1[k]; - // f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); - // g = pythag(f, 1.0); - // f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; - // c = s = 1.0; - // for (j = l; j <= nm; j++) { - // i = j + 1; - // g = rv1[i]; - // y = w[i]; - // h = s * g; - // g = c * g; - // z = pythag(f, h); - // rv1[j] = z; - // c = f / z; - // s = h / z; - // f = x * c + g * s; - // g = g * c - x * s; - // h = y * s; - // y *= c; - // for (jj = 1; jj <= n; jj++) { - // x = v[jj][j]; - // z = v[jj][i]; - // v[jj][j] = x * c + z * s; - // v[jj][i] = z * c - x * s; - // } - // z = pythag(f, h); - // w[j] = z; - // if (z) { - // z = 1.0 / z; - // c = f * z; - // s = h * z; - // } - // f = c * g + s * y; - // x = c * y - s * g; - // for (jj = 1; jj <= m; jj++) { - // y = a[jj][j]; - // z = a[jj][i]; - // a[jj][j] = y * c + z * s; - // a[jj][i] = z * c - y * s; - // } - // } - // rv1[l] = 0.0; - // rv1[k] = f; - // w[k] = x; - // } - // } -} From 0fbf978d877baa4f718ab459032660507acd5b48 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Wed, 14 Dec 2022 15:10:13 +0100 Subject: [PATCH 19/22] [software] Fix reading of the number of cores --- software/apps/mat_inv/mempool_mat_inv_q32p.h | 27 ++++++++++--------- .../mat_inv/mempool_mat_inv_q32p_folded.h | 3 ++- .../mat_inv/mempool_mat_inv_q32p_memsized.h | 21 ++++++++------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h index 09e2b449f..c79548185 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p.h @@ -24,6 +24,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, int32_t out1, out2, out3, out4; uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); uint32_t i, j, loopCnt, k, l; /* loop counters */ uint32_t m = n; /* M is the number of rows. However, the matirces must be square. 
*/ @@ -31,7 +32,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, /* CREATE THE IDENTITY MATRIX */ pDstT1 = pDst; - for (k = core_id * 4; k < m; k += 4 * NUM_CORES) { + for (k = core_id * 4; k < m; k += 4 * num_cores) { for (j = 0; j < m; j++) { pDstT1[k * m + j] = (uint32_t)(k == j); pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j); @@ -39,7 +40,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j); } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); /* Loop over the number of columns of the input matrix. */ loopCnt = n; @@ -125,7 +126,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, return 1; } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); /* DIVIDE BY THE PIVOT */ /* Points to the pivot row of input and destination matrices */ @@ -138,7 +139,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, in = *pPivotRowIn; ///* Loop over columns to the right of pivot */ - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { in1 = pSrcT1[j]; in2 = pSrcT1[j + 1]; in3 = pSrcT1[j + 2]; @@ -151,7 +152,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, pSrcT1[j + 1] = out2; pSrcT1[j + 2] = out3; pSrcT1[j + 3] = out4; - // j += NUM_CORES * 4; + // j += num_cores * 4; } if (core_id == (n >> 2U) - 1) { j = 4 * ((n - l) >> 2U); @@ -162,7 +163,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, } } /* Loop over columns */ - for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { in1 = pSrcT2[j]; in2 = pSrcT2[j + 1]; in3 = pSrcT2[j + 2]; @@ -184,13 +185,13 @@ int mempool_GJinv_q32p(int32_t *pSrc, 
int32_t *pDst, uint32_t n, j++; } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); /* REPLACE ROWS */ pSrcT1 = pSrc; pSrcT2 = pDst; /* Loop over rows */ - for (k = core_id * 4; k < m; k += NUM_CORES * 4) { + for (k = core_id * 4; k < m; k += num_cores * 4) { i = 0U; while (i < 4) { if ((i + k) != l) { @@ -250,7 +251,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, i++; } } - mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); // /* REPLACE ROWS */ // pSrcT1 = pSrc; @@ -280,7 +281,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * NUM_CORES; + // j += 4 * num_cores; // } // if (core_id == (n >> 2U) - 1) { // j = 4 * ((n - l) >> 2U); @@ -306,7 +307,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * NUM_CORES; + // j += 4 * num_cores; // } // if (core_id == (n >> 2U) - 1) { // j = 4 * (n >> 2U); @@ -317,11 +318,11 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, // j++; // } // } - // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / + // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / // 4)); // } // } - // mempool_log_partial_barrier(2, core_id, MIN(NUM_CORES, n / 4)); + // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); pSrc++; /* Increment the input pointer */ loopCnt--; /* Decrement the loop counter */ diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h index 6064a1faf..5015039ff 100644 --- 
a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h @@ -12,8 +12,9 @@ void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); uint32_t i, j, k, shift; - for (i = core_id * 4; i < n * n; i += NUM_CORES * 4) { + for (i = core_id * 4; i < n * n; i += num_cores * 4) { k = i / n; j = i % n; shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h index b697f9d24..3a5bfe5c0 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h +++ b/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h @@ -26,6 +26,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, int32_t out1, out2, out3, out4; uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = absolute_core_id; uint32_t i, j, k, l; /* loop counters */ uint32_t m = @@ -34,7 +35,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, /* CREATE THE IDENTITY MATRIX */ pDstT1 = pDst; - for (k = core_id * 4; k < m; k += NUM_CORES * 4) { + for (k = core_id * 4; k < m; k += num_cores * 4) { for (j = 0; j < n; j++) { pDstT1[k * n + j] = (uint32_t)(k == j); pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j); @@ -43,7 +44,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, } } // pDstT1 = pDst; - // for (i = absolute_core_id * 4; i < n * m; i += NUM_CORES * 4) { + // for (i = absolute_core_id * 4; i < n * m; i += num_cores * 4) { // k = i / n; // j = i % n; // pDstT1[k * n + j] = (uint32_t) (k == j); @@ -147,8 +148,8 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, in = *pPivotRowIn; /* Loop over columns to the right 
of pivot */ core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U); - core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; - // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += NUM_CORES * 4) { + core_id = core_id > num_cores ? core_id + num_cores : core_id; + // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { // in1 = pSrcT1[j]; // in2 = pSrcT1[j + 1]; // in3 = pSrcT1[j + 2]; @@ -196,8 +197,8 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, } /* Loop over columns */ core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U); - core_id = core_id > NUM_CORES ? core_id + NUM_CORES : core_id; - for (j = core_id * 4; j < 4 * (n >> 2U); j += NUM_CORES * 4) { + core_id = core_id > num_cores ? core_id + num_cores : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { in1 = pSrcT2[j]; in2 = pSrcT2[j + 1]; in3 = pSrcT2[j + 2]; @@ -224,7 +225,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, /* REPLACE ROWS */ pSrcT1 = pSrc; pSrcT2 = pDst; - for (k = absolute_core_id / (n >> 2U); k < m; k += NUM_CORES / (n >> 2U)) { + for (k = absolute_core_id / (n >> 2U); k < m; k += num_cores / (n >> 2U)) { /* Only the columns to the right of the pivot are to be processed */ if (k != l) { pSrcT1 = pSrc + k * n; @@ -369,7 +370,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, // uint32_t check = 0; // if (absolute_core_id >= m * nPE) // mempool_wfi(); - // for (k = absolute_core_id / nPE; k < m; k += NUM_CORES / nPE) { + // for (k = absolute_core_id / nPE; k < m; k += num_cores / nPE) { // /* Only the columns to the right of the pivot are to be // processed */ if (k != l) { // pSrcT1 = pSrc + k * n; @@ -504,7 +505,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, // /* REPLACE ROWS */ // pSrcT1 = pSrc; // pSrcT2 = pDst; - // for (i = absolute_core_id * 4; i < (n * m); i += NUM_CORES * 4) { + // for (i = absolute_core_id 
* 4; i < (n * m); i += num_cores * 4) { // k = i / n; // if (k != l) { // in = *(pSrc + k * n); @@ -559,7 +560,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, // pSrcT1 = pSrc; // pSrcT2 = pDst; // core_id = absolute_core_id; - // for (k = core_id; k < m; k += NUM_CORES) { + // for (k = core_id; k < m; k += num_cores) { // /* Only the columns to the right of the pivot are to be // processed */ if (k != l) { // pSrcT1 = pSrc + k * n; From 4c42194546d86d3d70831551fc73e2820b3055a5 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Thu, 13 Apr 2023 09:35:42 +0200 Subject: [PATCH 20/22] [software] Move the kernels to runtime/kernels folder [software] Add comment on algorithm --- software/apps/mat_inv/initialization.h | 7 - software/apps/mat_inv/main.c | 17 +- software/apps/mat_inv/mempool_mat_inv_q32p.h | 341 ---------- .../mat_inv/mempool_mat_inv_q32p_folded.h | 291 -------- .../kernel/mempool_mat_inv_q32p.h} | 626 +++++++++++++++++- .../kernel}/mempool_mat_inv_q32s.h | 35 +- 6 files changed, 653 insertions(+), 664 deletions(-) delete mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p.h delete mode 100644 software/apps/mat_inv/mempool_mat_inv_q32p_folded.h rename software/{apps/mat_inv/mempool_mat_inv_q32p_memsized.h => runtime/kernel/mempool_mat_inv_q32p.h} (54%) rename software/{apps/mat_inv => runtime/kernel}/mempool_mat_inv_q32s.h (87%) diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h index 6e48e7951..a37d5f38c 100644 --- a/software/apps/mat_inv/initialization.h +++ b/software/apps/mat_inv/initialization.h @@ -31,13 +31,6 @@ void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, uint32_t core_id); void display(int32_t *A, int32_t n, int32_t m) { - // int32_t i, j; - // for (i = 0; i < n; i++) { - // for (j = 0; j < m; j++) { - // printf("%8d ", A[i * m + j]); - // } - // printf("\n"); - //} int32_t i; for (i = 0; i < n * m; i++) { printf("Output[%d] = %8d\n", 
i, A[i]); diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index ebe4eca06..c89edb079 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -11,21 +11,18 @@ #define N 16 #define M 16 -#define O 16 #define N_BANKS (1024) -#define N_USED_BANKS (64) +#define N_USED_BANKS (16) #define VERBOSE -// #define SINGLE +#define SINGLE // #define PARALLEL -#define MEMSIZED +// #define MEMSIZED // #define FOLDED #include "initialization.h" -#include "mempool_mat_inv_q32p.h" -#include "mempool_mat_inv_q32p_folded.h" -#include "mempool_mat_inv_q32p_memsized.h" -#include "mempool_mat_inv_q32s.h" +#include "kernel/mempool_mat_inv_q32p.h" +#include "kernel/mempool_mat_inv_q32s.h" #ifdef FOLDED int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1"))); @@ -107,7 +104,7 @@ void multi_core_memsized() { mempool_barrier(num_cores); mempool_start_benchmark(); - mempool_GJinv_q32p_memsized(matrix, inv, M, &flag); + mempool_GJinv_memsized_q32p(matrix, inv, M, &flag); mempool_stop_benchmark(); mempool_barrier(num_cores); @@ -141,7 +138,7 @@ void multi_core_folded() { mempool_stop_benchmark(); if (core_id < nPE) { mempool_start_benchmark(); - mempool_GJinv_q32p_folded(folded_matrix, inv, M, &flag, nPE); + mempool_GJinv_folded_q32p(folded_matrix, inv, M, &flag, nPE); mempool_stop_benchmark(); } mempool_barrier(num_cores); diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p.h b/software/apps/mat_inv/mempool_mat_inv_q32p.h deleted file mode 100644 index c79548185..000000000 --- a/software/apps/mat_inv/mempool_mat_inv_q32p.h +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* GAUSS JORDAN INVERSION */ - -int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag); - -int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag) { - - int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, - *pPRT_pDst; /* Temporary input and output data matrix pointer */ - - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i, j, loopCnt, k, l; /* loop counters */ - uint32_t m = - n; /* M is the number of rows. However, the matirces must be square. */ - - /* CREATE THE IDENTITY MATRIX */ - - pDstT1 = pDst; - for (k = core_id * 4; k < m; k += 4 * num_cores) { - for (j = 0; j < m; j++) { - pDstT1[k * m + j] = (uint32_t)(k == j); - pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j); - pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j); - pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j); - } - } - mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - - /* Loop over the number of columns of the input matrix. 
*/ - loopCnt = n; - /* Index modifier to navigate through the columns */ - l = 0U; - - while (loopCnt > 0U) { - - pSrcT1 = pSrc + (l * n); - pDstT1 = pDst + (l * n); - in = *pSrcT1; - - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - pSrcT2 = pSrc + (n * k); - pDstT2 = pDst + (n * k); - /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } - } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; - } - } - mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - pPivotRowIn = pSrc + (l * n); - pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ - pSrcT1 
= pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; - - ///* Loop over columns to the right of pivot */ - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; - // j += num_cores * 4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; - } - } - /* Loop over columns */ - for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); - j++; - } - } - mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - /* Loop over rows */ - for (k = core_id * 4; k < m; k += num_cores * 4) { - i = 0U; - while (i < 4) { - if ((i + k) != l) { - pSrcT1 = pSrc + (i + k) * n; - pSrcT2 = pDst + (i + k) * n; - /* Element of the reference row */ - in = *pSrcT1; - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over columns to the right of pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT1[j + 2] = 
in3 - FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - /* Loop over columns */ - j = 0; - while (j < 4 * (n >> 2U)) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4; - } - while (j < n) { - in1 = pSrcT2[j]; - out1 = pPRT_pDst[j]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - i++; - } - } - mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - - // /* REPLACE ROWS */ - // pSrcT1 = pSrc; - // pSrcT2 = pDst; - // /* Loop over rows */ - // for (k = 0; k < m; k++) { - // if (k != l) { - // pSrcT1 = pSrc + k * n; - // pSrcT2 = pDst + k * n; - // /* Element of the reference row */ - // in = *pSrcT1; - // pPRT_in = pPivotRowIn; - // pPRT_pDst = pPivotRowDst; - // /* Loop over columns to the right of pivot */ - // j = core_id * 4; - // // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n - // - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = pPRT_in[j]; - // out2 = pPRT_in[j + 1]; - // out3 = pPRT_in[j + 2]; - // out4 = pPRT_in[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * num_cores; - // } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // in1 = pSrcT1[j]; - // out1 = pPRT_in[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // } - // /* Loop over columns */ - // j = core_id * 4; - // while (j < 4 * (n >> 2U)) { - // in1 = pSrcT2[j]; - // in2 = pSrcT2[j + 1]; - // in3 = pSrcT2[j + 2]; - // in4 = pSrcT2[j + 3]; - // out1 = pPRT_pDst[j]; - // out2 = pPRT_pDst[j + 1]; - // out3 = pPRT_pDst[j + 2]; - // out4 = pPRT_pDst[j + 3]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * num_cores; - // } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pSrcT2[j]; - // out1 = pPRT_pDst[j]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // } - // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / - // 4)); - // } - // } - // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - - pSrc++; /* Increment the input pointer */ - loopCnt--; /* Decrement the loop counter */ - l++; /* Increment the index modifier */ - } - - // if ((flag != 1U) && (x == 0)) { - // for (i = 0; i < m * n; i++) { - // if (pSrc[i] != 0) - // break; - // } - // if (i == m * n) - // return 1; - // } - return 0; -} diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h b/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h 
deleted file mode 100644 index 5015039ff..000000000 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_folded.h +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* GAUSS JORDAN INVERSION */ - -int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag, uint32_t nPE); -void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); - -void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i, j, k, shift; - for (i = core_id * 4; i < n * n; i += num_cores * 4) { - k = i / n; - j = i % n; - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pDst[shift + j] = pSrc[i]; - pDst[shift + j + 1] = pSrc[i + 1]; - pDst[shift + j + 2] = pSrc[i + 2]; - pDst[shift + j + 3] = pSrc[i + 3]; - } - mempool_log_barrier(2, core_id); -} - -int mempool_GJinv_q32p_folded(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag, uint32_t nPE) { - - int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, - *pPRT_pDst; /* Temporary input and output data matrix pointer */ - - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; - - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id; - uint32_t shift = 0; - uint32_t i, j, k, l; /* loop counters */ - uint32_t m = - n; /* M is the number of rows. However, the matrices must be square. 
*/ - - /* CREATE THE IDENTITY MATRIX */ - pDstT1 = pDst; - for (i = core_id * 4; i < n * m; i += nPE * 4) { - k = i / n; - j = i % n; - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pDstT1[shift + j] = (uint32_t)(k == j); - pDstT1[shift + j + 1] = (uint32_t)(k == (j + 1)); - pDstT1[shift + j + 2] = (uint32_t)(k == (j + 2)); - pDstT1[shift + j + 3] = (uint32_t)(k == (j + 3)); - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* Index modifier to navigate through the columns */ - l = 0U; - while (l < n) { - - shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; - pSrcT1 = pSrc + shift; - pDstT1 = pDst + shift; - in = *pSrcT1; - - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (absolute_core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pSrcT2 = pSrc + shift; - pDstT2 = pDst + shift; - /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j 
+ 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } - } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; - } - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; - pPivotRowIn = pSrc + shift; - pPivotRowDst = pDst + shift; - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; - - /* Loop over columns to the right of pivot */ - core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); - core_id = core_id > nPE ? core_id + nPE : core_id; - for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; - } - if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; - } - } - - /* Loop over columns */ - core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); - core_id = core_id > nPE ? 
core_id + nPE : core_id; - for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - pSrcT2[j] = FIX_DIV(in1, in); - j++; - } - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { - /* Only the columns to the right of the pivot are to be processed */ - if (k != l) { - shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; - pSrcT1 = pSrc + shift; - pSrcT2 = pDst + shift; - /* Element of the reference row */ - in = *pSrcT1; - /* Reference row pointers */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - core_id = core_id - (l >> 2U); - j = core_id * 4; - while (j < 4 * ((n - l) >> 2U)) { - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - core_id = absolute_core_id % (n >> 2U); - /* Loop over the columns */ - j = core_id * 4; - while (j < 4 * (n >> 2U)) { - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - in1 = pSrcT2[j]; - in2 
= pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == (n >> 2U) - 1) { - j = 4 * (n >> 2U); - while (j < n) { - in1 = pSrcT2[j]; - out1 = pPRT_pDst[j]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - } - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - pSrc++; /* Increment the input pointer */ - l++; /* Increment the index modifier */ - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - return 0; -} diff --git a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h b/software/runtime/kernel/mempool_mat_inv_q32p.h similarity index 54% rename from software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h rename to software/runtime/kernel/mempool_mat_inv_q32p.h index 3a5bfe5c0..42b26eb21 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32p_memsized.h +++ b/software/runtime/kernel/mempool_mat_inv_q32p.h @@ -8,10 +8,356 @@ uint32_t volatile pivot_barrier __attribute__((section(".l1"))); -int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag); +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); -int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t i, j, k, shift; + for (i = core_id * 4; i < n * n; i += num_cores * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDst[shift + j] = pSrc[i]; + pDst[shift + j + 1] = pSrc[i + 1]; + pDst[shift + j + 2] = pSrc[i + 2]; + pDst[shift + j + 3] = pSrc[i + 3]; + } + mempool_log_barrier(2, core_id); +} + +int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag) { + + int32_t 
*pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matirces must be square. */ + + /* CREATE THE IDENTITY MATRIX */ + + pDstT1 = pDst; + for (k = core_id * 4; k < m; k += 4 * num_cores) { + for (j = 0; j < m; j++) { + pDstT1[k * m + j] = (int32_t)(k == j); + pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j); + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* Loop over the number of columns of the input matrix. 
*/ + loopCnt = n; + /* Index modifier to navigate through the columns */ + l = 0U; + + while (loopCnt > 0U) { + + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 
= pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + ///* Loop over columns to the right of pivot */ + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + // j += num_cores * 4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + /* Loop over rows */ + for (k = core_id * 4; k < m; k += num_cores * 4) { + i = 0U; + while (i < 4) { + if ((i + k) != l) { + pSrcT1 = pSrc + (i + k) * n; + pSrcT2 = pDst + (i + k) * n; + /* Element of the reference row */ + in = *pSrcT1; + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over columns to the right of pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = 
in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + /* Loop over columns */ + j = 0; + while (j < 4 * (n >> 2U)) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + i++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + // /* REPLACE ROWS */ + // pSrcT1 = pSrc; + // pSrcT2 = pDst; + // /* Loop over rows */ + // for (k = 0; k < m; k++) { + // if (k != l) { + // pSrcT1 = pSrc + k * n; + // pSrcT2 = pDst + k * n; + // /* Element of the reference row */ + // in = *pSrcT1; + // pPRT_in = pPivotRowIn; + // pPRT_pDst = pPivotRowDst; + // /* Loop over columns to the right of pivot */ + // j = core_id * 4; + // // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n + // - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) { + // in1 = pSrcT1[j]; + // in2 = pSrcT1[j + 1]; + // in3 = pSrcT1[j + 2]; + // in4 = pSrcT1[j + 3]; + // out1 = pPRT_in[j]; + // out2 = pPRT_in[j + 1]; + // out3 = pPRT_in[j + 2]; + // out4 = pPRT_in[j + 3]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4 * num_cores; + // } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * ((n - l) >> 2U); + // while (j < n - l) { + // in1 = pSrcT1[j]; + // out1 = pPRT_in[j]; + // pSrcT1[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // /* Loop over columns */ + // j = core_id * 4; + // while (j < 4 * (n >> 2U)) { + // in1 = pSrcT2[j]; + // in2 = pSrcT2[j + 1]; + // in3 = pSrcT2[j + 2]; + // in4 = pSrcT2[j + 3]; + // out1 = pPRT_pDst[j]; + // out2 = pPRT_pDst[j + 1]; + // out3 = pPRT_pDst[j + 2]; + // out4 = pPRT_pDst[j + 3]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + // j += 4 * num_cores; + // } + // if (core_id == (n >> 2U) - 1) { + // j = 4 * (n >> 2U); + // while (j < n) { + // in1 = pSrcT2[j]; + // out1 = pPRT_pDst[j]; + // pSrcT2[j] = in1 - FIX_MUL(in, out1); + // j++; + // } + // } + // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / + // 4)); + // } + // } + // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ + } + + // if ((flag != 1U) && (x == 0)) { + // for (i = 0; i < m * n; i++) { + // if (pSrc[i] != 0) + // break; + // } + // if (i == m * n) + // return 1; + // } + return 0; +} + +int mempool_GJinv_memsized_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, uint32_t *flag) { int32_t volatile *pSrcT1, 
*pSrcT2; /* Temporary input data matrix pointer */ @@ -28,7 +374,7 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, uint32_t absolute_core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = absolute_core_id; - uint32_t i, j, k, l; /* loop counters */ + uint32_t j, k, l; /* loop counters */ uint32_t m = n; /* M is the number of rows. However, the matirces must be square. */ @@ -37,10 +383,10 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, pDstT1 = pDst; for (k = core_id * 4; k < m; k += num_cores * 4) { for (j = 0; j < n; j++) { - pDstT1[k * n + j] = (uint32_t)(k == j); - pDstT1[(k + 1) * n + j] = (uint32_t)((k + 1) == j); - pDstT1[(k + 2) * n + j] = (uint32_t)((k + 2) == j); - pDstT1[(k + 3) * n + j] = (uint32_t)((k + 3) == j); + pDstT1[k * n + j] = (int32_t)(k == j); + pDstT1[(k + 1) * n + j] = (int32_t)((k + 1) == j); + pDstT1[(k + 2) * n + j] = (int32_t)((k + 2) == j); + pDstT1[(k + 3) * n + j] = (int32_t)((k + 3) == j); } } // pDstT1 = pDst; @@ -627,3 +973,267 @@ int mempool_GJinv_q32p_memsized(int32_t *pSrc, int32_t *pDst, uint32_t n, return 0; } + +int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag, uint32_t nPE) { + + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t shift = 0; + uint32_t i, j, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matrices must be square. 
*/ + + /* CREATE THE IDENTITY MATRIX */ + pDstT1 = pDst; + for (i = core_id * 4; i < n * m; i += nPE * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDstT1[shift + j] = (int32_t)(k == j); + pDstT1[shift + j + 1] = (int32_t)(k == (j + 1)); + pDstT1[shift + j + 2] = (int32_t)(k == (j + 2)); + pDstT1[shift + j + 3] = (int32_t)(k == (j + 3)); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { + + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pDstT1 = pDst + shift; + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT2 = pSrc + shift; + pDstT2 = pDst + shift; + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] 
= Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pPivotRowIn = pSrc + shift; + pPivotRowDst = pDst + shift; + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + + /* Loop over columns */ + core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? 
core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { + /* Only the columns to the right of the pivot are to be processed */ + if (k != l) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pSrcT2 = pDst + shift; + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + core_id = absolute_core_id % (n >> 2U); + /* Loop over the columns */ + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 
= pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + return 0; +} diff --git a/software/apps/mat_inv/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h similarity index 87% rename from software/apps/mat_inv/mempool_mat_inv_q32s.h rename to software/runtime/kernel/mempool_mat_inv_q32s.h index a20b918e0..0d4c77c7a 100644 --- a/software/apps/mat_inv/mempool_mat_inv_q32s.h +++ b/software/runtime/kernel/mempool_mat_inv_q32s.h @@ -8,6 +8,21 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n); +/* GAUSS JORDAN ALGORITHM + - Form the augmented matrix by the identity matrix + - LOOP OVER ROWS ... + - Check if the element on the diagonal of the input matrix is zero + > The element is zero, check if there is a nonzero element in one of the + rows below on the same column > Exchange the row with the row containing a + nonzero element on the same column > If there is no such element then the + matrix is singular and the algorithm fails + + - Divide the current row by the element on the diagonal + - Replace all the rows below with the sum of that row and a multiple of the + current row (row i), so that each new element in column i, below row i is + zero. 
+*/ + int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ @@ -30,10 +45,10 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { /* CREATE THE IDENTITY MATRIX */ for (k = 0; k < m; k += 4) { for (j = 0; j < n; j++) { - pDstT1[k * m + j] = (uint32_t)(k == j); - pDstT1[(k + 1) * m + j] = (uint32_t)((k + 1) == j); - pDstT1[(k + 2) * m + j] = (uint32_t)((k + 2) == j); - pDstT1[(k + 3) * m + j] = (uint32_t)((k + 3) == j); + pDstT1[k * m + j] = (int32_t)(k == j); + pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j); } } @@ -133,7 +148,7 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { /* Pivot element of the row */ in = *pPivotRowIn; - /* Loop over number of columns to the right of the pilot element */ + /* Loop over columns to the right of the pivot element */ j = 0; while (j < 4 * ((n - l) >> 2U)) { in1 = *pSrcT1; @@ -155,6 +170,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { *pSrcT1++ = FIX_DIV(in1, in); j++; } + + /* Alternative = remainder of loop unrolling using switch-case */ // switch ((n - l) % 4) { // case 3: // in1 = *pSrcT1; @@ -181,7 +198,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { // *pSrcT1++ = out1; // break; //} - /* Loop over number of columns of the destination matrix */ + + /* Loop over columns of the destination matrix */ j = 0; while (j < 4 * (n >> 2U)) { in1 = *pSrcT2; @@ -243,6 +261,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { *pSrcT1++ = in1 - FIX_MUL(in, out1); j++; } + + /* Alternative = remainder of loop unrolling using switch-case */ // switch ((n - l) % 4) { // case 3: // in1 = *pSrcT1; @@ -269,7 +289,8 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { // *pSrcT1++ = in1 - FIX_MUL(in, out1); // break; //} - /* Loop over the 
number of columns to + + /* Loop over the columns to replace the elements in the destination matrix */ j = 0; while (j < 4 * (n >> 2U)) { From cc31b71293d323975a4c5d740c506a965b6ebb2d Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Fri, 26 May 2023 13:44:25 +0200 Subject: [PATCH 21/22] [software] Clean up --- software/apps/mat_inv/main.c | 107 +-- .../runtime/kernel/mempool_mat_inv_q32p.h | 699 +----------------- .../runtime/kernel/mempool_mat_inv_q32s.h | 56 -- 3 files changed, 30 insertions(+), 832 deletions(-) diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c index c89edb079..7ada71ebb 100644 --- a/software/apps/mat_inv/main.c +++ b/software/apps/mat_inv/main.c @@ -17,7 +17,6 @@ #define VERBOSE #define SINGLE // #define PARALLEL -// #define MEMSIZED // #define FOLDED #include "initialization.h" @@ -37,102 +36,52 @@ int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); uint32_t flag __attribute__((section(".l1"))); #endif -// Driver program -void single_core() { +int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); // Initialize barrier and synchronize mempool_barrier_init(core_id); +/* initialize the data */ +#if defined(SINGLE) || defined(PARALLEL) init_matrix(matrix, N, M, -156, 427, -219, core_id); init_matrix_zeros(inv, M, M, core_id); - mempool_barrier(num_cores); - if (core_id == 0) { - mempool_start_benchmark(); - mempool_GJinv_q32s(matrix, inv, M); - mempool_stop_benchmark(); + flag = 0U; } mempool_barrier(num_cores); -#ifdef VERBOSE - if (core_id == 0) - display(inv, N, M); -#endif - mempool_barrier(num_cores); -} - -void multi_core() { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); +#elif defined(FOLDED) + uint32_t nPE = N_USED_BANKS >> 2U; init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, M, M, core_id); + 
init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); if (core_id == 0) { flag = 0U; } mempool_barrier(num_cores); - if (core_id < MIN(NUM_CORES, N / 4)) { - mempool_start_benchmark(); - mempool_GJinv_q32p(matrix, inv, M, &flag); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -#ifdef VERBOSE - if (core_id == 0) - display(inv, M, N); #endif - mempool_barrier(num_cores); -} - -void multi_core_memsized() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(inv, N, M, core_id); +/* Execute the kernel */ +#if defined(SINGLE) if (core_id == 0) { - flag = 0U; + mempool_start_benchmark(); + mempool_GJinv_q32s(matrix, inv, M); + mempool_stop_benchmark(); } mempool_barrier(num_cores); - mempool_start_benchmark(); - mempool_GJinv_memsized_q32p(matrix, inv, M, &flag); - mempool_stop_benchmark(); - - mempool_barrier(num_cores); -#ifdef VERBOSE - if (core_id == 0) - display(inv, M, N); -#endif - mempool_barrier(num_cores); -} - -#ifdef FOLDED -void multi_core_folded() { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t nPE = N_USED_BANKS >> 2U; - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - init_matrix(matrix, N, M, -156, 427, -219, core_id); - init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); - init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); - if (core_id == 0) { - flag = 0U; - __atomic_store_n(&pivot_barrier, 0U, __ATOMIC_RELAXED); +#elif defined(PARALLEL) + if (core_id < MIN(NUM_CORES, N / 4)) { + mempool_start_benchmark(); + mempool_GJinv_q32p(matrix, inv, M, &flag); + mempool_stop_benchmark(); } mempool_barrier(num_cores); +#elif defined(FOLDED) 
mempool_start_benchmark(); fold_matrix(matrix, folded_matrix, N); mempool_stop_benchmark(); @@ -142,23 +91,15 @@ void multi_core_folded() { mempool_stop_benchmark(); } mempool_barrier(num_cores); + +#endif + +/* Display the result of computation */ #ifdef VERBOSE if (core_id == 0) - display_folded(inv, M, N); -#endif + display(inv, M, N); mempool_barrier(num_cores); -} #endif -int main() { -#if defined(SINGLE) - single_core(); -#elif defined(PARALLEL) - multi_core(); -#elif defined(MEMSIZED) - multi_core_memsized(); -#elif defined(FOLDED) - multi_core_folded(); -#endif return 0; } diff --git a/software/runtime/kernel/mempool_mat_inv_q32p.h b/software/runtime/kernel/mempool_mat_inv_q32p.h index 42b26eb21..a937ae33e 100644 --- a/software/runtime/kernel/mempool_mat_inv_q32p.h +++ b/software/runtime/kernel/mempool_mat_inv_q32p.h @@ -6,8 +6,6 @@ /* GAUSS JORDAN INVERSION */ -uint32_t volatile pivot_barrier __attribute__((section(".l1"))); - void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { @@ -270,77 +268,6 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, } mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - // /* REPLACE ROWS */ - // pSrcT1 = pSrc; - // pSrcT2 = pDst; - // /* Loop over rows */ - // for (k = 0; k < m; k++) { - // if (k != l) { - // pSrcT1 = pSrc + k * n; - // pSrcT2 = pDst + k * n; - // /* Element of the reference row */ - // in = *pSrcT1; - // pPRT_in = pPivotRowIn; - // pPRT_pDst = pPivotRowDst; - // /* Loop over columns to the right of pivot */ - // j = core_id * 4; - // // j = core_id * 4 > 4 * (l >> 2U) ? 
core_id * 4 : 4 * ((n - // - l) >> 2U); while (j < 4 * ((n - l) >> 2U)) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = pPRT_in[j]; - // out2 = pPRT_in[j + 1]; - // out3 = pPRT_in[j + 2]; - // out4 = pPRT_in[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * num_cores; - // } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // in1 = pSrcT1[j]; - // out1 = pPRT_in[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // } - // /* Loop over columns */ - // j = core_id * 4; - // while (j < 4 * (n >> 2U)) { - // in1 = pSrcT2[j]; - // in2 = pSrcT2[j + 1]; - // in3 = pSrcT2[j + 2]; - // in4 = pSrcT2[j + 3]; - // out1 = pPRT_pDst[j]; - // out2 = pPRT_pDst[j + 1]; - // out3 = pPRT_pDst[j + 2]; - // out4 = pPRT_pDst[j + 3]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * num_cores; - // } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pSrcT2[j]; - // out1 = pPRT_pDst[j]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // } - // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / - // 4)); - // } - // } - // mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); - pSrc++; /* Increment the input pointer */ loopCnt--; /* Decrement the loop counter */ l++; /* Increment the index modifier */ @@ -357,623 +284,7 @@ int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, return 0; } -int mempool_GJinv_memsized_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, - uint32_t *flag) { - - int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ - int32_t volatile *pDstT1, 
*pDstT2; /* Temporary output data matrix pointer */ - int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ - int32_t *pPRT_in, *pPivotRowDst, - *pPRT_pDst; /* Temporary input and output data matrix pointer */ - - int32_t in = 0; - int32_t Xchg1, Xchg2, Xchg3, Xchg4; - int32_t in1, in2, in3, in4; - int32_t out1, out2, out3, out4; - - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t core_id = absolute_core_id; - uint32_t j, k, l; /* loop counters */ - uint32_t m = - n; /* M is the number of rows. However, the matirces must be square. */ - - /* CREATE THE IDENTITY MATRIX */ - - pDstT1 = pDst; - for (k = core_id * 4; k < m; k += num_cores * 4) { - for (j = 0; j < n; j++) { - pDstT1[k * n + j] = (int32_t)(k == j); - pDstT1[(k + 1) * n + j] = (int32_t)((k + 1) == j); - pDstT1[(k + 2) * n + j] = (int32_t)((k + 2) == j); - pDstT1[(k + 3) * n + j] = (int32_t)((k + 3) == j); - } - } - // pDstT1 = pDst; - // for (i = absolute_core_id * 4; i < n * m; i += num_cores * 4) { - // k = i / n; - // j = i % n; - // pDstT1[k * n + j] = (uint32_t) (k == j); - // pDstT1[k * n + j + 1] = (uint32_t) (k == (j + 1)); - // pDstT1[k * n + j + 2] = (uint32_t) (k == (j + 2)); - // pDstT1[k * n + j + 3] = (uint32_t) (k == (j + 3)); - // } - // mempool_log_barrier(2, absolute_core_id); - - /* Index modifier to navigate through the columns */ - l = 0U; - while (l < n) { - - pSrcT1 = pSrc + (l * n); - pDstT1 = pDst + (l * n); - in = *pSrcT1; - - /* CHECK IF PIVOT ELEMENT IS ZERO */ - if (absolute_core_id == 0) { - if (in == 0U) { - /* Loop over the rows present below */ - for (k = l + 1U; k < m; k++) { - pSrcT2 = pSrc + (n * k); - pDstT2 = pDst + (n * k); - /* EXCHANGE */ - if (*pSrcT2 != 0) { - /* Loop over colums to the right of the pivot */ - j = 0; - while (j < 4 * ((n - l) >> 2U)) { - Xchg1 = pSrcT2[j]; - Xchg2 = pSrcT2[j + 1]; - Xchg3 = pSrcT2[j + 2]; - Xchg4 = pSrcT2[j + 3]; - out1 = pSrcT1[j]; - out2 = 
pSrcT1[j + 1]; - out3 = pSrcT1[j + 2]; - out4 = pSrcT1[j + 3]; - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - pSrcT1[j] = Xchg1; - pSrcT1[j + 1] = Xchg2; - pSrcT1[j + 2] = Xchg3; - pSrcT1[j + 3] = Xchg4; - j += 4; - } - while (j < n - l) { - Xchg1 = pSrcT2[j]; - pSrcT2[j] = pSrcT1[j]; - pSrcT1[j] = Xchg1; - j++; - } - /* Loop over colums */ - j = 0; - while (j < 4 * (n >> 2U)) { - Xchg1 = pDstT2[j]; - Xchg2 = pDstT2[j + 1]; - Xchg3 = pDstT2[j + 2]; - Xchg4 = pDstT2[j + 3]; - out1 = pDstT1[j]; - out2 = pDstT1[j + 1]; - out3 = pDstT1[j + 2]; - out4 = pDstT1[j + 3]; - pDstT2[j] = out1; - pDstT2[j + 1] = out2; - pDstT2[j + 2] = out3; - pDstT2[j + 3] = out4; - pDstT1[j] = Xchg1; - pDstT1[j + 1] = Xchg2; - pDstT1[j + 2] = Xchg3; - pDstT1[j + 3] = Xchg4; - j += 4; - } - while (j < n) { - Xchg1 = pDstT2[j]; - pDstT2[j] = pDstT1[j]; - pDstT1[j] = Xchg1; - j++; - } - *flag = 1U; - break; - } - } - } - /* Update the status if the matrix is singular */ - if ((*flag == 0U) && (in == 0U)) { - return 1; - } - } - mempool_log_barrier(2, absolute_core_id); - - /* DIVIDE BY THE PIVOT */ - /* Points to the pivot row of input and destination matrices */ - pPivotRowIn = pSrc + (l * n); - pPivotRowDst = pDst + (l * n); - /* Temporary pointers to the pivot row pointers */ - pSrcT1 = pPivotRowIn; - pSrcT2 = pPivotRowDst; - /* Pivot element of the row */ - in = *pPivotRowIn; - /* Loop over columns to the right of pivot */ - core_id = absolute_core_id - (((l * n + l) % N_BANKS) >> 2U); - core_id = core_id > num_cores ? 
core_id + num_cores : core_id; - // for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // out4 = FIX_DIV(in4, in); - // pSrcT1[j] = out1; - // pSrcT1[j + 1] = out2; - // pSrcT1[j + 2] = out3; - // pSrcT1[j + 3] = out4; - //} - // if (core_id == 0) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // in1 = pSrcT1[j]; - // pSrcT1[j] = FIX_DIV(in1, in); - // j++; - // } - //} - if (core_id == 0) { - j = 0; - while (j < 4 - l % 4) { - in1 = pSrcT1[j]; - pSrcT1[j] = FIX_DIV(in1, in); - j++; - } - } else { - j = core_id * 4 - l % 4; - if (j < (n - l)) { - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT1[j] = out1; - pSrcT1[j + 1] = out2; - pSrcT1[j + 2] = out3; - pSrcT1[j + 3] = out4; - } - } - /* Loop over columns */ - core_id = absolute_core_id - (((l * n) % N_BANKS) >> 2U); - core_id = core_id > num_cores ? 
core_id + num_cores : core_id; - for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - in4 = pSrcT2[j + 3]; - out1 = FIX_DIV(in1, in); - out2 = FIX_DIV(in2, in); - out3 = FIX_DIV(in3, in); - out4 = FIX_DIV(in4, in); - pSrcT2[j] = out1; - pSrcT2[j + 1] = out2; - pSrcT2[j + 2] = out3; - pSrcT2[j + 3] = out4; - } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pSrcT2[j]; - // pSrcT2[j] = FIX_DIV(in1, in); - // j++; - // } - //} - mempool_log_barrier(2, absolute_core_id); - - /* REPLACE ROWS */ - pSrcT1 = pSrc; - pSrcT2 = pDst; - for (k = absolute_core_id / (n >> 2U); k < m; k += num_cores / (n >> 2U)) { - /* Only the columns to the right of the pivot are to be processed */ - if (k != l) { - pSrcT1 = pSrc + k * n; - pSrcT2 = pDst + k * n; - /* Element of the reference row */ - in = *pSrcT1; - /* Reference row pointers */ - pPRT_in = pPivotRowIn; - pPRT_pDst = pPivotRowDst; - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - core_id = core_id - (l >> 2U); - j = core_id * 4; - while (j < 4 * ((n - l) >> 2U)) { - out1 = pPRT_in[j]; - out2 = pPRT_in[j + 1]; - out3 = pPRT_in[j + 2]; - out4 = pPRT_in[j + 3]; - in1 = pSrcT1[j]; - in2 = pSrcT1[j + 1]; - in3 = pSrcT1[j + 2]; - in4 = pSrcT1[j + 3]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - if (core_id == 0) { - j = 4 * ((n - l) >> 2U); - while (j < n - l) { - in1 = pSrcT1[j]; - out1 = pPRT_in[j]; - pSrcT1[j] = in1 - FIX_MUL(in, out1); - j++; - } - } - /* Loop over the columns */ - core_id = absolute_core_id % (n >> 2U); - j = core_id * 4; - while (j < 4 * (n >> 2U)) { - out1 = pPRT_pDst[j]; - out2 = pPRT_pDst[j + 1]; - out3 = pPRT_pDst[j + 2]; - out4 = pPRT_pDst[j + 3]; - in1 = pSrcT2[j]; - in2 = pSrcT2[j + 1]; - in3 = pSrcT2[j + 2]; - 
in4 = pSrcT2[j + 3]; - pSrcT2[j] = in1 - FIX_MUL(in, out1); - pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - j += 4 * (n >> 2U); - } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pSrcT2[j]; - // out1 = pPRT_pDst[j]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - //} - // uint32_t core_id_in; - // uint32_t core_id_Dst; - // int32_t p1_in, p2_in, p3_in, p4_in; - // int32_t p1_Dst, p2_Dst, p3_Dst, p4_Dst; - // core_id_in = absolute_core_id % (n >> 2U) - (l >> 2U); - // core_id_Dst = absolute_core_id % (n >> 2U); - // j = core_id_in == 0 ? 0 : (core_id_in * 4 - l % 4); - // i = core_id_Dst * 4; - // p1_in = pPRT_in[j]; - // p2_in = pPRT_in[j + 1]; - // p3_in = pPRT_in[j + 2]; - // p4_in = pPRT_in[j + 3]; - // p1_Dst = pPRT_pDst[i]; - // p2_Dst = pPRT_pDst[i + 1]; - // p3_Dst = pPRT_pDst[i + 2]; - // p4_Dst = pPRT_pDst[i + 3]; - // if(core_id_in == 0) { - // switch (4 - l % 4) { - // case (1): - // in1 = pSrcT1[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // break; - // case (2): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // break; - // case (3): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // break; - // case (4): - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); - // break; - // } - //} else { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, p1_in); 
- // pSrcT1[j + 1] = in2 - FIX_MUL(in, p2_in); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, p3_in); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, p4_in); - //} - // in1 = pSrcT2[i]; - // in2 = pSrcT2[i + 1]; - // in3 = pSrcT2[i + 2]; - // in4 = pSrcT2[i + 3]; - // pSrcT2[i] = in1 - FIX_MUL(in, p1_Dst); - // pSrcT2[i + 1] = in2 - FIX_MUL(in, p2_Dst); - // pSrcT2[i + 2] = in3 - FIX_MUL(in, p3_Dst); - // pSrcT2[i + 3] = in4 - FIX_MUL(in, p4_Dst); - } - } - mempool_log_barrier(2, absolute_core_id); - - // /* REPLACE ROWS */ - // pSrcT1 = pSrc; - // pSrcT2 = pDst; - // /* Reference row pointers */ - // pPRT_in = pSrc + (l * n); - // pPRT_pDst = pDst + (l * n); - // int32_t pivot = *pPRT_in; - // uint32_t nPE = (n >> 2U); - // uint32_t check = 0; - // if (absolute_core_id >= m * nPE) - // mempool_wfi(); - // for (k = absolute_core_id / nPE; k < m; k += num_cores / nPE) { - // /* Only the columns to the right of the pivot are to be - // processed */ if (k != l) { - // pSrcT1 = pSrc + k * n; - // pSrcT2 = pDst + k * n; - // /* Element of the reference row */ - // in = *pSrcT1; - // /* Loop over the columns */ - // core_id = absolute_core_id % nPE; - // core_id = core_id - (l >> 2U); - // j = core_id * 4; - // while (j < 4 * ((n - l) >> 2U)) { - // out1 = pPRT_in[j]; - // out2 = pPRT_in[j + 1]; - // out3 = pPRT_in[j + 2]; - // out4 = pPRT_in[j + 3]; - // out1 = FIX_DIV(out1, pivot); - // out2 = FIX_DIV(out2, pivot); - // out3 = FIX_DIV(out3, pivot); - // out4 = FIX_DIV(out4, pivot); - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * (n >> 2U); - // } - // if (core_id == 0) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // out1 = pPRT_in[j]; - // out1 = FIX_DIV(out1, pivot); - // in1 = pSrcT1[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); 
- // j++; - // } - // } - // /* Loop over the columns */ - // core_id = absolute_core_id % nPE; - // j = core_id * 4; - // while (j < 4 * (n >> 2U)) { - // out1 = pPRT_pDst[j]; - // out2 = pPRT_pDst[j + 1]; - // out3 = pPRT_pDst[j + 2]; - // out4 = pPRT_pDst[j + 3]; - // out1 = FIX_DIV(out1, pivot); - // out2 = FIX_DIV(out2, pivot); - // out3 = FIX_DIV(out3, pivot); - // out4 = FIX_DIV(out4, pivot); - // in1 = pSrcT2[j]; - // in2 = pSrcT2[j + 1]; - // in3 = pSrcT2[j + 2]; - // in4 = pSrcT2[j + 3]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4 * nPE; - // } - // __atomic_fetch_add(&pivot_barrier, 1, __ATOMIC_RELAXED); - // mempool_wfi(); - // } else { - // do { - // check = __atomic_fetch_add(&pivot_barrier, 0, - // __ATOMIC_RELAXED); mempool_wait(20); - // } while (check < ((m - 1) * nPE)); - // /* Loop over the columns */ - // core_id = absolute_core_id % (n >> 2U); - // core_id = core_id - (l >> 2U); - // j = core_id * 4; - // while (j < 4 * ((n - l) >> 2U)) { - // in1 = pPRT_in[j]; - // in2 = pPRT_in[j + 1]; - // in3 = pPRT_in[j + 2]; - // in4 = pPRT_in[j + 3]; - // out1 = FIX_DIV(in1, pivot); - // out2 = FIX_DIV(in2, pivot); - // out3 = FIX_DIV(in3, pivot); - // out4 = FIX_DIV(in4, pivot); - // pPRT_in[j] = out1; - // pPRT_in[j + 1] = out2; - // pPRT_in[j + 2] = out3; - // pPRT_in[j + 3] = out4; - // j += 4 * (n >> 2U); - // } - // if (core_id == 0) { - // j = 4 * ((n - l) >> 2U); - // while (j < n - l) { - // in1 = pPRT_in[j]; - // pPRT_in[j] = FIX_DIV(in1, pivot); - // j++; - // } - // } - // /* Loop over the columns */ - // core_id = absolute_core_id % (n >> 2U); - // j = core_id * 4; - // while (j < 4 * (n >> 2U)) { - // in1 = pPRT_pDst[j]; - // in2 = pPRT_pDst[j + 1]; - // in3 = pPRT_pDst[j + 2]; - // in4 = pPRT_pDst[j + 3]; - // out1 = FIX_DIV(in1, pivot); - // out2 = FIX_DIV(in2, pivot); - // out3 = 
FIX_DIV(in3, pivot); - // out4 = FIX_DIV(in4, pivot); - // pPRT_pDst[j] = out1; - // pPRT_pDst[j + 1] = out2; - // pPRT_pDst[j + 2] = out3; - // pPRT_pDst[j + 3] = out4; - // j += 4 * (n >> 2U); - // } - // if (core_id == (n >> 2U) - 1) { - // j = 4 * (n >> 2U); - // while (j < n) { - // in1 = pPRT_pDst[j]; - // pPRT_pDst[j] = FIX_DIV(in1, pivot); - // j++; - // } - // } - // if ((m * nPE) - 1 == __atomic_fetch_add(&pivot_barrier, 1, - // __ATOMIC_RELAXED)) { - // __atomic_store_n(&pivot_barrier, 0, __ATOMIC_RELAXED); - // __sync_synchronize(); - // wake_up_all(); - // } - // mempool_wfi(); - // } - // } - - // /* REPLACE ROWS */ - // pSrcT1 = pSrc; - // pSrcT2 = pDst; - // for (i = absolute_core_id * 4; i < (n * m); i += num_cores * 4) { - // k = i / n; - // if (k != l) { - // in = *(pSrc + k * n); - // j = i - (k * n); - // if (j >= 4 * (l >> 2U)) { - // if (j == 4 * (l >> 2U)) { - // pSrcT1 = pSrc + k * n; - // pPRT_in = pPivotRowIn; - // uint32_t bound = j + 4 - l; - // j = 0; - // while (j < bound) { - // in1 = *pSrcT1; - // out1 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // j++; - // } - // } else { - // pSrcT1 = pSrc + (i - l); - // pPRT_in = pPivotRowIn + (j - l); - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // in3 = *(pSrcT1 + 2); - // in4 = *(pSrcT1 + 3); - // out1 = *pPRT_in++; - // out2 = *pPRT_in++; - // out3 = *pPRT_in++; - // out4 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // *pSrcT1++ = in2 - FIX_MUL(in, out2); - // *pSrcT1++ = in3 - FIX_MUL(in, out3); - // *pSrcT1++ = in4 - FIX_MUL(in, out4); - // } - // } - // pSrcT2 = pDst + i; - // pPRT_pDst = pPivotRowDst + j; - // in1 = *pSrcT2; - // in2 = *(pSrcT2 + 1); - // in3 = *(pSrcT2 + 2); - // in4 = *(pSrcT2 + 3); - // out1 = *pPRT_pDst++; - // out2 = *pPRT_pDst++; - // out3 = *pPRT_pDst++; - // out4 = *pPRT_pDst++; - // *pSrcT2++ = in1 - FIX_MUL(in, out1); - // *pSrcT2++ = in2 - FIX_MUL(in, out2); - // *pSrcT2++ = in3 - FIX_MUL(in, out3); - // *pSrcT2++ = in4 - 
FIX_MUL(in, out4); - // } - // } - // mempool_log_barrier(2, absolute_core_id); - // /* REPLACE ROWS */ - // pSrcT1 = pSrc; - // pSrcT2 = pDst; - // core_id = absolute_core_id; - // for (k = core_id; k < m; k += num_cores) { - // /* Only the columns to the right of the pivot are to be - // processed */ if (k != l) { - // pSrcT1 = pSrc + k * n; - // pSrcT2 = pDst + k * n; - // /* Element of the reference row */ - // in = *pSrcT1; - // /* Reference row pointers */ - // pPRT_in = pPivotRowIn; - // pPRT_pDst = pPivotRowDst; - // /* Loop over the columns */ - // j = 0; - // while (j < 4 * ((n - l) >> 2U)) { - // in1 = pSrcT1[j]; - // in2 = pSrcT1[j + 1]; - // in3 = pSrcT1[j + 2]; - // in4 = pSrcT1[j + 3]; - // out1 = pPRT_in[j]; - // out2 = pPRT_in[j + 1]; - // out3 = pPRT_in[j + 2]; - // out4 = pPRT_in[j + 3]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4; - // } - // while (j < n - l) { - // in1 = pSrcT1[j]; - // out1 = pPRT_in[j]; - // pSrcT1[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // /* Loop over the columns */ - // j = 0; - // while (j < 4 * (n >> 2U)) { - // in1 = pSrcT2[j]; - // in2 = pSrcT2[j + 1]; - // in3 = pSrcT2[j + 2]; - // in4 = pSrcT2[j + 3]; - // out1 = pPRT_pDst[j]; - // out2 = pPRT_pDst[j + 1]; - // out3 = pPRT_pDst[j + 2]; - // out4 = pPRT_pDst[j + 3]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); - // pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); - // pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); - // j += 4; - // } - // while (j < n) { - // in1 = pSrcT2[j]; - // out1 = pPRT_pDst[j]; - // pSrcT2[j] = in1 - FIX_MUL(in, out1); - // j++; - // } - // } - // } - // mempool_log_barrier(2, absolute_core_id); - - pSrc++; /* Increment the input pointer */ - l++; /* Increment the index modifier */ - } - mempool_log_barrier(2, absolute_core_id); - - return 0; -} - +/* The input 
matrix is folded in memory, to have only local accesses */ int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, uint32_t *flag, uint32_t nPE) { @@ -991,9 +302,10 @@ int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, uint32_t absolute_core_id = mempool_get_core_id(); uint32_t core_id = absolute_core_id; uint32_t shift = 0; - uint32_t i, j, k, l; /* loop counters */ - uint32_t m = - n; /* M is the number of rows. However, the matrices must be square. */ + /* loop counters */ + uint32_t i, j, k, l; + /* M is the number of rows. However, the matrices must be square. */ + uint32_t m = n; /* CREATE THE IDENTITY MATRIX */ pDstT1 = pDst; @@ -1018,6 +330,7 @@ int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, in = *pSrcT1; /* CHECK IF PIVOT ELEMENT IS ZERO */ + // This is done by a single core if (absolute_core_id == 0) { if (in == 0U) { /* Loop over the rows present below */ diff --git a/software/runtime/kernel/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h index 0d4c77c7a..ce84de24e 100644 --- a/software/runtime/kernel/mempool_mat_inv_q32s.h +++ b/software/runtime/kernel/mempool_mat_inv_q32s.h @@ -171,34 +171,6 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { j++; } - /* Alternative = remainder of loop unrolling using switch-case */ - // switch ((n - l) % 4) { - // case 3: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // in3 = *(pSrcT1 + 2); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // out3 = FIX_DIV(in3, in); - // *pSrcT1++ = out1; - // *pSrcT1++ = out2; - // *pSrcT1++ = out3; - // break; - // case 2: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // out1 = FIX_DIV(in1, in); - // out2 = FIX_DIV(in2, in); - // *pSrcT1++ = out1; - // *pSrcT1++ = out2; - // break; - // case 1: - // in1 = *pSrcT1; - // out1 = FIX_DIV(in1, in); - // *pSrcT1++ = out1; - // break; - //} - /* Loop over columns of the destination matrix */ j = 0; while (j < 4 * (n >> 
2U)) { @@ -262,34 +234,6 @@ int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { j++; } - /* Alternative = remainder of loop unrolling using switch-case */ - // switch ((n - l) % 4) { - // case 3: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // in3 = *(pSrcT1 + 2); - // out1 = *pPRT_in++; - // out2 = *pPRT_in++; - // out3 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // *pSrcT1++ = in2 - FIX_MUL(in, out2); - // *pSrcT1++ = in3 - FIX_MUL(in, out3); - // break; - // case 2: - // in1 = *pSrcT1; - // in2 = *(pSrcT1 + 1); - // out1 = *pPRT_in++; - // out2 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // *pSrcT1++ = in2 - FIX_MUL(in, out2); - // break; - // case 1: - // in1 = *pSrcT1; - // out1 = *pPRT_in++; - // *pSrcT1++ = in1 - FIX_MUL(in, out1); - // break; - //} - /* Loop over the columns to replace the elements in the destination matrix */ j = 0; From c04dea31421a5c210f037540901c26d0000714d6 Mon Sep 17 00:00:00 2001 From: Marco Bertuletti Date: Fri, 26 May 2023 15:10:37 +0200 Subject: [PATCH 22/22] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9c9660fd..c78894eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Use custom compiler for VCS specified with `CC` and `CCX` environment variable - Implement operand gating for SIMD and MAC Units in Snitch IPU's DSP Unit - Add Channel Estimation application and kernels +- Add Gauss-Jordan matrix inversion kernel ### Fixed - Fix type issue in `snitch_addr_demux`