diff --git a/CHANGELOG.md b/CHANGELOG.md index f9c9660fd..c78894eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Use custom compiler for VCS specified with `CC` and `CCX` environment variable - Implement operand gating for SIMD and MAC Units in Snitch IPU's DSP Unit - Add Channel Estimation application and kernels +- Add Gauss-Jordan matrix inversion kernel ### Fixed - Fix type issue in `snitch_addr_demux` diff --git a/software/apps/mat_inv/initialization.h b/software/apps/mat_inv/initialization.h new file mode 100644 index 000000000..a37d5f38c --- /dev/null +++ b/software/apps/mat_inv/initialization.h @@ -0,0 +1,95 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#define FIXED_POINT 16 +#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b)) +#define FIX_MUL(a, b) ((int32_t)((a * b) >> FIXED_POINT)) +#define MIN(a, b) (a < b ? a : b) + +dump(l, 1); +dump(loopCnt, 2); +dump(i, 3); + +void display(int32_t *A, int32_t n, int32_t m); + +#ifdef FOLDED +void display_folded(int32_t *A, int32_t n, int32_t m); +#endif + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m); + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, + int32_t n, int32_t m, int32_t o); + +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + int32_t a, int32_t b, int32_t c, uint32_t core_id); + +void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + uint32_t core_id); + +void display(int32_t *A, int32_t n, int32_t m) { + int32_t i; + for (i = 0; i < n * m; i++) { + printf("Output[%d] = %8d\n", i, A[i]); + } +} + +#ifdef FOLDED +void display_folded(int32_t *A, int32_t n, int32_t m) { + int32_t i, j, k, shift; + for (i = 0; i < n * m; i++) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + printf("Output[%d] = %8d\n", i, A[shift + j]); + } +} +#endif + +void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) { + int32_t i, j; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + t_matrix[j * n + i] = matrix[i * m + j]; + } + } +} + +void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, + int32_t n, int32_t m, int32_t o) { + int32_t i, j, k; + for (i = 0; i < n; i++) { + for (j = 0; j < o; j++) { + matrix_product[i * o + j] = 0; + for (k = 0; k < m; k++) { + matrix_product[i * o + j] += + FIX_MUL(matrix_1[i * m + k], matrix_2[k * o + j]); + } + } + } +} + +void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + int32_t a, int32_t b, int32_t c, uint32_t core_id) { + if (core_id == 0) { + for (uint32_t j = 0; j < num_rows; j++) { + for (uint32_t i = 0; i < num_columns; i++) { + matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c; + } + } + } +} + +void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, + uint32_t core_id) { + if (core_id == 0) { + for (uint32_t i = 0; i < num_columns; i++) { + for (uint32_t j = 0; j < num_rows; j++) { + matrix[j * num_columns + i] = 0; + } + } + } +} diff --git a/software/apps/mat_inv/main.c b/software/apps/mat_inv/main.c new file mode 100644 index 000000000..7ada71ebb --- /dev/null +++ b/software/apps/mat_inv/main.c @@ -0,0 +1,105 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define N 16 +#define M 16 +#define N_BANKS (1024) +#define N_USED_BANKS (16) + +#define VERBOSE +#define SINGLE +// #define PARALLEL +// #define FOLDED + +#include "initialization.h" +#include "kernel/mempool_mat_inv_q32p.h" +#include "kernel/mempool_mat_inv_q32s.h" + +#ifdef FOLDED +int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)] + __attribute__((aligned(N_BANKS), section(".l1"))); +int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)] + __attribute__((aligned(N_BANKS), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); +#else +int32_t matrix[N * M] __attribute__((aligned(N), section(".l1"))); +int32_t inv[M * M] __attribute__((aligned(N), section(".l1"))); +uint32_t flag __attribute__((section(".l1"))); +#endif + +int main() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + // Initialize barrier and synchronize + mempool_barrier_init(core_id); + +/* initialize the data */ +#if defined(SINGLE) || defined(PARALLEL) + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(inv, M, M, core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + +#elif defined(FOLDED) + uint32_t nPE = N_USED_BANKS >> 2U; + init_matrix(matrix, N, M, -156, 427, -219, core_id); + init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id); + if (core_id == 0) { + flag = 0U; + } + mempool_barrier(num_cores); + +#endif + +/* Execute the kernel */ +#if defined(SINGLE) + if (core_id == 0) { + mempool_start_benchmark(); + mempool_GJinv_q32s(matrix, inv, M); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); + +#elif defined(PARALLEL) + if (core_id < MIN(NUM_CORES, N / 4)) { + mempool_start_benchmark(); + mempool_GJinv_q32p(matrix, inv, M, &flag); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); + +#elif defined(FOLDED) + mempool_start_benchmark(); + fold_matrix(matrix, folded_matrix, N); + mempool_stop_benchmark(); + if (core_id < nPE) { + mempool_start_benchmark(); + mempool_GJinv_folded_q32p(folded_matrix, inv, M, &flag, nPE); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); + +#endif + +/* Display the result of computation */ +#ifdef VERBOSE + if (core_id == 0) + display(inv, M, N); + mempool_barrier(num_cores); +#endif + + return 0; +} diff --git a/software/runtime/kernel/mempool_mat_inv_q32p.h b/software/runtime/kernel/mempool_mat_inv_q32p.h new file mode 100644 index 000000000..a937ae33e --- /dev/null +++ b/software/runtime/kernel/mempool_mat_inv_q32p.h @@ -0,0 +1,552 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* GAUSS JORDAN INVERSION */ + +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n); + +void fold_matrix(int32_t *pSrc, int32_t *pDst, uint32_t n) { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t i, j, k, shift; + for (i = core_id * 4; i < n * n; i += num_cores * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDst[shift + j] = pSrc[i]; + pDst[shift + j + 1] = pSrc[i + 1]; + pDst[shift + j + 2] = pSrc[i + 2]; + pDst[shift + j + 3] = pSrc[i + 3]; + } + mempool_log_barrier(2, core_id); +} + +int mempool_GJinv_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag) { + + int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t i, j, loopCnt, k, l; /* loop counters */ + uint32_t m = + n; /* M is the number of rows. However, the matirces must be square. */ + + /* CREATE THE IDENTITY MATRIX */ + + pDstT1 = pDst; + for (k = core_id * 4; k < m; k += 4 * num_cores) { + for (j = 0; j < m; j++) { + pDstT1[k * m + j] = (int32_t)(k == j); + pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j); + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* Loop over the number of columns of the input matrix. */ + loopCnt = n; + /* Index modifier to navigate through the columns */ + l = 0U; + + while (loopCnt > 0U) { + + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + if (core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + pSrcT2 = pSrc + (n * k); + pDstT2 = pDst + (n * k); + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + ///* Loop over columns to the right of pivot */ + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += num_cores * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + // j += num_cores * 4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + /* Loop over columns */ + for (j = core_id * 4; j < 4 * (n >> 2U); j += num_cores * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + /* Loop over rows */ + for (k = core_id * 4; k < m; k += num_cores * 4) { + i = 0U; + while (i < 4) { + if ((i + k) != l) { + pSrcT1 = pSrc + (i + k) * n; + pSrcT2 = pDst + (i + k) * n; + /* Element of the reference row */ + in = *pSrcT1; + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over columns to the right of pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + /* Loop over columns */ + j = 0; + while (j < 4 * (n >> 2U)) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + i++; + } + } + mempool_log_partial_barrier(2, core_id, MIN(num_cores, n / 4)); + + pSrc++; /* Increment the input pointer */ + loopCnt--; /* Decrement the loop counter */ + l++; /* Increment the index modifier */ + } + + // if ((flag != 1U) && (x == 0)) { + // for (i = 0; i < m * n; i++) { + // if (pSrc[i] != 0) + // break; + // } + // if (i == m * n) + // return 1; + // } + return 0; +} + +/* The input matrix is folded in memory, to have ony local accesses */ +int mempool_GJinv_folded_q32p(int32_t *pSrc, int32_t *pDst, uint32_t n, + uint32_t *flag, uint32_t nPE) { + + int32_t volatile *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t volatile *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + uint32_t shift = 0; + /* loop counters */ + uint32_t i, j, k, l; + /* M is the number of rows. However, the matrices must be square. */ + uint32_t m = n; + + /* CREATE THE IDENTITY MATRIX */ + pDstT1 = pDst; + for (i = core_id * 4; i < n * m; i += nPE * 4) { + k = i / n; + j = i % n; + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pDstT1[shift + j] = (int32_t)(k == j); + pDstT1[shift + j + 1] = (int32_t)(k == (j + 1)); + pDstT1[shift + j + 2] = (int32_t)(k == (j + 2)); + pDstT1[shift + j + 3] = (int32_t)(k == (j + 3)); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { + + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pDstT1 = pDst + shift; + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + // This is done by a single core + if (absolute_core_id == 0) { + if (in == 0U) { + /* Loop over the rows present below */ + for (k = l + 1U; k < m; k++) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT2 = pSrc + shift; + pDstT2 = pDst + shift; + /* EXCHANGE */ + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + Xchg1 = pSrcT2[j]; + Xchg2 = pSrcT2[j + 1]; + Xchg3 = pSrcT2[j + 2]; + Xchg4 = pSrcT2[j + 3]; + out1 = pSrcT1[j]; + out2 = pSrcT1[j + 1]; + out3 = pSrcT1[j + 2]; + out4 = pSrcT1[j + 3]; + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + pSrcT1[j] = Xchg1; + pSrcT1[j + 1] = Xchg2; + pSrcT1[j + 2] = Xchg3; + pSrcT1[j + 3] = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = pSrcT2[j]; + pSrcT2[j] = pSrcT1[j]; + pSrcT1[j] = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < 4 * (n >> 2U)) { + Xchg1 = pDstT2[j]; + Xchg2 = pDstT2[j + 1]; + Xchg3 = pDstT2[j + 2]; + Xchg4 = pDstT2[j + 3]; + out1 = pDstT1[j]; + out2 = pDstT1[j + 1]; + out3 = pDstT1[j + 2]; + out4 = pDstT1[j + 3]; + pDstT2[j] = out1; + pDstT2[j + 1] = out2; + pDstT2[j + 2] = out3; + pDstT2[j + 3] = out4; + pDstT1[j] = Xchg1; + pDstT1[j + 1] = Xchg2; + pDstT1[j + 2] = Xchg3; + pDstT1[j + 3] = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = pDstT2[j]; + pDstT2[j] = pDstT1[j]; + pDstT1[j] = Xchg1; + j++; + } + *flag = 1U; + break; + } + } + } + /* Update the status if the matrix is singular */ + if ((*flag == 0U) && (in == 0U)) { + return 1; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* DIVIDE BY THE PIVOT */ + /* Points to the pivot row of input and destination matrices */ + shift = N_BANKS * ((l * n) / N_USED_BANKS) + (l * n) % N_USED_BANKS; + pPivotRowIn = pSrc + shift; + pPivotRowDst = pDst + shift; + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + /* Loop over columns to the right of pivot */ + core_id = absolute_core_id - (((l * n + l) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * ((n - l) >> 2U); j += nPE * 4) { + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT1[j] = out1; + pSrcT1[j + 1] = out2; + pSrcT1[j + 2] = out3; + pSrcT1[j + 3] = out4; + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + pSrcT1[j] = FIX_DIV(in1, in); + j++; + } + } + + /* Loop over columns */ + core_id = absolute_core_id - (((l * n) % N_USED_BANKS) >> 2U); + core_id = core_id > nPE ? core_id + nPE : core_id; + for (j = core_id * 4; j < 4 * (n >> 2U); j += nPE * 4) { + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + pSrcT2[j] = out1; + pSrcT2[j + 1] = out2; + pSrcT2[j + 2] = out3; + pSrcT2[j + 3] = out4; + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + pSrcT2[j] = FIX_DIV(in1, in); + j++; + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* REPLACE ROWS */ + pSrcT1 = pSrc; + pSrcT2 = pDst; + for (k = absolute_core_id / (n >> 2U); k < m; k += nPE / (n >> 2U)) { + /* Only the columns to the right of the pivot are to be processed */ + if (k != l) { + shift = N_BANKS * ((k * n) / N_USED_BANKS) + (k * n) % N_USED_BANKS; + pSrcT1 = pSrc + shift; + pSrcT2 = pDst + shift; + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + /* Loop over the columns */ + core_id = absolute_core_id % (n >> 2U); + core_id = core_id - (l >> 2U); + j = core_id * 4; + while (j < 4 * ((n - l) >> 2U)) { + out1 = pPRT_in[j]; + out2 = pPRT_in[j + 1]; + out3 = pPRT_in[j + 2]; + out4 = pPRT_in[j + 3]; + in1 = pSrcT1[j]; + in2 = pSrcT1[j + 1]; + in3 = pSrcT1[j + 2]; + in4 = pSrcT1[j + 3]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + pSrcT1[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT1[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT1[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == 0) { + j = 4 * ((n - l) >> 2U); + while (j < n - l) { + in1 = pSrcT1[j]; + out1 = pPRT_in[j]; + pSrcT1[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + core_id = absolute_core_id % (n >> 2U); + /* Loop over the columns */ + j = core_id * 4; + while (j < 4 * (n >> 2U)) { + out1 = pPRT_pDst[j]; + out2 = pPRT_pDst[j + 1]; + out3 = pPRT_pDst[j + 2]; + out4 = pPRT_pDst[j + 3]; + in1 = pSrcT2[j]; + in2 = pSrcT2[j + 1]; + in3 = pSrcT2[j + 2]; + in4 = pSrcT2[j + 3]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + pSrcT2[j + 1] = in2 - FIX_MUL(in, out2); + pSrcT2[j + 2] = in3 - FIX_MUL(in, out3); + pSrcT2[j + 3] = in4 - FIX_MUL(in, out4); + j += 4 * (n >> 2U); + } + if (core_id == (n >> 2U) - 1) { + j = 4 * (n >> 2U); + while (j < n) { + in1 = pSrcT2[j]; + out1 = pPRT_pDst[j]; + pSrcT2[j] = in1 - FIX_MUL(in, out1); + j++; + } + } + } + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + return 0; +} diff --git a/software/runtime/kernel/mempool_mat_inv_q32s.h b/software/runtime/kernel/mempool_mat_inv_q32s.h new file mode 100644 index 000000000..ce84de24e --- /dev/null +++ b/software/runtime/kernel/mempool_mat_inv_q32s.h @@ -0,0 +1,275 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* GAUSS JORDAN INVERSION */ + +int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n); + +/* GAUSS JORDAN ALGORITHM + - Form the augmented matrix by the identity matrix + - LOOP OVER ROWS ... + - Check if the element on the diagonal of the input matrix is zero + > The element is zero, check if there is a nonzero element in one of the + rows below on the same column > Exchange the row with the row containing a + nonzero element on the same column > If there is no such element then the + matrix is singular and the algorithm fails + + - Divide the current row by the element on the diagonal + - Replace all the rows below with the sum of that row and a multiple of the + current row (row i), so that each new element in column i, below row i is + zero. +*/ + +int mempool_GJinv_q32s(int32_t *pSrc, int32_t *pDst, uint32_t n) { + + int32_t *pSrcT1, *pSrcT2; /* Temporary input data matrix pointer */ + int32_t *pDstT1, *pDstT2; /* Temporary output data matrix pointer */ + int32_t *pPivotRowIn; /* Temporary input and output data matrix pointer */ + int32_t *pPRT_in, *pPivotRowDst, + *pPRT_pDst; /* Temporary input and output data matrix pointer */ + + int32_t in = 0; + int32_t Xchg1, Xchg2, Xchg3, Xchg4; + int32_t in1, in2, in3, in4; + int32_t out1, out2, out3, out4; + + uint32_t m = + n; /* M is the number of rows. However, the matrices must be square. */ + uint32_t i, j, k, l; /* loop counters */ + uint32_t flag = 0U; /* Flag to check if the matrix is singular */ + + pDstT1 = pDst; /* Working pointer for destination matrix */ + /* CREATE THE IDENTITY MATRIX */ + for (k = 0; k < m; k += 4) { + for (j = 0; j < n; j++) { + pDstT1[k * m + j] = (int32_t)(k == j); + pDstT1[(k + 1) * m + j] = (int32_t)((k + 1) == j); + pDstT1[(k + 2) * m + j] = (int32_t)((k + 2) == j); + pDstT1[(k + 3) * m + j] = (int32_t)((k + 3) == j); + } + } + + /* Index modifier to navigate through the columns */ + l = 0U; + while (l < n) { + + pSrcT1 = pSrc + (l * n); + pDstT1 = pDst + (l * n); + k = 1U; + in = *pSrcT1; + + /* CHECK IF PIVOT ELEMENT IS ZERO */ + + if (in == 0) { + /* Loop over the rows present below */ + for (i = (l + 1U); i < m; i++) { + pSrcT2 = pSrc + (n * i); + pDstT2 = pDstT1 + (n * k); + + /* EXCHANGE */ + + if (*pSrcT2 != 0) { + /* Loop over colums to the right of the pivot */ + j = 0; + while (j < (n - l) - (n - l) % 4) { + Xchg1 = *(pSrcT2); + Xchg2 = *(pSrcT2 + 1); + Xchg3 = *(pSrcT2 + 2); + Xchg4 = *(pSrcT2 + 3); + out1 = *(pSrcT1); + out2 = *(pSrcT1 + 1); + out3 = *(pSrcT1 + 2); + out4 = *(pSrcT1 + 3); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + *pSrcT1++ = Xchg1; + *pSrcT1++ = Xchg2; + *pSrcT1++ = Xchg3; + *pSrcT1++ = Xchg4; + j += 4; + } + while (j < n - l) { + Xchg1 = *pSrcT2; + *pSrcT2++ = *pSrcT1; + *pSrcT1++ = Xchg1; + j++; + } + /* Loop over colums */ + j = 0; + while (j < n - n % 4) { + Xchg1 = *(pDstT2); + Xchg2 = *(pDstT2 + 1); + Xchg3 = *(pDstT2 + 2); + Xchg4 = *(pDstT2 + 3); + out1 = *(pDstT1); + out2 = *(pDstT1 + 1); + out3 = *(pDstT1 + 2); + out4 = *(pDstT1 + 3); + *pDstT2++ = out1; + *pDstT2++ = out2; + *pDstT2++ = out3; + *pDstT2++ = out4; + *pDstT1++ = Xchg1; + *pDstT1++ = Xchg2; + *pDstT1++ = Xchg3; + *pDstT1++ = Xchg4; + j += 4; + } + while (j < n) { + Xchg1 = *pDstT2; + *pDstT2++ = *pDstT1; + *pDstT1++ = Xchg1; + j++; + } + flag = 1U; + break; + } + k++; + } + } + /* Return when the matrix is singular */ + if ((flag == 0U) && (in == 0)) { + return 1; + } + + /* DIVIDE BY THE PIVOT */ + + /* Points to the pivot row of input and destination matrices */ + pPivotRowIn = pSrc + (l * n); + pPivotRowDst = pDst + (l * n); + /* Temporary pointers to the pivot row pointers */ + pSrcT1 = pPivotRowIn; + pSrcT2 = pPivotRowDst; + /* Pivot element of the row */ + in = *pPivotRowIn; + + /* Loop over columns to the right of the pilot element */ + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + in1 = *pSrcT1; + in2 = *(pSrcT1 + 1); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT1++ = out1; + *pSrcT1++ = out2; + *pSrcT1++ = out3; + *pSrcT1++ = out4; + j += 4; + } + while (j < n - l) { + in1 = *pSrcT1; + *pSrcT1++ = FIX_DIV(in1, in); + j++; + } + + /* Loop over columns of the destination matrix */ + j = 0; + while (j < 4 * (n >> 2U)) { + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = FIX_DIV(in1, in); + out2 = FIX_DIV(in2, in); + out3 = FIX_DIV(in3, in); + out4 = FIX_DIV(in4, in); + *pSrcT2++ = out1; + *pSrcT2++ = out2; + *pSrcT2++ = out3; + *pSrcT2++ = out4; + j += 4; + } + while (j < n) { + in1 = *pSrcT2; + *pSrcT2++ = FIX_DIV(in1, in); + j++; + } + + /* REPLACE ROWS */ + + pSrcT1 = pSrc; + pSrcT2 = pDst; + i = 0U; /* pivot index */ + k = m; /* row index */ + while (k > 0U) { + /* Only the columns to the right of the pivot are to be processed */ + if (i == l) { + pSrcT1 += n - l; + pSrcT2 += n; + } else { + /* Element of the reference row */ + in = *pSrcT1; + /* Reference row pointers */ + pPRT_in = pPivotRowIn; + pPRT_pDst = pPivotRowDst; + j = 0; + while (j < 4 * ((n - l) >> 2U)) { + in1 = *pSrcT1; + in2 = *(pSrcT1 + 1); + in3 = *(pSrcT1 + 2); + in4 = *(pSrcT1 + 3); + out1 = *pPRT_in++; + out2 = *pPRT_in++; + out3 = *pPRT_in++; + out4 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + *pSrcT1++ = in2 - FIX_MUL(in, out2); + *pSrcT1++ = in3 - FIX_MUL(in, out3); + *pSrcT1++ = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n - l) { + in1 = *pSrcT1; + out1 = *pPRT_in++; + *pSrcT1++ = in1 - FIX_MUL(in, out1); + j++; + } + + /* Loop over the columns to + replace the elements in the destination matrix */ + j = 0; + while (j < 4 * (n >> 2U)) { + in1 = *pSrcT2; + in2 = *(pSrcT2 + 1); + in3 = *(pSrcT2 + 2); + in4 = *(pSrcT2 + 3); + out1 = *pPRT_pDst++; + out2 = *pPRT_pDst++; + out3 = *pPRT_pDst++; + out4 = *pPRT_pDst++; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + *pSrcT2++ = in2 - FIX_MUL(in, out2); + *pSrcT2++ = in3 - FIX_MUL(in, out3); + *pSrcT2++ = in4 - FIX_MUL(in, out4); + j += 4; + } + while (j < n) { + in1 = *pSrcT2; + out1 = *pPRT_pDst; + *pSrcT2++ = in1 - FIX_MUL(in, out1); + j++; + } + } + /* Increment temporary input pointer */ + pSrcT1 = pSrcT1 + l; + /* Decrement loop counter */ + k--; + /* Increment pivot index */ + i++; + } + + pSrc++; /* Increment the input pointer */ + l++; /* Increment the index modifier */ + } + + return 0; +} diff --git a/software/runtime/serial.c b/software/runtime/serial.c index a53ec2e1f..44aa30fe2 100644 --- a/software/runtime/serial.c +++ b/software/runtime/serial.c @@ -4,7 +4,7 @@ #include -extern char fake_uart; +extern volatile char fake_uart; void _putchar(char character) { // send char to console