Skip to content

Commit

Permalink
[software] Fix compilation warning for folded scheduled fft
Browse files (browse the repository at this point in the history)
  • Loading branch information
mbertuletti committed Aug 31, 2023
1 parent ffcd75a commit 04f3bc0
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 66 deletions.
112 changes: 47 additions & 65 deletions software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,26 +28,36 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut,
v2s C3) {
int16_t t0, t1, t2, t3, t4, t5;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2s A, B, C, D, E, F, G, H;

// LOAD INDEXES
#if defined(FOLDED) || defined(SCHEDULED)
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
uint32_t n2_store = n2 >> 2U;
uint32_t i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS;
uint32_t i1_store = i0_store + n2_store;
uint32_t i2_store = i1_store + n2_store;
uint32_t i3_store = i2_store + n2_store;
#else
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
#endif
// STORE INDEXES
#if defined(FOLDED) || defined(SCHEDULED)
uint32_t n2_store = n2 >> 2U;
i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS;
i1_store = i0_store + n2_store;
i2_store = i1_store + n2_store;
i3_store = i2_store + n2_store;
#else
i0_store = i0;
i1_store = i1;
i2_store = i2;
i3_store = i3;
#endif

#ifndef ASM
v2s s1 = {1, 1};
Expand Down Expand Up @@ -100,17 +110,10 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut,
E = __PACK2(t1, t0);
F = __PACK2(t3, t2);
G = __PACK2(t5, t4);
#if defined(FOLDED) || defined(SCHEDULED)
*((v2s *)&pOut[i0_store * 2U]) = A;
*((v2s *)&pOut[i1_store * 2U]) = E;
*((v2s *)&pOut[i2_store * 2U]) = F;
*((v2s *)&pOut[i3_store * 2U]) = G;
#else
*((v2s *)&pOut[i0 * 2U]) = A;
*((v2s *)&pOut[i1 * 2U]) = E;
*((v2s *)&pOut[i2 * 2U]) = F;
*((v2s *)&pOut[i3 * 2U]) = G;
#endif
#else
v2s s1, s2;
/* Read yb (real), xb(imag) input */
Expand Down Expand Up @@ -169,17 +172,10 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut,
: [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
:);
#if defined(FOLDED) || defined(SCHEDULED)
*((v2s *)&pOut[i0_store * 2U]) = A;
*((v2s *)&pOut[i1_store * 2U]) = E;
*((v2s *)&pOut[i2_store * 2U]) = F;
*((v2s *)&pOut[i3_store * 2U]) = G;
#else
*((v2s *)&pOut[i0 * 2U]) = A;
*((v2s *)&pOut[i1 * 2U]) = E;
*((v2s *)&pOut[i2 * 2U]) = F;
*((v2s *)&pOut[i3 * 2U]) = G;
#endif
#endif
}

Expand All @@ -205,21 +201,17 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
v2s C3) {
int16_t t0, t1, t2, t3, t4, t5;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2s A, B, C, D, E, F, G, H;

// LOAD INDEXES
#if defined(FOLDED) || defined(SCHEDULED)
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
* 3fftLen/4] */
i1 = i0 + N_BANKS;
i2 = i1 + N_BANKS;
i3 = i2 + N_BANKS;
uint32_t n2_store = n2 >> 2U;
uint32_t i0_store =
(i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS;
uint32_t i1_store = i0_store + n2_store;
uint32_t i2_store = i1_store + n2_store;
uint32_t i3_store = i2_store + n2_store;
#else
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 +
Expand All @@ -228,6 +220,20 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
i2 = i1 + n2;
i3 = i2 + n2;
#endif
// STORE INDEXES
#if defined(FOLDED) || defined(SCHEDULED)
uint32_t n2_store = n2 >> 2U;
i0_store =
(i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS;
i1_store = i0_store + n2_store;
i2_store = i1_store + n2_store;
i3_store = i2_store + n2_store;
#else
i0_store = i0;
i1_store = i1;
i2_store = i2;
i3_store = i3;
#endif

#ifndef ASM
v2s s1 = {1, 1};
Expand Down Expand Up @@ -283,17 +289,10 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
A = __PACK2(t1, t0);
B = __PACK2(t3, t2);
C = __PACK2(t5, t4);
#if defined(FOLDED) || defined(SCHEDULED)
*((v2s *)&pOut[i0_store * 2U]) = D;
*((v2s *)&pOut[i1_store * 2U]) = A;
*((v2s *)&pOut[i2_store * 2U]) = B;
*((v2s *)&pOut[i3_store * 2U]) = C;
#else
*((v2s *)&pOut[i0 * 2U]) = D;
*((v2s *)&pOut[i1 * 2U]) = A;
*((v2s *)&pOut[i2 * 2U]) = B;
*((v2s *)&pOut[i3 * 2U]) = C;
#endif
#else
v2s s1;
/* Read yb (real), xb(imag) input */
Expand Down Expand Up @@ -348,18 +347,10 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut,
: [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
:);

#if defined(FOLDED) || defined(SCHEDULED)
*((v2s *)&pOut[i0_store * 2U]) = D;
*((v2s *)&pOut[i1_store * 2U]) = A;
*((v2s *)&pOut[i2_store * 2U]) = B;
*((v2s *)&pOut[i3_store * 2U]) = C;
#else
*((v2s *)&pOut[i0 * 2U]) = D;
*((v2s *)&pOut[i1 * 2U]) = A;
*((v2s *)&pOut[i2 * 2U]) = B;
*((v2s *)&pOut[i3 * 2U]) = C;
#endif
#endif
}

Expand All @@ -376,21 +367,17 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
uint32_t i0) {
int16_t t0, t1;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2s A, B, C, D, E, F, G, H;

// LOAD INDEXES
#if defined(FOLDED) || defined(SCHEDULED)
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4],
pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
i1 = i0 + N_BANKS;
i2 = i1 + N_BANKS;
i3 = i2 + N_BANKS;
#ifndef SCHEDULED
uint32_t i0_store = i0 * 4;
uint32_t i1_store = i0_store + 1;
uint32_t i2_store = i1_store + 1;
uint32_t i3_store = i2_store + 1;
#endif
#else
/* index calculation for the input as, */
/* pIn[i0 + 0], pIn[i0 + fftLen/4],
Expand All @@ -399,6 +386,18 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
i2 = i1 + 1U;
i3 = i2 + 1U;
#endif
// STORE INDEXES
#if defined(FOLDED)
i0_store = i0 * 4;
i1_store = i0_store + 1;
i2_store = i1_store + 1;
i3_store = i2_store + 1;
#else
i0_store = i0;
i1_store = i1;
i2_store = i2;
i3_store = i3;
#endif

#ifndef ASM
v2s s1 = {1, 1};
Expand All @@ -424,13 +423,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
t0 = (int16_t)H[0];
t1 = (int16_t)H[1];
F = __SRA2(F, s1);
/* xa' = (xa+xb+xc+xd) */
/* ya' = (ya+yb+yc+yd) */
#if defined(FOLDED)
/* xa' = (xa+xb+xc+xd) */
/* ya' = (ya+yb+yc+yd) */
*((v2s *)&pOut[i0_store * 2U]) = __ADD2(E, G);
#else
*((v2s *)&pOut[i0 * 2U]) = __ADD2(E, G);
#endif
/* A0 = (xb-xd), A1 = (yd-yb) */
A = __PACK2(-t0, t1);
/* B0 = (xd-xb), B1 = (yb-yd) */
Expand All @@ -444,15 +439,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
B = __ADD2(F, B);
#if defined(FOLDED)
*((v2s *)&pOut[i1_store * 2U]) = E;
*((v2s *)&pOut[i2_store * 2U]) = A;
*((v2s *)&pOut[i3_store * 2U]) = B;
#else
*((v2s *)&pOut[i1 * 2U]) = E;
*((v2s *)&pOut[i2 * 2U]) = A;
*((v2s *)&pOut[i3 * 2U]) = B;
#endif
#else
/* Read yb (real), xb(imag) input */
B = *(v2s *)&pIn[i1 * 2U];
Expand Down Expand Up @@ -491,16 +480,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut,
[t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s1] "=&r"(s1)
:
:);
#if defined(FOLDED)
*((v2s *)&pOut[i0_store * 2U]) = H;
*((v2s *)&pOut[i1_store * 2U]) = E;
*((v2s *)&pOut[i2_store * 2U]) = A;
*((v2s *)&pOut[i3_store * 2U]) = B;
#else
*((v2s *)&pOut[i0 * 2U]) = H;
*((v2s *)&pOut[i1 * 2U]) = E;
*((v2s *)&pOut[i2 * 2U]) = A;
*((v2s *)&pOut[i3 * 2U]) = B;
#endif
#endif
}
23 changes: 22 additions & 1 deletion software/runtime/kernel/mempool_radix4_cfft_q16p.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -410,7 +410,8 @@ void mempool_radix4_cfft_q16p_folded(int16_t *pSrc16, int16_t *pDst16,

void mempool_radix4_cfft_q16p_scheduler(
int16_t *pSrc16, int16_t *pDst16, uint32_t fftLen, int16_t *pCoef_src,
int16_t *pCoef_dst, __attribute__((unused)) uint16_t *pBitRevTable,
__attribute__((unused)) int16_t *pCoef_dst,
__attribute__((unused)) uint16_t *pBitRevTable,
__attribute__((unused)) uint16_t bitReverseLen, uint8_t bitReverseFlag,
uint32_t nPE) {

Expand All @@ -421,17 +422,25 @@ void mempool_radix4_cfft_q16p_scheduler(
int16_t t0, t1, t2, t3, t4, t5;
v2s CoSi1, CoSi2, CoSi3;
v2s C1, C2, C3;
#ifdef FOLDED_TWIDDLES
uint32_t n1, n2, n2_store;
uint32_t i0, k, ic, ic_store;
#else
uint32_t n1, n2;
uint32_t i0, k, ic;
uint32_t twidCoefModifier = 1U;
#endif
int16_t *pTmp;

/* FIRST STAGE */
n1 = fftLen;
n2 = n1 >> 2U;
for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
ic = i0;
#ifdef FOLDED_TWIDDLES
ic_store = ic >> 2U;
n2_store = n2 >> 2U;
#endif
LOAD_STORE_TWIDDLEFACT;
SHUFFLE_TWIDDLEFACT;
for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
Expand All @@ -445,19 +454,27 @@ void mempool_radix4_cfft_q16p_scheduler(
pTmp = pSrc16;
pSrc16 = pDst16;
pDst16 = pTmp;
#ifdef FOLDED_TWIDDLES
pTmp = pCoef_src;
pCoef_src = pCoef_dst;
pCoef_dst = pTmp;
#else
twidCoefModifier <<= 2U;
#endif
mempool_log_partial_barrier(2, absolute_core_id, nPE);

/* MIDDLE STAGE */
for (k = fftLen / 4U; k > 4U; k >>= 2U) {
n1 = n2;
n2 >>= 2U;
for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
#ifdef FOLDED_TWIDDLES
ic = i0;
ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
n2_store = n2 >> 2U;
#else
ic = (i0 % n2) * twidCoefModifier;
#endif
LOAD_STORE_TWIDDLEFACT;
SHUFFLE_TWIDDLEFACT;

Expand All @@ -473,9 +490,13 @@ void mempool_radix4_cfft_q16p_scheduler(
pTmp = pSrc16;
pSrc16 = pDst16;
pDst16 = pTmp;
#ifdef FOLDED_TWIDDLES
pTmp = pCoef_src;
pCoef_src = pCoef_dst;
pCoef_dst = pTmp;
#else
twidCoefModifier <<= 2U;
#endif
mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
}

Expand Down

0 comments on commit 04f3bc0

Please sign in to comment.