From b9ff15557414b682d06b813d8509ee66be72df16 Mon Sep 17 00:00:00 2001
From: mbertuletti
Date: Fri, 5 Jul 2024 15:56:59 +0200
Subject: [PATCH] [software] Add shuffle instruction in FFT butterfly

---
 .../apps/baremetal/cfft_radix4_f16/main.c     |  46 ++++++-
 software/data/generate_cfft.py                |   8 +-
 .../mempool_radix4_cfft_butterfly_f16.h       | 120 ++++++++----------
 .../baremetal/mempool_radix4_cfft_f16p.h      |  85 ++++++++++++-
 4 files changed, 179 insertions(+), 80 deletions(-)

diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index c459d062a..30341f46d 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -21,7 +21,6 @@
 #include "data_cfft_radix4_f16.h"
 
 /* CHOOSE ONE */
-//#define SINGLE // Single core FFT.
 //#define PARALLEL // Parallel FFT not "memory-aware".
 //#define FOLDED // Parallel FFT with "memory-aware" load/store.
 #define SCHEDULED // Folded FFTs arranged in rows and cols.
@@ -29,9 +28,9 @@
 // Bitreversal index from table.
 #define BITREVERSETABLE
 // Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
+#define N_FFTs_ROW 1
 // Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+#define N_FFTs_COL 1
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif
@@ -77,6 +76,20 @@ int main() {
 
   /* INITIALIZATION */
 
+#if (defined(SINGLE) || defined(PARALLEL))
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
+    printf("01: END INITIALIZATION\n");
+  }
+  mempool_barrier(num_cores);
+#endif
+
+#if (defined(SCHEDULED) || defined(FOLDED))
+
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
@@ -88,6 +101,8 @@ int main() {
                         BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
+
+#ifdef FOLDED_TWIDDLES
   for (uint32_t j = 0; j < N_FFTs_COL; j++) {
     uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
+#else
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+#endif
+  mempool_barrier(num_cores);
+
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
   }
   mempool_barrier(num_cores);
+#endif
+
+  /* COMPUTATION */
+
+#ifdef PARALLEL
+  mempool_start_benchmark();
+  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
+                           num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  mempool_stop_benchmark();
+  pRes = l1_pSrc;
+#endif
 
 #ifdef FOLDED
   if (core_id < (N_CSAMPLES / 16)) {
                              l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst,
                              (N_CSAMPLES / 16));
     pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
-    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+    mempool_bitrevtable_q16p_xpulpimg((int16_t *)pRes, BITREVINDEXTABLE_LENGTH,
                                       l1_BitRevIndexTable, (N_CSAMPLES / 16));
     mempool_stop_benchmark();
   }
@@ -140,7 +176,7 @@ int main() {
     printf("02: END COMPUTATION\n");
   }
 
-  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
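A note on the MAX_COL guard in main.c: each folded radix-4 FFT column occupies N_CSAMPLES / 4 memory banks, so at most N_BANKS / (N_CSAMPLES / 4) independent columns fit side by side. A minimal compile-time sketch of the same arithmetic (the EX_* values are placeholders for illustration; on MemPool the real ones come from the platform headers):

    /* Hypothetical stand-in values, for illustration only. */
    #define EX_N_BANKS 1024
    #define EX_N_CSAMPLES 256
    /* One folded FFT column uses EX_N_CSAMPLES / 4 banks. */
    #define EX_MAX_COL (EX_N_BANKS / (EX_N_CSAMPLES / 4))
    _Static_assert(EX_MAX_COL >= 1, "at least one folded FFT column must fit");
    /* Here EX_MAX_COL = 16, so N_FFTs_COL values up to 16 would pass the guard. */
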
diff --git a/software/data/generate_cfft.py b/software/data/generate_cfft.py
index fa6759df3..c0506a8d3 100755
--- a/software/data/generate_cfft.py
+++ b/software/data/generate_cfft.py
@@ -58,8 +58,10 @@ def generate_cfft_q16(N):
 
 
 def generate_cfft_f16(N):
-    src = np.random.rand(N).astype(np.float16)
-    src = src + 1.j * np.random.rand(N).astype(np.float16)
+    # src = np.random.rand(N).astype(np.float16)
+    # src = src + 1.j * np.random.rand(N).astype(np.float16)
+    src = np.cos(np.linspace(0, N / 4, num=N)).astype(np.float16)
+    src = src + 1.j * np.sin(np.linspace(0, N / 4, num=N)).astype(np.float16)
     dst = np.fft.fft(src)
     src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten()
     dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten()
@@ -142,7 +144,7 @@ def main():
         "--dimension",
         type=int,
         required=False,
-        default=64,
+        default=256,
         help='Input dimension'
     )
 
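The stimulus switches from uniform random samples to a unit-amplitude complex tone: cos + j*sin stays bounded, which keeps the fp16 butterflies well inside range and is presumably why the verification tolerance in main.c can tighten from 0.5 to 0.05. A hedged C equivalent of the new generator (float stands in for np.float16; the imag-first interleaving mirrors the column_stack above):

    #include <math.h>
    #include <stdint.h>

    /* Same samples as np.cos/np.sin over np.linspace(0, N/4, num=N):
       linspace includes the endpoint, so the step is (N/4) / (N-1). */
    void gen_tone_f16(float *buf, uint32_t N) {
      const float step = ((float)N / 4.0f) / (float)(N - 1);
      for (uint32_t i = 0; i < N; i++) {
        float t = step * (float)i;
        buf[2 * i] = sinf(t);     /* imaginary part first, as in the script */
        buf[2 * i + 1] = cosf(t); /* real part second */
      }
    }
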
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
index edf7ea735..0d68e3d9d 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -27,7 +27,6 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
                                           uint32_t i0, uint32_t n2, v2h CoSi1,
                                           v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
                                           v2h C3) {
-  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -74,6 +73,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (real), yc(imag) input */
   C = *(v2h *)&pIn[i2 * 2U];
+
   asm volatile(
       // G = (xb + xd), (yb + yd)
       "vfadd.h %[G],%[B],%[D];"
       // H = (xb - xd), (yb - yd)
       "vfsub.h %[H],%[B],%[D];"
       // E = (xa + xc), (ya + yc)
       "vfadd.h %[E],%[A],%[C];"
       // F = (xa - xc), (ya - yc)
       "vfsub.h %[F],%[A],%[C];"
 
       // C = (yb - yd), (xd - xb)
-      // D = (yd - yb), (xb - xd)
-      "pv.extract.h %[t0],%[H],0;"   // yb - yd
-      "pv.extract.h %[t1],%[H],1;"   // xb - xd
-      "xor %[t2],%[t0],%[neg_mask];" // yd - yb
-      "xor %[t3],%[t1],%[neg_mask];" // xd - xb
-      "pv.pack %[C],%[t0],%[t3];"
-      "pv.pack %[D],%[t2],%[t1];"
+      "pv.shuffle2.h %[C],%[H],%[mask];"
+      "vfmul.h %[C],%[C],%[neg_mask];"
 
       // xa + xb + xc + xd, ya + yc + yb + yd
       "vfadd.h %[A],%[E],%[G];"
       // xa + xc - xb - xd, ya + yc - yb - yd
       "vfsub.h %[B],%[E],%[G];"
       // xa - xc + yb - yd, ya - yc + xd - xb
-      "vfadd.h %[C],%[F],%[C];"
+      "vfadd.h %[D],%[F],%[C];"
       // xa - xc + yd - yb, ya - yc + xb - xd
-      "vfadd.h %[D],%[F],%[D];"
+      "vfsub.h %[C],%[F],%[C];"
+
+      // s0 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
+      // s1 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
+      "vfdotpex.s.h %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.h %[s1],%[C1],%[D];"
 
-      // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
-      // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h %[s0],%[CoSi2],%[B];"
-      "vfdotpex.s.h %[s1],%[C2],%[B];"
-      // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
-      // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
-      "vfdotpex.s.h %[s2],%[CoSi1],%[D];"
-      "vfdotpex.s.h %[s3],%[C1],%[D];"
-      // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
-      // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
+      // s2 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
+      // s3 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
+      "vfdotpex.s.h %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.h %[s3],%[C2],%[B];"
+
+      // s4 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
+      // s5 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
       "vfdotpex.s.h %[s4],%[CoSi3],%[C];"
       "vfdotpex.s.h %[s5],%[C3],%[C];"
@@ -121,17 +118,17 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
       "vfcpka.h.s %[C], %[s3], %[s2];"
       // xd', yd'
       "vfcpka.h.s %[D], %[s5], %[s4];"
-      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
-        [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
-        [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
+      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
+        [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
         [s5] "=&r"(s5)
       : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
-        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
+        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
+        [neg_mask] "r"(0x3C00BC00)
       :);
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = B;
-  *((v2h *)&pOut[i2_store * 2U]) = C;
+  *((v2h *)&pOut[i1_store * 2U]) = C;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
   *((v2h *)&pOut[i3_store * 2U]) = D;
 }
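The rewrite above replaces two pv.extract.h, two xor and two pv.pack with one pv.shuffle2.h plus one vfmul.h: assuming the shuffle mask 0x00020003 swaps the two halfwords of H and 0x3C00BC00 is the packed fp16 pair {1.0, -1.0}, the combination yields (yb - yd), (xd - xb) in a single register, and the second combination no longer needs its own pack because it is formed as F + C and F - C. A minimal scalar model of the equivalence, under exactly those assumptions:

    #include <assert.h>

    typedef struct { float hi, lo; } v2h_model; /* two fp16 lanes, modeled as floats */

    /* Old sequence: extract both lanes, flip signs with xor, repack. */
    static v2h_model old_path(v2h_model H) {
      v2h_model C = { H.lo, -H.hi };
      return C;
    }

    /* New sequence: pv.shuffle2.h lane swap, then vfmul by {+1.0, -1.0}. */
    static v2h_model new_path(v2h_model H) {
      v2h_model C = { H.lo, H.hi }; /* swap */
      C.hi *= 1.0f;
      C.lo *= -1.0f;
      return C;
    }

    int main(void) {
      v2h_model H = { 0.25f, -2.0f }; /* (xb - xd), (yb - yd) */
      v2h_model a = old_path(H), b = new_path(H);
      assert(a.hi == b.hi && a.lo == b.lo);
      return 0;
    }
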
@@ -155,7 +152,6 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
                                            uint32_t i0, uint32_t n2,
                                            v2h CoSi1, v2h CoSi2, v2h CoSi3,
                                            v2h C1, v2h C2, v2h C3) {
-  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -205,6 +201,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (real), yc(imag) input */
   C = *(v2h *)&pIn[i2 * 2U];
+
   asm volatile(
       // G = (xb + xd), (yb + yd)
       "vfadd.h %[G],%[B],%[D];"
       // H = (xb - xd), (yb - yd)
       "vfsub.h %[H],%[B],%[D];"
       // E = (xa + xc), (ya + yc)
       "vfadd.h %[E],%[A],%[C];"
       // F = (xa - xc), (ya - yc)
       "vfsub.h %[F],%[A],%[C];"
 
       // C = (yb - yd), (xd - xb)
-      // D = (yd - yb), (xb - xd)
-      "pv.extract.h %[t0],%[H],0;"   // yb - yd
-      "pv.extract.h %[t1],%[H],1;"   // xb - xd
-      "xor %[t2],%[t0],%[neg_mask];" // yd - yb
-      "xor %[t3],%[t1],%[neg_mask];" // xd - xb
-      "pv.pack %[C],%[t0],%[t3];"
-      "pv.pack %[D],%[t2],%[t1];"
+      "pv.shuffle2.h %[C],%[H],%[mask];"
+      "vfmul.h %[C],%[C],%[neg_mask];"
 
       // xa + xb + xc + xd, ya + yc + yb + yd
       "vfadd.h %[A],%[E],%[G];"
       // xa + xc - xb - xd, ya + yc - yb - yd
       "vfsub.h %[B],%[E],%[G];"
       // xa - xc + yb - yd, ya - yc + xd - xb
-      "vfadd.h %[C],%[F],%[C];"
+      "vfadd.h %[D],%[F],%[C];"
       // xa - xc + yd - yb, ya - yc + xb - xd
-      "vfadd.h %[D],%[F],%[D];"
+      "vfsub.h %[C],%[F],%[C];"
+
+      // s0 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
+      // s1 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
+      "vfdotpex.s.h %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.h %[s1],%[C1],%[D];"
 
-      // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
-      // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h %[s0],%[CoSi2],%[B];"
-      "vfdotpex.s.h %[s1],%[C2],%[B];"
-      // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
-      // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
-      "vfdotpex.s.h %[s2],%[CoSi1],%[D];"
-      "vfdotpex.s.h %[s3],%[C1],%[D];"
-      // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
-      // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
+      // s2 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
+      // s3 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
+      "vfdotpex.s.h %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.h %[s3],%[C2],%[B];"
+
+      // s4 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
+      // s5 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
       "vfdotpex.s.h %[s4],%[CoSi3],%[C];"
       "vfdotpex.s.h %[s5],%[C3],%[C];"
@@ -252,18 +246,17 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
       "vfcpka.h.s %[C], %[s3], %[s2];"
       // xd', yd'
       "vfcpka.h.s %[D], %[s5], %[s4];"
-      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
-        [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
-        [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
+      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
+        [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
         [s5] "=&r"(s5)
       : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
-        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
+        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
+        [neg_mask] "r"(0x3C00BC00)
       :);
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = B;
-  *((v2h *)&pOut[i2_store * 2U]) = C;
+  *((v2h *)&pOut[i1_store * 2U]) = C;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
   *((v2h *)&pOut[i3_store * 2U]) = D;
 }
@@ -278,7 +271,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
  */
 static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
                                          uint32_t i0) {
-  __fp16 t0, t1;
+  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -300,7 +293,7 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
   i3 = i2 + 1U;
 #endif
   // STORE INDEXES
-#if defined(FOLDED)
+#if defined(FOLDED) || (defined(SCHEDULED) && defined(BITREVERSETABLE))
   i0_store = i0 * 4;
   i1_store = i0_store + 1;
   i2_store = i1_store + 1;
@@ -320,7 +313,6 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (imag), yc(real) input */
   C = *(v2h *)&pIn[i2 * 2U];
-  __fp16 t2, t3;
   asm volatile(
       /* (xb - xd), (yb - yd) */
       "vfsub.h %[H],%[B],%[D];"
       /* (xb + xd), (yb + yd) */
       "vfadd.h %[G],%[B],%[D];"
       /* (xa + xc), (ya + yc) */
       "vfadd.h %[E],%[A],%[C];"
       /* (xa - xc), (ya - yc) */
       "vfsub.h %[F],%[A],%[C];"
 
-      "pv.extract.h %[t0],%[H],0;"   // (yb - yd)
-      "pv.extract.h %[t1],%[H],1;"   // (xb - xd)
-      "xor %[t2],%[t0],%[neg_mask];" // (yd - yb)
-      "xor %[t3],%[t1],%[neg_mask];" // (xd - xb)
       /* (yd - yb), (xb - xd) */
-      "pv.pack %[A],%[t2],%[t1];"
-      /* (yb - yd), (xd - xb) */
-      "pv.pack %[B],%[t0],%[t3];"
+      "pv.shuffle2.h %[A],%[H],%[mask];"
+      "vfmul.h %[A],%[A],%[neg_mask];"
 
       /* (xa + xc + xb + xd), (ya + yc + yb + yd) */
       "vfadd.h %[H],%[E],%[G];"
       /* (xa + xc - xb - xd), (ya + yc - yb - yd) */
       "vfsub.h %[E],%[E],%[G];"
       /* (xa - xc + yd - yb), (ya - yc + xb - xd) */
-      "vfadd.h %[A],%[F],%[A];"
+      "vfadd.h %[B],%[F],%[A];"
       /* (xa - xc + yb - yd), (ya - yc + xd - xb) */
-      "vfadd.h %[B],%[F],%[B];"
+      "vfsub.h %[A],%[F],%[A];"
       : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
         [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
         [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3)
-      : [neg_mask] "r"(0x00008000)
+      : [mask] "r"(0x00020003), [neg_mask] "r"(0x3C00BC00)
       :);
   *((v2h *)&pOut[i0_store * 2U]) = H;
   *((v2h *)&pOut[i1_store * 2U]) = E;
-  *((v2h *)&pOut[i2_store * 2U]) = A;
-  *((v2h *)&pOut[i3_store * 2U]) = B;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
+  *((v2h *)&pOut[i3_store * 2U]) = A;
 }
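With the new lane bookkeeping, each call still computes one textbook radix-4 decimation-in-frequency butterfly; only the register routing changed. A hedged complex-float reference of the mapping the updated comments describe (lane order and the conjugate-twiddle convention are inferred from the comment style "Co*x + Si*y / -Si*x + Co*y", not from the ISA manual):

    #include <complex.h>

    static void radix4_dif_model(const float complex in[4], float complex w1,
                                 float complex w2, float complex w3,
                                 float complex out[4]) {
      float complex a = in[0], b = in[1], c = in[2], d = in[3];
      out[0] = a + b + c + d;                       /* -> i0 */
      out[1] = conjf(w2) * ((a + c) - (b + d));     /* -> i1 = i0 + n2 */
      out[2] = conjf(w1) * ((a - c) - I * (b - d)); /* -> i2 = i0 + 2*n2 */
      out[3] = conjf(w3) * ((a - c) + I * (b - d)); /* -> i3 = i0 + 3*n2 */
    }

In radix4_butterfly_last all three twiddles are 1, which is why that variant takes neither CoSi nor C operands.
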
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
index c82684995..e7bd7edc5 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
@@ -17,9 +17,9 @@
       "pv.extract.h %[t0],%[CoSi1],1;" \
       "pv.extract.h %[t2],%[CoSi2],1;" \
       "pv.extract.h %[t4],%[CoSi3],1;" \
-      "xor %[t0],%[t0],%[neg_mask];" \
-      "xor %[t2],%[t2],%[neg_mask];" \
-      "xor %[t4],%[t4],%[neg_mask];" \
+      "xor %[t1],%[t1],%[neg_mask];" \
+      "xor %[t3],%[t3],%[neg_mask];" \
+      "xor %[t5],%[t5],%[neg_mask];" \
       "pv.pack %[C1],%[t1],%[t0];" \
       "pv.pack %[C2],%[t3],%[t2];" \
       "pv.pack %[C3],%[t5],%[t4];" \
@@ -31,7 +31,6 @@
       :);
 
 #ifdef FOLDED_TWIDDLES
-
 #define LOAD_STORE_TWIDDLEFACT \
   CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \
   CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \
@@ -60,6 +59,82 @@
   CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)];
 #endif
 
+void mempool_radix4_cfft_f16p(__fp16 *pSrc16, uint32_t fftLen,
+                              const __fp16 *pCoef16, uint32_t twidCoefModifier,
+                              uint32_t nPE) {
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % nPE;
+  __fp16 t0, t1, t2, t3, t4, t5;
+  v2h CoSi1, CoSi2, CoSi3;
+  v2h C1, C2, C3;
+  uint32_t n1, n2, ic, i0, j, k;
+  uint32_t step, steps;
+
+  /* START OF FIRST STAGE PROCESSING */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  step = (n2 + nPE - 1) / nPE;
+  for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++) {
+    /* Twiddle coefficients index modifier */
+    ic = i0 * twidCoefModifier;
+    /* co1 & si1 are read from Coefficient pointer */
+    CoSi1 = *(v2h *)&pCoef16[ic * 2U];
+    /* co2 & si2 are read from Coefficient pointer */
+    CoSi2 = *(v2h *)&pCoef16[2U * (ic * 2U)];
+    /* co3 & si3 are read from Coefficient pointer */
+    CoSi3 = *(v2h *)&pCoef16[3U * (ic * 2U)];
+    SHUFFLE_TWIDDLEFACT;
+    radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
+                           C3);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    uint32_t offset, butt_id;
+    n1 = n2;
+    n2 >>= 2U;
+    step = (n2 + nPE - 1) / nPE;
+    butt_id = core_id % n2;
+    offset = (core_id / n2) * n1;
+    for (j = butt_id * step; j < MIN(butt_id * step + step, n2); j++) {
+      /* Twiddle coefficients index modifier */
+      ic = twidCoefModifier * j;
+      /* co1 & si1 are read from Coefficient pointer */
+      CoSi1 = *(v2h *)&pCoef16[ic * 2U];
+      /* co2 & si2 are read from Coefficient pointer */
+      CoSi2 = *(v2h *)&pCoef16[2U * (ic * 2U)];
+      /* co3 & si3 are read from Coefficient pointer */
+      CoSi3 = *(v2h *)&pCoef16[3U * (ic * 2U)];
+      SHUFFLE_TWIDDLEFACT;
+      /* Butterfly implementation */
+      for (i0 = offset + j; i0 < fftLen; i0 += ((nPE + n2 - 1) / n2) * n1) {
+        radix4_butterfly_middle(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                                C2, C3);
+      }
+    }
+    twidCoefModifier <<= 2U;
+    mempool_log_barrier(2, absolute_core_id);
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  steps = fftLen / n1;
+  step = (steps + nPE - 1) / nPE;
+  /* Butterfly implementation */
+  for (i0 = core_id * step * n1;
+       i0 < MIN((core_id * step + step) * n1, fftLen); i0 += n1) {
+    radix4_butterfly_last(pSrc16, pSrc16, i0);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF LAST STAGE PROCESSING */
+  return;
+}
+
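The macro fix at the top of this file moves the sign flip from the cosine lanes (t0/t2/t4) to the sine lanes (t1/t3/t5), so each derived coefficient becomes Ck = {-Sik, Cok}. Paired with CoSik = {Cok, Sik}, the two vfdotpex.s.h results are then exactly the rotated real and imaginary parts. A scalar sketch (the lane order is an assumption for illustration):

    typedef struct { float hi, lo; } v2h_model;

    static float dotp(v2h_model p, v2h_model q) { return p.hi * q.hi + p.lo * q.lo; }

    /* Rotate z = x + j*y by conj(W), W = Co + j*Si, the way the kernel does. */
    static void rotate(v2h_model CoSi, float x, float y, float *re, float *im) {
      v2h_model C = { -CoSi.lo, CoSi.hi }; /* what the fixed macro packs */
      v2h_model z = { x, y };
      *re = dotp(CoSi, z); /* Co*x + Si*y  */
      *im = dotp(C, z);    /* -Si*x + Co*y */
    }
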
 /**
   @brief Full FFT butterfly
   @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are
@@ -97,7 +172,6 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
   n1 = fftLen;
   n2 = n1 >> 2U;
   for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
-
 #ifdef FOLDED_TWIDDLES
     ic = i0;
     ic_store = ic >> 2U;
@@ -162,6 +236,7 @@
   }
   mempool_log_partial_barrier(2, absolute_core_id, nPE);
   /* END OF LAST STAGE PROCESSING */
+  return;
 }
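For reference, the three-phase structure of the new mempool_radix4_cfft_f16p boils down to the serial skeleton below: n2 shrinks by 4x per stage, the twiddle stride (twidCoefModifier) grows by 4x, and the final stage degenerates to twiddle-free butterflies. This is a hedged model in complex float, reusing radix4_dif_model from the earlier sketch; the parallel kernel additionally slices each stage across nPE cores and synchronizes with mempool_log_barrier.

    #include <complex.h>

    /* x: n complex samples, transformed in place; w: twiddle table with
       w[k] = e^{j*2*pi*k/n}, of which indices below 3*n/4 are used
       (matching the 3*(N_CSAMPLES/4) entries copied in main.c). */
    void radix4_fft_model(float complex *x, unsigned n, const float complex *w) {
      unsigned tw = 1; /* twiddle stride, like twidCoefModifier */
      for (unsigned n2 = n / 4; n2 >= 1; n2 /= 4, tw *= 4) {
        for (unsigned group = 0; group < n; group += 4 * n2) {
          for (unsigned j = 0; j < n2; j++) {
            float complex in[4], out[4];
            for (unsigned q = 0; q < 4; q++) in[q] = x[group + j + q * n2];
            radix4_dif_model(in, w[tw * j], w[2 * tw * j], w[3 * tw * j], out);
            for (unsigned q = 0; q < 4; q++) x[group + j + q * n2] = out[q];
          }
        }
      }
      /* Output is left digit-reversed; main.c undoes this with
         mempool_bitrevtable_q16p_xpulpimg. */
    }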