diff --git a/software/apps/cfft_radix4_f16/main.c b/software/apps/cfft_radix4_f16/main.c index be9c4bd41..28fcd7871 100644 --- a/software/apps/cfft_radix4_f16/main.c +++ b/software/apps/cfft_radix4_f16/main.c @@ -41,10 +41,11 @@ #define ASM #if !(defined(N_FFT_ROW) && defined(N_FFTs_COL)) -#define N_FFTs_ROW 2 -#define N_FFTs_COL 2 +#define N_FFTs_ROW 1 +#define N_FFTs_COL 1 #endif +#include "kernel/mempool_checks.h" #include "kernel/mempool_radix4_cfft_butterfly_f16.h" #include "kernel/mempool_radix4_cfft_f16p.h" #include "kernel/mempool_radix4_cfft_q16_bitreversal.h" @@ -73,19 +74,27 @@ int main() { // Each FFT is folded over 4 memory rows // Each memory row is 2 * N_BANKS samples for (uint32_t j = 0; j < N_FFTs_ROW; j++) { - dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc, (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t)); + dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc, + (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t)); } - dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); - dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, 3 * (N_CSAMPLES / 4) * sizeof(int32_t)); + dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, + BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); + dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, + 3 * (N_CSAMPLES / 4) * sizeof(int32_t)); } // Initialize the Twiddles folded #ifdef FOLDED_TWIDDLES for (uint32_t j = 0; j < N_FFTs_COL; j++) { uint32_t N_WORDS_COL = (N_CSAMPLES / 4); for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { - *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4))] = *(v2h *)&l2_twiddleCoef_f16[2U * i]; - *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) + 1 * N_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)]; - *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) + 2 * N_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)]; + *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * 
(N_CSAMPLES / 4))] = + *(v2h *)&l2_twiddleCoef_f16[2U * i]; + *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) + + 1 * N_BANKS)] = + *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)]; + *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) + + 2 * N_BANKS)] = + *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)]; } } #endif @@ -99,16 +108,20 @@ int main() { /////////////////////////////////////////////////////////////////////////////////////////////////// /* MULTI-CORE FOLDED */ #ifdef FOLDED - __fp16 *pRes; + __fp16 *pRes = NULL; if (core_id < (N_CSAMPLES / 16)) { mempool_start_benchmark(); #ifdef FOLDED_TWIDDLES - mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES, l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16)); + mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES, + l1_twiddleCoef_f16_src, + l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16)); #else - mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES, l1_twiddleCoef_f16_src, (N_CSAMPLES / 16)); + mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES, + l1_twiddleCoef_f16_src, (N_CSAMPLES / 16)); #endif pRes = ((LOG2 / 2) % 2) == 0 ? 
l1_pSrc : l1_pDst; - mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable, (N_CSAMPLES / 16)); + mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH, + l1_BitRevIndexTable, (N_CSAMPLES / 16)); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -117,14 +130,17 @@ int main() { /////////////////////////////////////////////////////////////////////////////////////////////////// /* MULTI-CORE SCHEDULED */ #ifdef SCHEDULED - __fp16 *pRes; + __fp16 *pRes = NULL; if (core_id < N_FFTs_COL * (N_CSAMPLES / 16)) { mempool_start_benchmark(); uint32_t col_fftLen = N_CSAMPLES / 4; uint32_t col_id = core_id / (N_CSAMPLES / 16); // Distribute FFTs over columns if (col_id < N_FFTs_COL) { - mempool_radix4_cfft_f16p_scheduler(l1_pSrc, l1_pDst, N_CSAMPLES, l1_pCoef_src + 2 * col_id * col_fftLen, l1_pCoef_dst + 2 * col_id * col_fftLen, l1_pRevT16, BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 1, (N_CSAMPLES / 16)); + mempool_radix4_cfft_f16p_scheduler( + l1_pSrc, l1_pDst, N_CSAMPLES, l1_pCoef_src + 2 * col_id * col_fftLen, + l1_pCoef_dst + 2 * col_id * col_fftLen, l1_pRevT16, + BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 1, (N_CSAMPLES / 16)); } pRes = ((LOG2 / 2) % 2) == 0 ? 
l1_pSrc : l1_pDst; mempool_log_partial_barrier(2, core_id, N_FFTs_COL * (N_CSAMPLES / 16)); @@ -135,24 +151,7 @@ int main() { /////////////////////////////////////////////////////////////////////////////////////////////////// /* CHECK */ - if (core_id == 0) { - printf("Done!\n"); - for (uint32_t i = 0; i < 2 * N_CSAMPLES; i++) { - __fp16 exp = l2_pRes[i]; - __fp16 res = pRes[i]; - __fp16 dif; - float tol = (__fp16)0.05f; - float dif_f32; - asm volatile("fsub.h %[dif], %[res], %[exp];" - "fcvt.h.s %[dif_f32], %[dif];" - : [dif] "+&r"(dif), [dif_f32] "+&r"(dif_f32) - : [res] "r"(res), [exp] "r"(exp) - :); - if ((dif_f32 > tol) || (dif_f32 < (-tol))) { - printf("%d %x %x\n", i, *(int32_t *)&exp, *(int32_t *)&res); - } - } - } + mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5f, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c index ca9cb29c3..551214e9c 100644 --- a/software/apps/cfft_radix4_q16/main.c +++ b/software/apps/cfft_radix4_q16/main.c @@ -35,7 +35,7 @@ - ASM: Use asm_volatile statements */ -#define SCHEDULED +#define FOLDED #define FOLDED_TWIDDLES #define BITREVERSETABLE #define ASM // Use asm_volatile statements @@ -153,7 +153,7 @@ int main() { #endif pRes = ((LOG2 / 2) % 2) == 0 ? 
l1_pSrc : l1_pDst; mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH, - pRevT16, (N_CSAMPLES / 16)); + l1_BitRevIndexTable, (N_CSAMPLES / 16)); mempool_stop_benchmark(); } mempool_barrier(num_cores); diff --git a/software/runtime/data/data_cfft_radix4_f16.h.tpl b/software/runtime/data/data_cfft_radix4_f16.h.tpl index 825780e1a..883049a44 100644 --- a/software/runtime/data/data_cfft_radix4_f16.h.tpl +++ b/software/runtime/data/data_cfft_radix4_f16.h.tpl @@ -7,7 +7,7 @@ i = 0 out += '\n' for a in array: - out += '(__fp16){:0.5}f, '.format(a) + out += '(__fp16){:0.4}f, '.format(a) i += 1 if i % 8 == 0: out += '\n' diff --git a/software/runtime/data/data_cfft_radix4_f16.py b/software/runtime/data/data_cfft_radix4_f16.py index f927a8d5d..ca90265c8 100644 --- a/software/runtime/data/data_cfft_radix4_f16.py +++ b/software/runtime/data/data_cfft_radix4_f16.py @@ -93,16 +93,17 @@ def main(): args = parser.parse_args() Len = args.dimension - src = np.random.rand(Len) + 1.j * np.random.rand(Len) + src = np.random.rand(Len).astype(np.float16) + src = src + 1.j * np.random.rand(Len).astype(np.float16) dst = np.fft.fft(src) - src = np.column_stack((src.real, src.imag)).astype(np.float16).flatten() - dst = np.column_stack((dst.real, dst.imag)).astype(np.float16).flatten() + src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten() + dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten() Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) twi = np.zeros(int(2 * 3 * Len / 4), np.float16) for i in range(0, int(3 * Len / 4)): - twi[2 * i] = np.cos(i * 2 * np.pi / Len).astype(np.float16) - twi[2 * i + 1] = np.sin(i * 2 * np.pi / Len).astype(np.float16) + twi[2 * i] = np.sin(i * 2 * np.pi / Len).astype(np.float16) + twi[2 * i + 1] = np.cos(i * 2 * np.pi / Len).astype(np.float16) kwargs = {'name': 'data_cfft_radix4_f16', 'src': src, diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h 
b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h index fbb6964ac..3e7a28245 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h @@ -29,7 +29,6 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, __fp16 t0, t1, t2, t3; uint32_t i1, i2, i3; uint32_t i0_store, i1_store, i2_store, i3_store; - float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; v2h A, B, C, D, E, F, G, H; // LOAD INDEXES @@ -60,32 +59,40 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, i3_store = i3; #endif - /* Read yb (real), xb(imag) input */ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + /* Read xb (real), yb(imag) input */ B = *(v2h *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2h *)&pIn[i3 * 2U]; - /* Read ya (real), xa (imag) input */ + /* Read xa (real), ya(imag) input */ A = *(v2h *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2h *)&pIn[i2 * 2U]; asm volatile( - // xb - xd, yb - yd - "vfsub.h %[H],%[B],%[D];" - // xb + xd, yd + yd + // G = (xb + xd), (yb + yd) "vfadd.h %[G],%[B],%[D];" - // xa + xc, ya + yc + // H = (xb - xd), (yb - yd) + "vfsub.h %[H],%[B],%[D];" + // E = (xa + xc), (ya + yc) "vfadd.h %[E],%[A],%[C];" - "pv.extract.h %[t0],%[H],0;" // yb - yd - "pv.extract.h %[t1],%[H],1;" // xb - xd - // xa - xc, ya - yc + // F = (xa - xc), (ya - yc) "vfsub.h %[F],%[A],%[C];" + // C = (yb - yd), (xd - xb) + // D = (yd - yb), (xb - xd) + "pv.extract.h %[t0],%[H],0;" // yb - yd + "pv.extract.h %[t1],%[H],1;" // xb - xd "xor %[t2],%[t0],%[neg_mask];" // yd - yb "xor %[t3],%[t1],%[neg_mask];" // xd - xb - "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd - "pv.pack.h %[C],%[t0],%[t3];" // yb - yd, xd - xb + "pv.pack %[C],%[t0],%[t3];" + "pv.pack 
%[D],%[t2],%[t1];" - // xa + xc + xb + xd, ya + yb + yc + yd + // xa + xb + xc + xd, ya + yc + yb + yd "vfadd.h %[A],%[E],%[G];" // xa + xc - xb - xd, ya + yc - yb - yd "vfsub.h %[B],%[E],%[G];" @@ -94,30 +101,28 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, // xa - xc + yd - yb, ya - yc + xb - xd "vfadd.h %[D],%[F],%[D];" - // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd) + // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd) "vfdotpex.s.h %[s0],%[CoSi2],%[B];" - //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) "vfdotpex.s.h %[s1],%[C2],%[B];" - - // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd) + // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd) "vfdotpex.s.h %[s2],%[CoSi1],%[D];" - //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) "vfdotpex.s.h %[s3],%[C1],%[D];" - - // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb) + // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb) "vfdotpex.s.h %[s4],%[CoSi3],%[C];" - //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) "vfdotpex.s.h %[s5],%[C3],%[C];" // xb', yb' - "vfcpka.h.s %[B], %[s0], %[s1];" + "vfcpka.h.s %[B], %[s1], %[s0];" // xc', yc' - "vfcpka.h.s %[C], %[s2], %[s3];" + "vfcpka.h.s %[C], %[s3], %[s2];" // xd', yd' - "vfcpka.h.s %[D], %[s4], %[s5];" + "vfcpka.h.s %[D], %[s5], %[s4];" : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), - [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0), + [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0), [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), [s5] "=&r"(s5) : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), @@ -125,8 +130,8 @@ static 
inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, :); *((v2h *)&pOut[i0_store * 2U]) = A; *((v2h *)&pOut[i1_store * 2U]) = B; - *((v2h *)&pOut[i2_store * 2U]) = D; - *((v2h *)&pOut[i3_store * 2U]) = C; + *((v2h *)&pOut[i2_store * 2U]) = C; + *((v2h *)&pOut[i3_store * 2U]) = D; } /** @@ -152,7 +157,6 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, __fp16 t0, t1, t2, t3; uint32_t i1, i2, i3; uint32_t i0_store, i1_store, i2_store, i3_store; - float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; v2h A, B, C, D, E, F, G, H; // LOAD INDEXES @@ -186,32 +190,40 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, i3_store = i3; #endif - /* Read yb (real), xb(imag) input */ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + /* Read xb (real), yb(imag) input */ B = *(v2h *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2h *)&pIn[i3 * 2U]; - /* Read ya (real), xa (imag) input */ + /* Read xa (real), ya(imag) input */ A = *(v2h *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2h *)&pIn[i2 * 2U]; asm volatile( - // xb - xd, yb - yd - "vfsub.h %[H],%[B],%[D];" - // xb + xd, yd + yd + // G = (xb + xd), (yb + yd) "vfadd.h %[G],%[B],%[D];" - // xa + xc, ya + yc + // H = (xb - xd), (yb - yd) + "vfsub.h %[H],%[B],%[D];" + // E = (xa + xc), (ya + yc) "vfadd.h %[E],%[A],%[C];" - "pv.extract.h %[t0],%[H],1;" // yb - yd - "pv.extract.h %[t1],%[H],0;" // xb - xd - // xa - xc, ya - yc + // F = (xa - xc), (ya - yc) "vfsub.h %[F],%[A],%[C];" + // C = (yb - yd), (xd - xb) + // D = (yd - yb), (xb - xd) + "pv.extract.h %[t0],%[H],0;" // yb - yd + "pv.extract.h %[t1],%[H],1;" // xb - xd "xor %[t2],%[t0],%[neg_mask];" // yd - yb "xor %[t3],%[t1],%[neg_mask];" // xd - xb - "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd - "pv.pack.h %[C],%[t0],%[t3];" 
// yb - yd, xd - xb + "pv.pack %[C],%[t0],%[t3];" + "pv.pack %[D],%[t2],%[t1];" - // xa + xc + xb + xd, ya + yb + yc + yd + // xa + xb + xc + xd, ya + yc + yb + yd "vfadd.h %[A],%[E],%[G];" // xa + xc - xb - xd, ya + yc - yb - yd "vfsub.h %[B],%[E],%[G];" @@ -220,30 +232,28 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, // xa - xc + yd - yb, ya - yc + xb - xd "vfadd.h %[D],%[F],%[D];" - // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd) + // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd) "vfdotpex.s.h %[s0],%[CoSi2],%[B];" - //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) "vfdotpex.s.h %[s1],%[C2],%[B];" - - // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd) + // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd) "vfdotpex.s.h %[s2],%[CoSi1],%[D];" - //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) "vfdotpex.s.h %[s3],%[C1],%[D];" - - // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb) + // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb) "vfdotpex.s.h %[s4],%[CoSi3],%[C];" - //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) "vfdotpex.s.h %[s5],%[C3],%[C];" // xb', yb' - "vfcpka.h.s %[B], %[s0], %[s1];" + "vfcpka.h.s %[B], %[s1], %[s0];" // xc', yc' - "vfcpka.h.s %[C], %[s2], %[s3];" + "vfcpka.h.s %[C], %[s3], %[s2];" // xd', yd' - "vfcpka.h.s %[D], %[s4], %[s5];" + "vfcpka.h.s %[D], %[s5], %[s4];" : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), - [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0), + [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0), [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), [s5] "=&r"(s5) : [C1] "r"(C1), [C2] "r"(C2), 
[C3] "r"(C3), [CoSi1] "r"(CoSi1), @@ -252,8 +262,8 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, *((v2h *)&pOut[i0_store * 2U]) = A; *((v2h *)&pOut[i1_store * 2U]) = B; - *((v2h *)&pOut[i2_store * 2U]) = D; - *((v2h *)&pOut[i3_store * 2U]) = C; + *((v2h *)&pOut[i2_store * 2U]) = C; + *((v2h *)&pOut[i3_store * 2U]) = D; } /** @@ -301,34 +311,48 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, i3_store = i3; #endif - /* Read yb (real), xb(imag) input */ + /* Read xb (real), yb(imag) input */ B = *(v2h *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2h *)&pIn[i3 * 2U]; - /* Read ya (real), xa(imag) input */ + /* Read xa (real), ya(imag) input */ A = *(v2h *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2h *)&pIn[i2 * 2U]; __fp16 t2, t3; - asm volatile("vfsub.h %[H],%[B],%[D];" - "vfadd.h %[G],%[B],%[D];" - "vfadd.h %[E],%[A],%[C];" - "vfsub.h %[F],%[A],%[C];" - "pv.extract.h %[t0],%[H],1;" - "pv.extract.h %[t1],%[H],0;" - "xor %[t2],%[t0],%[neg_mask];" - "xor %[t3],%[t1],%[neg_mask];" - "pv.pack.h %[A],%[t2],%[t1];" - "pv.pack.h %[B],%[t0],%[t3];" - "vfadd.h %[H],%[E],%[G];" - "vfsub.h %[E],%[E],%[G];" - "vfadd.h %[A],%[F],%[A];" - "vfadd.h %[B],%[F],%[B];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), - [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) - : [neg_mask] "r"(0x00008000) - :); + asm volatile( + /* (xb - xd), (yb - yd) */ + "vfsub.h %[H],%[B],%[D];" + /* (xb + xd), (yb + yd) */ + "vfadd.h %[G],%[B],%[D];" + /* (xa + xc), (ya + yc) */ + "vfadd.h %[E],%[A],%[C];" + /* (xa - xc), (ya - yc) */ + "vfsub.h %[F],%[A],%[C];" + + "pv.extract.h %[t0],%[H],0;" // (yb - yd) + "pv.extract.h %[t1],%[H],1;" // (xb - xd) + "xor %[t2],%[t0],%[neg_mask];" // (yd - yb) + "xor %[t3],%[t1],%[neg_mask];" // (xd - xb) + /* (yd - yb), (xb - 
xd) */ + "pv.pack %[A],%[t2],%[t1];" + /* (yb - yd), (xd - xb) */ + "pv.pack %[B],%[t0],%[t3];" + + /* (xa + xc + xb + xd), (ya + yc + yb + yd) */ + "vfadd.h %[H],%[E],%[G];" + /* (xa + xc - xb - xd), (ya + yc - yb - yd) */ + "vfsub.h %[E],%[E],%[G];" + /* (xa - xc + yd - yb), (ya - yc + xb - xd) */ + "vfadd.h %[A],%[F],%[A];" + /* (xa - xc + yb - yd), (ya - yc + xd - xb) */ + "vfadd.h %[B],%[F],%[B];" + + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0), + [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3) + : [neg_mask] "r"(0x00008000) + :); *((v2h *)&pOut[i0_store * 2U]) = H; *((v2h *)&pOut[i1_store * 2U]) = E; diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h index 725908b71..8946ce743 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h @@ -116,13 +116,13 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, *((v2s *)&pOut[i3_store * 2U]) = G; #else v2s s1, s2; - /* Read yb (real), xb(imag) input */ + /* Read xb (real), yb(imag) input */ B = *(v2s *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2s *)&pIn[i3 * 2U]; - /* Read ya (real), xa (imag) input */ + /* Read xa (real), ya (imag) input */ A = *(v2s *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2s *)&pIn[i2 * 2U]; asm volatile("addi %[s1], zero, 0x01;" "slli %[s1], %[s1], 0x10;" @@ -134,6 +134,10 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, "pv.sra.h %[D],%[D],%[s2];" "pv.sra.h %[A],%[A],%[s2];" "pv.sra.h %[C],%[C],%[s2];" + /* G = (xb + xd), (yb + yd) */ + /* H = (xb - xd), (yb - yd) */ + /* E = (xa + xc), (ya + yc) */ + /* F = (xa - xc), (ya - yc) */ "pv.add.h %[G],%[B],%[D];" "pv.sub.h %[H],%[B],%[D];"
"pv.add.h %[E],%[A],%[C];" @@ -144,11 +148,23 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, "pv.sra.h %[B],%[G],%[s1];" "sub %[t3],zero,%[t1];" "sub %[t4],zero,%[t0];" + /* C = (yb - yd), (xd - xb) */ + /* D = (yd - yb), (xb - xd) */ "pv.pack %[C],%[t0],%[t3];" "pv.pack %[D],%[t4],%[t1];" + /* E = (xa + xc - xb - xd), (ya + yc - yb - yd) */ + /* G = (xa - xc + yb - yd), (ya - yc + xd - xb) */ + /* H = (xa - xc + yd - yb), (ya - yc + xb - xd) */ + /* A = (xa + xc + xb + xd), (ya + yc + yb + yd) */ "pv.sub.h %[E],%[E],%[G];" "pv.add.h %[G],%[F],%[C];" "pv.add.h %[H],%[F],%[D];" + /* t0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd) */ + /* t1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd) */ + /* t2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd) */ + /* t3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd) */ + /* t4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb) */ + /* t5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb) */ "pv.dotsp.h %[C],%[CoSi2],%[E];" "pv.dotsp.h %[D],%[C2],%[E];" "pv.dotsp.h %[E],%[CoSi1],%[H];" @@ -295,17 +311,21 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, *((v2s *)&pOut[i3_store * 2U]) = C; #else v2s s1; - /* Read yb (real), xb(imag) input */ + /* Read xb (real), yb(imag) input */ B = *(v2s *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2s *)&pIn[i3 * 2U]; - /* Read ya (real), xa(imag) input */ + /* Read xa (real), ya(imag) input */ A = *(v2s *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2s *)&pIn[i2 * 2U]; asm volatile("addi %[s1], zero, 0x01;" "slli %[s1], %[s1], 0x10;" "addi %[s1], %[s1], 0x01;" + /* G = (xb + xd), (yb + yd) */ + /* H = (xb - xd), (yb - yd) */ + /* E = (xa + xc), (ya + yc) */ + /* F = (xa - xc), (ya - yc) */ "pv.add.h %[G],%[B],%[D];" "pv.sub.h %[H],%[B],%[D];" "pv.add.h %[E],%[A],%[C];" @@ -314,6 +334,8 @@
static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, "pv.sra.h %[H],%[H],%[s1];" "pv.sra.h %[E],%[E],%[s1];" "pv.sra.h %[F],%[F],%[s1];" + /* A = (yb - yd), (xd - xb) */ + /* B = (yd - yb), (xb - xd) */ "pv.extract.h %[t0],%[H],0;" "pv.extract.h %[t1],%[H],1;" "pv.sub.h %[C],%[E],%[G];" @@ -325,6 +347,10 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, "pv.sra.h %[D],%[D],%[s1];" "pv.add.h %[E],%[F],%[A];" "pv.add.h %[F],%[F],%[B];" + /* C = (xa + xc - xb - xd), (ya + yc - yb - yd) */ + /* D = (xa + xc + xb + xd), (ya + yc + yb + yd) */ + /* E = (xa - xc + yb - yd), (ya - yc + xd - xb) */ + /* F = (xa - xc + yd - yb), (ya - yc + xb - xd) */ "pv.dotsp.h %[G],%[CoSi2],%[C];" "pv.dotsp.h %[H],%[C2],%[C];" "pv.dotsp.h %[A],%[CoSi1],%[F];" @@ -443,13 +469,13 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, *((v2s *)&pOut[i2_store * 2U]) = A; *((v2s *)&pOut[i3_store * 2U]) = B; #else - /* Read yb (real), xb(imag) input */ + /* Read xb (real), yb(imag) input */ B = *(v2s *)&pIn[i1 * 2U]; - /* Read yd (real), xd(imag) input */ + /* Read xd (real), yd(imag) input */ D = *(v2s *)&pIn[i3 * 2U]; - /* Read ya (real), xa(imag) input */ + /* Read xa (real), ya(imag) input */ A = *(v2s *)&pIn[i0 * 2U]; - /* Read yc (real), xc(imag) input */ + /* Read xc (real), yc(imag) input */ C = *(v2s *)&pIn[i2 * 2U]; int16_t t2, t3; v2s s1; @@ -457,6 +483,10 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, "addi %[s1], zero, 0x01;" "slli %[s1], %[s1], 0x10;" "addi %[s1], %[s1], 0x01;" + /* H = xb - xd, yb - yd */ + /* G = xb + xd, yb + yd */ + /* E = xa + xc, ya + yc */ + /* F = xa - xc, ya - yc */ "pv.sub.h %[H],%[B],%[D];" "pv.add.h %[G],%[B],%[D];" "pv.add.h %[E],%[A],%[C];" @@ -464,6 +494,8 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, "pv.sra.h %[H],%[H],%[s1];" "pv.sra.h %[G],%[G],%[s1];" "pv.sra.h %[E],%[E],%[s1];" + /* A = yd - yb, xb - xd */ + /* B = yb - yd, xd - 
xb */ "pv.extract.h %[t0],%[H],0;" "pv.extract.h %[t1],%[H],1;" "pv.sra.h %[F],%[F],%[s1];" @@ -471,6 +503,10 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, "sub %[t3], zero, %[t1];" "pv.pack %[A],%[t2],%[t1];" "pv.pack %[B],%[t0],%[t3];" + /* H = xa + xc + xb + xd */ + /* E = xa + xc - xb - xd */ + /* A = xa - xc + yd - yb */ + /* B = xa - xc + yb - yd */ "pv.add.h %[H],%[E],%[G];" "pv.sub.h %[E],%[E],%[G];" "pv.add.h %[A],%[F],%[A];" diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/runtime/kernel/mempool_radix4_cfft_f16p.h index 2076a108a..8682e6553 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_f16p.h +++ b/software/runtime/kernel/mempool_radix4_cfft_f16p.h @@ -7,6 +7,7 @@ #include "xpulp/builtins_v2.h" #define MIN(x, y) (((x) < (y)) ? (x) : (y)) +// CoSi: (Si, Co) -> C: (Co, -Si) #define SHUFFLE_TWIDDLEFACT \ asm volatile("pv.extract.h %[t1],%[CoSi1],0;" \ "pv.extract.h %[t3],%[CoSi2],0;" \ @@ -14,12 +15,12 @@ "pv.extract.h %[t0],%[CoSi1],1;" \ "pv.extract.h %[t2],%[CoSi2],1;" \ "pv.extract.h %[t4],%[CoSi3],1;" \ - "xor %[t1],%[t1],%[neg_mask];" \ - "xor %[t3],%[t3],%[neg_mask];" \ - "xor %[t5],%[t5],%[neg_mask];" \ - "pv.pack.h %[C1],%[t0],%[t1];" \ - "pv.pack.h %[C2],%[t2],%[t3];" \ - "pv.pack.h %[C3],%[t4],%[t5];" \ + "xor %[t0],%[t0],%[neg_mask];" \ + "xor %[t2],%[t2],%[neg_mask];" \ + "xor %[t4],%[t4],%[neg_mask];" \ + "pv.pack %[C1],%[t1],%[t0];" \ + "pv.pack %[C2],%[t3],%[t2];" \ + "pv.pack %[C3],%[t5],%[t4];" \ : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), \ [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), \ [t4] "=&r"(t4), [t5] "=&r"(t5) \ @@ -57,8 +58,6 @@ CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; #endif - - #ifdef FOLDED_TWIDDLES /** @brief Full FFT butterfly @@ -219,10 +218,10 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16, */ void mempool_radix4_cfft_f16p_scheduler( - __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, - __fp16 *pCoef_src, __fp16 
*pCoef_dst, __attribute__((unused)) - uint16_t *pBitRevTable, __attribute__((unused)) uint16_t bitReverseLen, - uint8_t bitReverseFlag, uint32_t nPE) { + __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, __fp16 *pCoef_src, + __fp16 *pCoef_dst, __attribute__((unused)) uint16_t *pBitRevTable, + __attribute__((unused)) uint16_t bitReverseLen, uint8_t bitReverseFlag, + uint32_t nPE) { uint32_t absolute_core_id = mempool_get_core_id(); uint32_t core_id = absolute_core_id % (fftLen >> 4U); diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h index e5380444c..dad8b6086 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h +++ b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h @@ -26,14 +26,14 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen, #ifndef ASM #define SWAP_ITEMS \ - addr1 = *(uint32_t *)&pBitRevTab[i]; \ - addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \ - addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \ - addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \ - addr1 = __SRA2(*(v2s*)&addr1, *(v2s*)&s2); \ - addr2 = __SRA2(*(v2s*)&addr2, *(v2s*)&s2); \ - addr3 = __SRA2(*(v2s*)&addr3, *(v2s*)&s2); \ - addr4 = __SRA2(*(v2s*)&addr4, *(v2s*)&s2); \ + addr1 = *(uint32_t *)&pBitRevTab[i]; \ + addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \ + addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \ + addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \ + addr1 = __SRA2(*(v2s *)&addr1, *(v2s *)&s2); \ + addr2 = __SRA2(*(v2s *)&addr2, *(v2s *)&s2); \ + addr3 = __SRA2(*(v2s *)&addr3, *(v2s *)&s2); \ + addr4 = __SRA2(*(v2s *)&addr4, *(v2s *)&s2); \ a1 = addr1[1]; \ a2 = addr2[1]; \ a3 = addr3[1]; \ @@ -42,28 +42,28 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen, b2 = addr2[0]; \ b3 = addr3[0]; \ b4 = addr4[0]; \ - tmpa1 = *(uint32_t *)&pSrc[a1]; \ - tmpa2 = *(uint32_t *)&pSrc[a2]; \ - tmpa3 = *(uint32_t *)&pSrc[a3]; \ - tmpa4 = *(uint32_t 
*)&pSrc[a4]; \ - tmpb1 = *(uint32_t *)&pSrc[b1]; \ - tmpb2 = *(uint32_t *)&pSrc[b2]; \ - tmpb3 = *(uint32_t *)&pSrc[b3]; \ - tmpb4 = *(uint32_t *)&pSrc[b4]; \ - *((uint32_t *)&pSrc[a1]) = tmpb1; \ - *((uint32_t *)&pSrc[a2]) = tmpb2; \ - *((uint32_t *)&pSrc[a3]) = tmpb3; \ - *((uint32_t *)&pSrc[a4]) = tmpb4; \ - *((uint32_t *)&pSrc[b1]) = tmpa1; \ - *((uint32_t *)&pSrc[b2]) = tmpa2; \ - *((uint32_t *)&pSrc[b3]) = tmpa3; \ + tmpa1 = *(uint32_t *)&pSrc[a1]; \ + tmpa2 = *(uint32_t *)&pSrc[a2]; \ + tmpa3 = *(uint32_t *)&pSrc[a3]; \ + tmpa4 = *(uint32_t *)&pSrc[a4]; \ + tmpb1 = *(uint32_t *)&pSrc[b1]; \ + tmpb2 = *(uint32_t *)&pSrc[b2]; \ + tmpb3 = *(uint32_t *)&pSrc[b3]; \ + tmpb4 = *(uint32_t *)&pSrc[b4]; \ + *((uint32_t *)&pSrc[a1]) = tmpb1; \ + *((uint32_t *)&pSrc[a2]) = tmpb2; \ + *((uint32_t *)&pSrc[a3]) = tmpb3; \ + *((uint32_t *)&pSrc[a4]) = tmpb4; \ + *((uint32_t *)&pSrc[b1]) = tmpa1; \ + *((uint32_t *)&pSrc[b2]) = tmpa2; \ + *((uint32_t *)&pSrc[b3]) = tmpa3; \ *((uint32_t *)&pSrc[b4]) = tmpa4; #else #define SWAP_ITEMS \ - addr1 = *(uint32_t *)&pBitRevTab[i]; \ - addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \ - addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \ - addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \ + addr1 = *(uint32_t *)&pBitRevTab[i]; \ + addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \ + addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \ + addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \ asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" \ "pv.sra.h %[addr2],%[addr2],%[s2];" \ "pv.sra.h %[addr3],%[addr3],%[s2];" \ @@ -82,21 +82,21 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen, [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) \ : [s2] "r"(s2) \ :); \ - tmpa1 = *(uint32_t *)&pSrc[a1]; \ - tmpa2 = *(uint32_t *)&pSrc[a2]; \ - tmpa3 = *(uint32_t *)&pSrc[a3]; \ - tmpa4 = *(uint32_t *)&pSrc[a4]; \ - tmpb1 = *(uint32_t *)&pSrc[b1]; \ - tmpb2 = *(uint32_t *)&pSrc[b2]; \ - tmpb3 = *(uint32_t *)&pSrc[b3]; \ - tmpb4 = *(uint32_t *)&pSrc[b4]; \ - *((uint32_t 
*)&pSrc[a1]) = tmpb1; \ - *((uint32_t *)&pSrc[a2]) = tmpb2; \ - *((uint32_t *)&pSrc[a3]) = tmpb3; \ - *((uint32_t *)&pSrc[a4]) = tmpb4; \ - *((uint32_t *)&pSrc[b1]) = tmpa1; \ - *((uint32_t *)&pSrc[b2]) = tmpa2; \ - *((uint32_t *)&pSrc[b3]) = tmpa3; \ + tmpa1 = *(uint32_t *)&pSrc[a1]; \ + tmpa2 = *(uint32_t *)&pSrc[a2]; \ + tmpa3 = *(uint32_t *)&pSrc[a3]; \ + tmpa4 = *(uint32_t *)&pSrc[a4]; \ + tmpb1 = *(uint32_t *)&pSrc[b1]; \ + tmpb2 = *(uint32_t *)&pSrc[b2]; \ + tmpb3 = *(uint32_t *)&pSrc[b3]; \ + tmpb4 = *(uint32_t *)&pSrc[b4]; \ + *((uint32_t *)&pSrc[a1]) = tmpb1; \ + *((uint32_t *)&pSrc[a2]) = tmpb2; \ + *((uint32_t *)&pSrc[a3]) = tmpb3; \ + *((uint32_t *)&pSrc[a4]) = tmpb4; \ + *((uint32_t *)&pSrc[b1]) = tmpa1; \ + *((uint32_t *)&pSrc[b2]) = tmpa2; \ + *((uint32_t *)&pSrc[b3]) = tmpa3; \ *((uint32_t *)&pSrc[b4]) = tmpa4; #endif