From 568a4c7b75cf07845dc4ba511b89157a6a065fee Mon Sep 17 00:00:00 2001 From: Andrula Song Date: Thu, 4 Jan 2024 14:24:53 +0800 Subject: [PATCH 1/3] Audio: Component: Add HiFi5 implementation of audio_stream_copy Add HiFi5 implementation if function audio_stream_copy, compared with HiFi3 version, the HiFi5 method can save about 29% cycles. Signed-off-by: Andrula Song --- src/audio/component.c | 58 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/src/audio/component.c b/src/audio/component.c index 4358e2296523..0312c39ef339 100644 --- a/src/audio/component.c +++ b/src/audio/component.c @@ -24,8 +24,10 @@ #if defined(__XCC__) #include -#if XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4 -#define STREAMCOPY_HIFI3 +# if XCHAL_HAVE_HIFI5 +# define STREAMCOPY_HIFI5 +# elif XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4 +# define STREAMCOPY_HIFI3 #endif #endif @@ -182,7 +184,57 @@ void comp_get_copy_limits_frame_aligned(const struct comp_buffer *source, cl->sink_bytes = cl->frames * cl->sink_frame_bytes; } -#ifdef STREAMCOPY_HIFI3 +#if defined(STREAMCOPY_HIFI5) + +#include + +int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, + struct audio_stream *sink, uint32_t ooffset, uint32_t samples) +{ + int ssize = audio_stream_sample_bytes(source); /* src fmt == sink fmt */ + ae_int16x8 *src = (ae_int16x8 *)((int8_t *)audio_stream_get_rptr(source) + ioffset * ssize); + ae_int16x8 *dst = (ae_int16x8 *)((int8_t *)audio_stream_get_wptr(sink) + ooffset * ssize); + int shorts = samples * ssize >> 1; + int shorts_src; + int shorts_dst; + int shorts_copied; + int left, m, i; + ae_int16x4 in_sample1; + ae_int16x4 in_sample2; + ae_valignx2 inu; + ae_valignx2 outu = AE_ZALIGN128(); + + /* copy with 16bit as the minimum unit since the minimum sample size is 16 bit*/ + while (shorts > 0) { + src = audio_stream_wrap(source, src); + dst = audio_stream_wrap(sink, dst); + shorts_src = audio_stream_samples_without_wrap_s16(source, src); + shorts_dst = audio_stream_samples_without_wrap_s16(sink, dst); + shorts_copied = AE_MIN32(shorts_src, shorts_dst); + shorts_copied = AE_MIN32(shorts, shorts_copied); + m = shorts_copied >> 3; + left = shorts_copied & 0x07; + inu = AE_LA128_PP(src); + /* copy 4 * 16bit(8 bytes)per loop */ + for (i = 0; i < m; i++) { + AE_LA16X4X2_IP(in_sample1, in_sample2, inu, src); + AE_SA16X4X2_IP(in_sample1, in_sample2, outu, dst); + } + AE_SA128POS_FP(outu, dst); + + /* process the left bits that less than 4 * 16 */ + for (i = 0; i < left ; i++) { + AE_L16_IP(in_sample1, (ae_int16 *)src, sizeof(ae_int16)); + AE_S16_0_IP(in_sample1, (ae_int16 *)dst, sizeof(ae_int16)); + } + shorts -= shorts_copied; + } + return samples; +} + +#elif defined(STREAMCOPY_HIFI3) + +#include int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, struct audio_stream *sink, uint32_t ooffset, uint32_t samples) From cb20c2f31734efbacdb20b58e84d6b682d03febb Mon Sep 17 00:00:00 2001 From: Andrula Song Date: Fri, 8 Mar 2024 15:02:35 +0800 Subject: [PATCH 2/3] Audio: Component: Fix the potential risks of HiFi3 audio_stream_copy Use while (shorts > 0) instead of while (short) to reduce the forever loop risk. Use general instruction AE_MIN32 replace AE_MIN_32_signed which is an internal proto intended for Xtensa compiler. Signed-off-by: Andrula Song --- src/audio/component.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/audio/component.c b/src/audio/component.c index 0312c39ef339..222f980f29cb 100644 --- a/src/audio/component.c +++ b/src/audio/component.c @@ -252,13 +252,13 @@ int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, ae_valign outu = AE_ZALIGN64(); /* copy with 16bit as the minimum unit since the minimum sample size is 16 bit*/ - while (shorts) { + while (shorts > 0) { src = audio_stream_wrap(source, src); dst = audio_stream_wrap(sink, dst); shorts_src = audio_stream_samples_without_wrap_s16(source, src); shorts_dst = audio_stream_samples_without_wrap_s16(sink, dst); - shorts_copied = AE_MIN_32_signed(shorts_src, shorts_dst); - shorts_copied = AE_MIN_32_signed(shorts, shorts_copied); + shorts_copied = AE_MIN32(shorts_src, shorts_dst); + shorts_copied = AE_MIN32(shorts, shorts_copied); m = shorts_copied >> 2; left = shorts_copied & 0x03; inu = AE_LA64_PP(src); From 124eb29c939bb1b197c9be567f914141f0304f62 Mon Sep 17 00:00:00 2001 From: Andrula Song Date: Thu, 4 Jan 2024 14:34:35 +0800 Subject: [PATCH 3/3] Audio: Component: Add HiFi5 implementation of cir_buf_copy. Add HiFi3 & HiFi5 implementation of function cir_buf_copy. Compared with generic C version, the HiFi3 version can save about 3% cycles and HiFi5 version can save about 40% cycles. Signed-off-by: Andrula Song --- src/audio/component.c | 92 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/src/audio/component.c b/src/audio/component.c index 222f980f29cb..3acbc47f9e78 100644 --- a/src/audio/component.c +++ b/src/audio/component.c @@ -232,6 +232,50 @@ int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, return samples; } +void cir_buf_copy(void *src, void *src_addr, void *src_end, void *dst, + void *dst_addr, void *dst_end, size_t byte_size) +{ + size_t bytes = byte_size; + size_t bytes_src; + size_t bytes_dst; + size_t bytes_copied; + size_t short_copied; + int left, m, i; + ae_int16x4 in_sample1, in_sample2; + ae_valignx2 inu; + ae_valignx2 outu = AE_ZALIGN128(); + ae_int16x8 *in = (ae_int16x8 *)src; + ae_int16x8 *out = (ae_int16x8 *)dst; + + while (bytes) { + bytes_src = cir_buf_bytes_without_wrap(in, src_end); + bytes_dst = cir_buf_bytes_without_wrap(out, dst_end); + bytes_copied = MIN(bytes_src, bytes_dst); + bytes_copied = MIN(bytes, bytes_copied); + short_copied = bytes_copied >> 1; + + m = short_copied >> 3; + left = short_copied & 0x07; + inu = AE_LA128_PP(in); + /* copy 2 * 4 * 16bit(16 bytes)per loop */ + for (i = 0; i < m; i++) { + AE_LA16X4X2_IP(in_sample1, in_sample2, inu, in); + AE_SA16X4X2_IP(in_sample1, in_sample2, outu, out); + } + AE_SA128POS_FP(outu, out); + + /* process the left bits that less than 2 * 4 * 16 */ + for (i = 0; i < left ; i++) { + AE_L16_IP(in_sample1, (ae_int16 *)in, sizeof(ae_int16)); + AE_S16_0_IP(in_sample1, (ae_int16 *)out, sizeof(ae_int16)); + } + + bytes -= bytes_copied; + in = cir_buf_wrap(in, src_addr, src_end); + out = cir_buf_wrap(out, dst_addr, dst_end); + } +} + #elif defined(STREAMCOPY_HIFI3) #include @@ -279,6 +323,50 @@ int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, return samples; } +void cir_buf_copy(void *src, void *src_addr, void *src_end, void *dst, + void *dst_addr, void *dst_end, size_t byte_size) +{ + size_t bytes = byte_size; + size_t bytes_src; + size_t bytes_dst; + size_t bytes_copied; + size_t short_copied; + + int left, m, i; + ae_int16x4 in_sample = AE_ZERO16(); + ae_valign inu = AE_ZALIGN64(); + ae_valign outu = AE_ZALIGN64(); + ae_int16x4 *in = (ae_int16x4 *)src; + ae_int16x4 *out = (ae_int16x4 *)dst; + + while (bytes) { + bytes_src = cir_buf_bytes_without_wrap(in, src_end); + bytes_dst = cir_buf_bytes_without_wrap(out, dst_end); + bytes_copied = MIN(bytes_src, bytes_dst); + bytes_copied = MIN(bytes, bytes_copied); + short_copied = bytes_copied >> 1; + m = short_copied >> 2; + left = short_copied & 0x03; + inu = AE_LA64_PP(in); + /* copy 4 * 16bit(8 bytes)per loop */ + for (i = 0; i < m; i++) { + AE_LA16X4_IP(in_sample, inu, in); + AE_SA16X4_IP(in_sample, outu, out); + } + AE_SA64POS_FP(outu, out); + + /* process the left bits that less than 4 * 16 */ + for (i = 0; i < left ; i++) { + AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16)); + AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16)); + } + + bytes -= bytes_copied; + in = cir_buf_wrap(in, src_addr, src_end); + out = cir_buf_wrap(out, dst_addr, dst_end); + } +} + #else int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, @@ -308,8 +396,6 @@ int audio_stream_copy(const struct audio_stream *source, uint32_t ioffset, return samples; } -#endif - void cir_buf_copy(void *src, void *src_addr, void *src_end, void *dst, void *dst_addr, void *dst_end, size_t byte_size) { @@ -332,6 +418,8 @@ void cir_buf_copy(void *src, void *src_addr, void *src_end, void *dst, } } +#endif + void audio_stream_copy_from_linear(const void *linear_source, int ioffset, struct audio_stream *sink, int ooffset, unsigned int samples)