Audio: Dcblock: Add HiFi3 implementation of dcblock

Add a HiFi3 implementation of the dcblock processing functions. Compared with the generic C version, it saves about 48.1% of the cycles for the 16-bit format, 48.4% for the 24-bit format and 52.6% for the 32-bit format.

Signed-off-by: Andrula Song <andrula.song@intel.com>
marc-hb · Mar 21, 2023 · abd81a1 · abd81a1
1 parent 9aa2c13
commit abd81a1
Show file tree

Hide file tree

Showing 5 changed files with 198 additions and 1 deletion.
diff --git a/src/audio/CMakeLists.txt b/src/audio/CMakeLists.txt
@@ -172,7 +172,7 @@ set(src_sources src/src.c src/src_generic.c)
 set(asrc_sources asrc/asrc.c asrc/asrc_farrow.c asrc/asrc_farrow_generic.c)
 set(eq-fir_sources module_adapter/module_adapter.c module_adapter/module/generic.c eq_fir/eq_fir.c eq_fir/eq_fir_generic.c)
 set(eq-iir_sources module_adapter/module_adapter.c module_adapter/module/generic.c eq_iir/eq_iir.c)
-set(dcblock_sources dcblock/dcblock.c dcblock/dcblock_generic.c dcblock/dcblock_hifi4.c)
+set(dcblock_sources dcblock/dcblock.c dcblock/dcblock_generic.c dcblock/dcblock_hifi3.c dcblock/dcblock_hifi4.c)
 set(crossover_sources crossover/crossover.c crossover/crossover_generic.c)
 set(tdfb_sources tdfb/tdfb.c tdfb/tdfb_generic.c tdfb/tdfb_direction.c)
 set(drc_sources drc/drc.c drc/drc_generic.c drc/drc_math_generic.c)

diff --git a/src/audio/dcblock/CMakeLists.txt b/src/audio/dcblock/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_local_sources(sof dcblock.c)
 add_local_sources(sof dcblock_generic.c)
+add_local_sources(sof dcblock_hifi3.c)
 add_local_sources(sof dcblock_hifi4.c)
diff --git a/src/audio/dcblock/dcblock_hifi3.c b/src/audio/dcblock/dcblock_hifi3.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2022 Intel Corporation. All rights reserved.
+//
+// Author: Andrula Song <andrula.song@intel.com>
+
+#include <stdint.h>
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/dcblock/dcblock.h>
+
+#ifdef DCBLOCK_HIFI3
+
+#include <xtensa/tie/xt_hifi3.h>
+LOG_MODULE_DECLARE(dcblock, CONFIG_SOF_LOG_LEVEL);
+
+static inline ae_int32x2  dcblock_cal(ae_int32x2 R, ae_int32x2 state_x, ae_int32x2 state_y,
+				      ae_int32x2 sample)
+{
+	ae_int64 out, temp;
+
+	/* temp = R * state_y; R: Q2.30, state_y (y_prev): Q1.31, the result is Q2.62 */
+	temp = AE_MULF32S_LL(R, state_y);
+	out = AE_SUB64(AE_MOVAD32_L(sample), AE_MOVAD32_L(state_x)); /* sample - state_x */
+	/* shift the difference up by 31 bits to Q2.62, then add the product in temp */
+	out = AE_ADD64S(AE_SLAI64S(out, 31), temp);
+	/* shift the sum out to Q1.63 and round it to the 32 bit result that is returned */
+	return AE_ROUND32F64SSYM(AE_SLAI64S(out, 1));
+}
+
+/* Set up circular addressing for reading the component source buffer */
+static inline void dcblock_set_circular(const struct audio_stream __sparse_cache *source)
+{
+	/* Set source as circular buffer 0, the buffer the AE_L16_XC/AE_L32_XC loads wrap in */
+	AE_SETCBEGIN0(source->addr);
+	AE_SETCEND0(source->end_addr);
+}
+
+#if CONFIG_FORMAT_S16LE
+static void dcblock_s16_default(const struct comp_dev *dev,
+				const struct audio_stream __sparse_cache *source,
+				const struct audio_stream __sparse_cache *sink,
+				uint32_t frames) /* DC blocking filter for S16_LE frames */
+{
+	struct comp_data *cd = comp_get_drvdata(dev);
+	ae_int16 *src = (ae_int16 *)source->r_ptr;
+	ae_int16 *dst = (ae_int16 *)sink->w_ptr;
+	ae_int16 *in;
+	ae_int16 *out;
+	ae_int32x2 R, state_x, state_y, sample;
+	ae_int16x4 in_sample, out_sample;
+	int ch, i, n;
+	int nch = source->channels;
+	const int inc = nch * sizeof(ae_int16); /* byte step to the next frame */
+	int samples = nch * frames; /* samples to process, all channels */
+
+	dcblock_set_circular(source); /* source loads wrap via circular buffer 0 */
+	while (samples) {
+		n = audio_stream_samples_without_wrap_s16(sink, dst);
+		n = MIN(n, samples); /* chunk = min(sink no-wrap samples, samples left) */
+		for (ch = 0; ch < nch; ch++) {
+			in = src + ch; /* first sample of channel ch in this chunk */
+			out = dst + ch;
+			state_x = cd->state[ch].x_prev; /* resume from the saved filter state */
+			state_y = cd->state[ch].y_prev;
+			R = cd->R_coeffs[ch];
+			for (i = 0; i < n; i += nch) { /* one frame per iteration */
+				/* Load a 16 bit sample, then step in by inc with wrap */
+				AE_L16_XC(in_sample, in, inc);
+				/* put the 16 bit sample in the high 16 bits of a 32 bit lane */
+				sample = AE_CVT32X2F16_32(in_sample);
+				state_y = dcblock_cal(R, state_x, state_y, sample);
+				state_x = sample; /* x_prev for the next sample */
+				out_sample = AE_ROUND16X4F32SSYM(state_y, state_y);
+				AE_S16_0_XP(out_sample, out, inc); /* store, out += inc bytes */
+			}
+			cd->state[ch].x_prev = state_x; /* save the filter state */
+			cd->state[ch].y_prev = state_y;
+		}
+		samples -= n;
+		dst = audio_stream_wrap(sink, dst + n); /* advance past the chunk */
+		src = audio_stream_wrap(source, src + n);
+	}
+}
+#endif /* CONFIG_FORMAT_S16LE */
+
+#if CONFIG_FORMAT_S24LE
+static void dcblock_s24_default(const struct comp_dev *dev,
+				const struct audio_stream __sparse_cache *source,
+				const struct audio_stream __sparse_cache *sink,
+				uint32_t frames) /* DC blocking filter for S24_4LE frames */
+{
+	struct comp_data *cd = comp_get_drvdata(dev);
+	ae_int32 *src = (ae_int32 *)source->r_ptr;
+	ae_int32 *dst = (ae_int32 *)sink->w_ptr;
+	ae_int32 *in;
+	ae_int32 *out;
+	ae_int32x2 R, state_x, state_y;
+	ae_int32x2 in_sample, out_sample;
+	int ch, i, n;
+	int nch = source->channels;
+	const int inc = nch * sizeof(ae_int32); /* byte step to the next frame */
+	int samples = nch * frames; /* samples to process, all channels */
+
+	dcblock_set_circular(source); /* source loads wrap via circular buffer 0 */
+	while (samples) {
+		n = audio_stream_samples_without_wrap_s24(sink, dst);
+		n = MIN(n, samples); /* chunk = min(sink no-wrap samples, samples left) */
+		for (ch = 0; ch < nch; ch++) {
+			in = src + ch; /* first sample of channel ch in this chunk */
+			out = dst + ch;
+			state_x = cd->state[ch].x_prev; /* resume from the saved filter state */
+			state_y = cd->state[ch].y_prev;
+			R = cd->R_coeffs[ch];
+			for (i = 0; i < n; i += nch) { /* one frame per iteration */
+				AE_L32_XC(in_sample, in, inc); /* circular load, in += inc */
+				in_sample = AE_SLAI32(in_sample, 8); /* scale 24 -> 32 bit */
+				state_y = dcblock_cal(R, state_x, state_y, in_sample);
+				state_x = in_sample; /* x_prev for the next sample */
+				out_sample = AE_SRAI32R(state_y, 8); /* 32 -> 24 bit, rounded */
+				out_sample = AE_SLAI32S(out_sample, 8); /* up, saturating */
+				out_sample = AE_SRAI32R(out_sample, 8); /* clamped 24 bit */
+				AE_S32_L_XP(out_sample, out, inc); /* store, out += inc bytes */
+			}
+			cd->state[ch].x_prev = state_x; /* save the filter state */
+			cd->state[ch].y_prev = state_y;
+		}
+		samples -= n;
+		dst = audio_stream_wrap(sink, dst + n); /* advance past the chunk */
+		src = audio_stream_wrap(source, src + n);
+	}
+}
+#endif /* CONFIG_FORMAT_S24LE */
+
+#if CONFIG_FORMAT_S32LE
+static void dcblock_s32_default(const struct comp_dev *dev,
+				const struct audio_stream __sparse_cache *source,
+				const struct audio_stream __sparse_cache *sink,
+				uint32_t frames) /* DC blocking filter for S32_LE frames */
+{
+	struct comp_data *cd = comp_get_drvdata(dev);
+	ae_int32 *src = (ae_int32 *)source->r_ptr;
+	ae_int32 *dst = (ae_int32 *)sink->w_ptr;
+	ae_int32 *in;
+	ae_int32 *out;
+	ae_int32x2 R, state_x, state_y;
+	ae_int32x2 in_sample;
+	int ch, i, n;
+	int nch = source->channels;
+	const int inc = nch * sizeof(ae_int32); /* byte step to the next frame */
+	int samples = nch * frames; /* samples to process, all channels */
+
+	dcblock_set_circular(source); /* source loads wrap via circular buffer 0 */
+	while (samples) {
+		n = audio_stream_samples_without_wrap_s32(sink, dst);
+		n = MIN(n, samples); /* chunk = min(sink no-wrap samples, samples left) */
+		for (ch = 0; ch < nch; ch++) {
+			in = src + ch; /* first sample of channel ch in this chunk */
+			out = dst + ch;
+			state_x = cd->state[ch].x_prev; /* resume from the saved filter state */
+			state_y = cd->state[ch].y_prev;
+			R = cd->R_coeffs[ch];
+			for (i = 0; i < n; i += nch) { /* one frame per iteration */
+				AE_L32_XC(in_sample, in, inc); /* circular load, in += inc */
+				state_y = dcblock_cal(R, state_x, state_y, in_sample);
+				state_x = in_sample; /* x_prev for the next sample */
+				AE_S32_L_XP(state_y, out, inc); /* store, out += inc bytes */
+			}
+			cd->state[ch].x_prev = state_x; /* save the filter state */
+			cd->state[ch].y_prev = state_y;
+		}
+		samples -= n;
+		dst = audio_stream_wrap(sink, dst + n); /* advance past the chunk */
+		src = audio_stream_wrap(source, src + n);
+	}
+}
+#endif /* CONFIG_FORMAT_S32LE */
+
+const struct dcblock_func_map dcblock_fnmap[] = {
+/* { SOURCE_FORMAT , PROCESSING FUNCTION }, entries exist only for enabled formats */
+#if CONFIG_FORMAT_S16LE
+	{ SOF_IPC_FRAME_S16_LE, dcblock_s16_default },
+#endif /* CONFIG_FORMAT_S16LE */
+#if CONFIG_FORMAT_S24LE
+	{ SOF_IPC_FRAME_S24_4LE, dcblock_s24_default },
+#endif /* CONFIG_FORMAT_S24LE */
+#if CONFIG_FORMAT_S32LE
+	{ SOF_IPC_FRAME_S32_LE, dcblock_s32_default },
+#endif /* CONFIG_FORMAT_S32LE */
+};
+
+const size_t dcblock_fncount = ARRAY_SIZE(dcblock_fnmap); /* number of entries above */
+#endif
diff --git a/src/include/sof/audio/dcblock/dcblock.h b/src/include/sof/audio/dcblock/dcblock.h
@@ -18,6 +18,8 @@
 # include <xtensa/config/core-isa.h>
 # if XCHAL_HAVE_HIFI4
 #  define DCBLOCK_HIFI4
+# elif XCHAL_HAVE_HIFI3
+#  define DCBLOCK_HIFI3
 # else
 #  define DCBLOCK_GENERIC
 # endif

diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt
@@ -467,6 +467,7 @@ zephyr_library_sources_ifdef(CONFIG_COMP_ASRC
 zephyr_library_sources_ifdef(CONFIG_COMP_DCBLOCK
 	${SOF_AUDIO_PATH}/dcblock/dcblock_generic.c
 	${SOF_AUDIO_PATH}/dcblock/dcblock.c
+	${SOF_AUDIO_PATH}/dcblock/dcblock_hifi3.c
 	${SOF_AUDIO_PATH}/dcblock/dcblock_hifi4.c
 )