Skip to content

Commit

Permalink
Audio: Mixin_mixout: Add HiFi5 implementation.
Browse files Browse the repository at this point in the history
Add HiFi5 implementation of mix functions, compared with
HiFi3 version, can reduce about 27% cycles.

Signed-off-by: Andrula Song <andrula.song@intel.com>
  • Loading branch information
andrula-song authored and lgirdwood committed Feb 27, 2024
1 parent ab87904 commit 0b3b574
Show file tree
Hide file tree
Showing 7 changed files with 326 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/audio/mixin_mixout/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c)
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c mixin_mixout_hifi5.c)
37 changes: 37 additions & 0 deletions src/audio/mixin_mixout/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,40 @@ config COMP_MIXIN_MIXOUT
default y
help
Select for Mixin_mixout component

choice "MIXIN_MIXOUT_SIMD_LEVEL_SELECT"
prompt "choose which SIMD level used for MIXIN_MIXOUT module"
depends on COMP_MIXIN_MIXOUT
default MIXIN_MIXOUT_HIFI_MAX

config MIXIN_MIXOUT_HIFI_MAX
prompt "Max HiFi level available in the toolchain"
bool
help
When this was selected, optimization level will be determined
by toolchain.

config MIXIN_MIXOUT_HIFI_5
prompt "choose HIFI4 intrinsic optimized MIXIN_MIXOUT module"
bool
help
This option used to build HIFI4 optimized MIXIN_MIXOUT code

config MIXIN_MIXOUT_HIFI_4
prompt "choose HIFI4 intrinsic optimized MIXIN_MIXOUT module"
bool
help
This option used to build HIFI4 optimized MIXIN_MIXOUT code

config MIXIN_MIXOUT_HIFI_3
prompt "choose HIFI3 intrinsic optimized MIXIN_MIXOUT module"
bool
help
This option used to build HIFI3 intrinsic optimized MIXIN_MIXOUT code

config MIXIN_MIXOUT_HIFI_NONE
prompt "choose generic C MIXIN_MIXOUT module, no HIFI SIMD involved"
bool
help
This option used to build MIXIN_MIXOUT generic code.
endchoice
12 changes: 0 additions & 12 deletions src/audio/mixin_mixout/mixin_mixout.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,6 @@
#include <sof/platform.h>
#include <stddef.h>

#define MIXIN_MIXOUT_GENERIC

#if defined(__XCC__)

#include <xtensa/config/core-isa.h>
#if XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4
#undef MIXIN_MIXOUT_GENERIC
#define MIXIN_MIXOUT_HIFI3
#endif

#endif

enum ipc4_mixin_config_param {
/* large_config_set param id for ipc4_mixer_mode_config */
IPC4_MIXER_MODE = 1
Expand Down
2 changes: 1 addition & 1 deletion src/audio/mixin_mixout/mixin_mixout_generic.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

#include "mixin_mixout.h"

#ifdef MIXIN_MIXOUT_GENERIC
#if SOF_USE_HIFI(NONE, MIXIN_MIXOUT)

#if CONFIG_FORMAT_S16LE
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
Expand Down
2 changes: 1 addition & 1 deletion src/audio/mixin_mixout/mixin_mixout_hifi3.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#include "mixin_mixout.h"

#ifdef MIXIN_MIXOUT_HIFI3
#if SOF_USE_HIFI(3, MIXIN_MIXOUT) || SOF_USE_HIFI(4, MIXIN_MIXOUT)

#if CONFIG_FORMAT_S16LE
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
Expand Down
285 changes: 285 additions & 0 deletions src/audio/mixin_mixout/mixin_mixout_hifi5.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
// SPDX-License-Identifier: BSD-3-Clause
//
// Copyright(c) 2024 Intel Corporation. All rights reserved.
//
// Author: Andrula Song <xiaoyuan.song@intel.com>

#include <sof/common.h>

#include "mixin_mixout.h"

#if SOF_USE_HIFI(5, MIXIN_MIXOUT)

#if CONFIG_FORMAT_S16LE
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
const struct cir_buf_ptr *source,
int32_t sample_count, uint16_t gain)
{
int samples_to_mix, samples_to_copy, left_samples;
int n, nmax, i, m, left;
ae_int16x4 in_sample, in_sample1;
ae_int16x4 out_sample, out_sample1;
ae_int16x8 *in;
ae_int16x8 *out;
ae_valignx2 inu = AE_ZALIGN128();
ae_valignx2 outu1 = AE_ZALIGN128();
ae_valignx2 outu2 = AE_ZALIGN128();
/* cir_buf_wrap() is required and is done below in a loop */
ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample;
ae_int16 *src = source->ptr;

assert(mixed_samples >= start_sample);
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
samples_to_copy = sample_count - samples_to_mix;
n = 0;

for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
/* calculate the remaining samples*/
nmax = (ae_int16 *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (ae_int16 *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int16x8 *)src;
out = (ae_int16x8 *)dst;
inu = AE_LA128_PP(in);
outu1 = AE_LA128_PP(out);
m = n >> 3;
left = n & 0x07;
/* process 8 samples per loop */
for (i = 0; i < m; i++) {
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out);
out--;
out_sample = AE_ADD16S(in_sample, out_sample);
out_sample1 = AE_ADD16S(in_sample1, out_sample1);
AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);

/* process the left samples that less than 8
* one by one to avoid memory access overrun
*/
for (i = 0; i < left ; i++) {
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
AE_L16_IP(out_sample, (ae_int16 *)out, 0);
out_sample = AE_ADD16S(in_sample, out_sample);
AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
}
}

for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
/* calculate the remaining samples*/
nmax = (ae_int16 *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (ae_int16 *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int16x8 *)src;
out = (ae_int16x8 *)dst;
inu = AE_LA128_PP(in);
m = n >> 3;
left = n & 0x07;
/* process 8 frames per loop */
for (i = 0; i < m; i++) {
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);

/* process the left samples that less than 8
* one by one to avoid memory access overrun
*/
for (i = 0; i < left ; i++) {
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
}
}
}
#endif /* CONFIG_FORMAT_S16LE */

#if CONFIG_FORMAT_S24LE
static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
const struct cir_buf_ptr *source,
int32_t sample_count, uint16_t gain)
{
int samples_to_mix, samples_to_copy, left_samples;
int n, nmax, i, m, left;
ae_int32x2 in_sample, in_sample1;
ae_int32x2 out_sample, out_sample1;
ae_int32x4 *in;
ae_int32x4 *out;
ae_valignx2 inu = AE_ZALIGN128();
ae_valignx2 outu1 = AE_ZALIGN128();
ae_valignx2 outu2 = AE_ZALIGN128();
/* cir_buf_wrap() is required and is done below in a loop */
int32_t *dst = (int32_t *)sink->ptr + start_sample;
int32_t *src = source->ptr;

assert(mixed_samples >= start_sample);
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
samples_to_copy = sample_count - samples_to_mix;
n = 0;

for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
/* calculate the remaining samples*/
nmax = (int32_t *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (int32_t *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int32x4 *)src;
out = (ae_int32x4 *)dst;
inu = AE_LA128_PP(in);
outu1 = AE_LA128_PP(out);
m = n >> 2;
left = n & 3;
/* process 2 samples per time */
for (i = 0; i < m; i++) {
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
out--;
out_sample = AE_ADD24S(in_sample, out_sample);
out_sample1 = AE_ADD24S(in_sample1, out_sample1);
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);

/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
out_sample = AE_ADD24S(in_sample, out_sample);
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}

for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
nmax = (int32_t *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (int32_t *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int32x4 *)src;
out = (ae_int32x4 *)dst;
inu = AE_LA128_PP(in);
m = n >> 2;
left = n & 3;
for (i = 0; i < m; i++) {
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
}

#endif /* CONFIG_FORMAT_S24LE */

#if CONFIG_FORMAT_S32LE
static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
const struct cir_buf_ptr *source,
int32_t sample_count, uint16_t gain)
{
int samples_to_mix, samples_to_copy, left_samples;
int n, nmax, i, m, left;
ae_int32x2 in_sample, in_sample1;
ae_int32x2 out_sample, out_sample1;
ae_int32x4 *in;
ae_int32x4 *out;
ae_valignx2 inu = AE_ZALIGN128();
ae_valignx2 outu1 = AE_ZALIGN128();
ae_valignx2 outu2 = AE_ZALIGN128();
/* cir_buf_wrap() is required and is done below in a loop */
int32_t *dst = (int32_t *)sink->ptr + start_sample;
int32_t *src = source->ptr;

assert(mixed_samples >= start_sample);
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
samples_to_copy = sample_count - samples_to_mix;
n = 0;

for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
/* calculate the remaining samples*/
nmax = (int32_t *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (int32_t *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int32x4 *)src;
out = (ae_int32x4 *)dst;
inu = AE_LA128_PP(in);
outu1 = AE_LA128_PP(out);
m = n >> 2;
left = n & 3;
for (i = 0; i < m; i++) {
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
out--;
out_sample = AE_ADD32S(in_sample, out_sample);
out_sample1 = AE_ADD32S(in_sample1, out_sample1);
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);

/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
out_sample = AE_ADD32S(in_sample, out_sample);
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}

for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
/* calculate the remaining samples*/
nmax = (int32_t *)source->buf_end - src;
n = AE_MIN32(left_samples, nmax);
nmax = (int32_t *)sink->buf_end - dst;
n = AE_MIN32(n, nmax);
in = (ae_int32x4 *)src;
out = (ae_int32x4 *)dst;
inu = AE_LA128_PP(in);
m = n >> 2;
left = n & 3;
for (i = 0; i < m; i++) {
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
}
AE_SA128POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
}

#endif /* CONFIG_FORMAT_S32LE */

const struct mix_func_map mix_func_map[] = {
#if CONFIG_FORMAT_S16LE
{ SOF_IPC_FRAME_S16_LE, mix_s16 },
#endif
#if CONFIG_FORMAT_S24LE
{ SOF_IPC_FRAME_S24_4LE, mix_s24 },
#endif
#if CONFIG_FORMAT_S32LE
{ SOF_IPC_FRAME_S32_LE, mix_s32 }
#endif
};

const size_t mix_count = ARRAY_SIZE(mix_func_map);

#endif
1 change: 1 addition & 0 deletions zephyr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,7 @@ zephyr_library_sources_ifdef(CONFIG_COMP_MIXIN_MIXOUT
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_generic.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi3.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi5.c
)

zephyr_library_sources_ifdef(CONFIG_COMP_TONE
Expand Down

0 comments on commit 0b3b574

Please sign in to comment.