Adjust RdCostARM.h to compile on 32-bit Arm platforms #456

Merged
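In short, the change drops the REAL_TARGET_AARCH64 gate around the Neon HAD/SAD kernels and swaps the AArch64-only intrinsics (vpaddq_s16, vzip1q_s16/vzip2q_s16, vzip1q_s32/vzip2q_s32, vmovl_high_s16, vaddvq_s32, vaddlvq_s16) for forms that also exist on 32-bit Arm, either directly (vzipq_s16(...).val[0/1], vmovl_s16(vget_low/high_s16(...))) or through small helpers added to neon/sum_neon.h. The helpers keep the single-instruction reductions on AArch64 behind an #if. A minimal sketch of that pattern follows; the body of the existing horizontal_add_s32x4 helper is collapsed in the diff below, so the implementation shown here is an assumption for illustration, not a copy of it:

#include <arm_neon.h>

// Assumed implementation of the existing helper, shown only to illustrate the pattern.
static inline int horizontal_add_s32x4( const int32x4_t a )
{
#if REAL_TARGET_AARCH64
  return vaddvq_s32( a );                                            // one-instruction reduction, AArch64 only
#else
  int32x2_t sum = vadd_s32( vget_low_s32( a ), vget_high_s32( a ) ); // fold the upper half onto the lower half
  sum = vpadd_s32( sum, sum );                                       // pairwise add folds the last two lanes
  return vget_lane_s32( sum, 0 );
#endif
}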
158 changes: 75 additions & 83 deletions source/Lib/CommonLib/arm/RdCostARM.h
@@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.

#include "CommonDefARM.h"
#include "CommonLib/CommonDef.h"
#include "neon/sum_neon.h"
#include "../RdCost.h"

#if defined( TARGET_SIMD_X86 )
@@ -69,7 +70,6 @@ namespace vvenc
{

#if defined( TARGET_SIMD_ARM )
- #if REAL_TARGET_AARCH64

// The xGetHADs_ARMSIMD functions depend on the SIMDe kernels being enabled
// during compilation.
@@ -103,8 +103,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -135,8 +135,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -167,8 +167,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -199,8 +199,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -231,8 +231,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -263,8 +263,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -295,8 +295,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -327,8 +327,8 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
r2 = vaddq_s16( r2, r4 );
r3 = vaddq_s16( r3, r5 );

- r0 = vpaddq_s16( r0, r2 );
- r1 = vpaddq_s16( r1, r3 );
+ r0 = pairwise_add_s16x8( r0, r2 );
+ r1 = pairwise_add_s16x8( r1, r3 );

r0 = vaddq_s16( r0, vdupq_n_s16( 2 ) );
r1 = vaddq_s16( r1, vdupq_n_s16( 2 ) );
@@ -369,39 +369,39 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
m1[6].val[0] = vaddq_s16( m2[6].val[0], m2[7].val[0] );
m1[7].val[0] = vsubq_s16( m2[6].val[0], m2[7].val[0] ); // 14 bit

- m2[0].val[0] = vzip1q_s16( m1[0].val[0], m1[1].val[0] );
- m2[1].val[0] = vzip1q_s16( m1[2].val[0], m1[3].val[0] );
- m2[2].val[0] = vzip2q_s16( m1[0].val[0], m1[1].val[0] );
- m2[3].val[0] = vzip2q_s16( m1[2].val[0], m1[3].val[0] );
- m2[4].val[0] = vzip1q_s16( m1[4].val[0], m1[5].val[0] );
- m2[5].val[0] = vzip1q_s16( m1[6].val[0], m1[7].val[0] );
- m2[6].val[0] = vzip2q_s16( m1[4].val[0], m1[5].val[0] );
- m2[7].val[0] = vzip2q_s16( m1[6].val[0], m1[7].val[0] );
-
- m1[0].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ) );
- m1[1].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ) );
- m1[2].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ) );
- m1[3].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ) );
- m1[4].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ) );
- m1[5].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ) );
- m1[6].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ) );
- m1[7].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ) );
-
- m3[0].val[1] = vmovl_high_s16( m1[0].val[0] );
+ m2[0].val[0] = vzipq_s16( m1[0].val[0], m1[1].val[0] ).val[0];
+ m2[1].val[0] = vzipq_s16( m1[2].val[0], m1[3].val[0] ).val[0];
+ m2[2].val[0] = vzipq_s16( m1[0].val[0], m1[1].val[0] ).val[1];
+ m2[3].val[0] = vzipq_s16( m1[2].val[0], m1[3].val[0] ).val[1];
+ m2[4].val[0] = vzipq_s16( m1[4].val[0], m1[5].val[0] ).val[0];
+ m2[5].val[0] = vzipq_s16( m1[6].val[0], m1[7].val[0] ).val[0];
+ m2[6].val[0] = vzipq_s16( m1[4].val[0], m1[5].val[0] ).val[1];
+ m2[7].val[0] = vzipq_s16( m1[6].val[0], m1[7].val[0] ).val[1];
+
+ m1[0].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ).val[0] );
+ m1[1].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ).val[1] );
+ m1[2].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ).val[0] );
+ m1[3].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ).val[1] );
+ m1[4].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ).val[0] );
+ m1[5].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ).val[1] );
+ m1[6].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ).val[0] );
+ m1[7].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ).val[1] );
+
+ m3[0].val[1] = vmovl_s16( vget_high_s16( m1[0].val[0] ) );
m3[0].val[0] = vmovl_s16( vget_low_s16( m1[0].val[0] ) );
- m3[1].val[1] = vmovl_high_s16( m1[1].val[0] );
+ m3[1].val[1] = vmovl_s16( vget_high_s16( m1[1].val[0] ) );
m3[1].val[0] = vmovl_s16( vget_low_s16( m1[1].val[0] ) );
- m3[2].val[1] = vmovl_high_s16( m1[2].val[0] );
+ m3[2].val[1] = vmovl_s16( vget_high_s16( m1[2].val[0] ) );
m3[2].val[0] = vmovl_s16( vget_low_s16( m1[2].val[0] ) );
- m3[3].val[1] = vmovl_high_s16( m1[3].val[0] );
+ m3[3].val[1] = vmovl_s16( vget_high_s16( m1[3].val[0] ) );
m3[3].val[0] = vmovl_s16( vget_low_s16( m1[3].val[0] ) );
- m3[4].val[1] = vmovl_high_s16( m1[4].val[0] );
+ m3[4].val[1] = vmovl_s16( vget_high_s16( m1[4].val[0] ) );
m3[4].val[0] = vmovl_s16( vget_low_s16( m1[4].val[0] ) );
- m3[5].val[1] = vmovl_high_s16( m1[5].val[0] );
+ m3[5].val[1] = vmovl_s16( vget_high_s16( m1[5].val[0] ) );
m3[5].val[0] = vmovl_s16( vget_low_s16( m1[5].val[0] ) );
- m3[6].val[1] = vmovl_high_s16( m1[6].val[0] );
+ m3[6].val[1] = vmovl_s16( vget_high_s16( m1[6].val[0] ) );
m3[6].val[0] = vmovl_s16( vget_low_s16( m1[6].val[0] ) );
- m3[7].val[1] = vmovl_high_s16( m1[7].val[0] );
+ m3[7].val[1] = vmovl_s16( vget_high_s16( m1[7].val[0] ) );
m3[7].val[0] = vmovl_s16( vget_low_s16( m1[7].val[0] ) );

m4[0].val[0] = m3[0].val[0];
@@ -503,7 +503,7 @@ static uint32_t xCalcHAD16x16_fast_Neon( const Pel *piOrg, const Pel *piCur, con
m3[4].val[0] = vaddq_s32( m3[4].val[0], m3[6].val[0] );
int32x4_t iSum = vaddq_s32( m3[0].val[0], m3[4].val[0] );

- uint32_t sad = ( uint32_t ) vaddvq_s32(iSum);
+ uint32_t sad = ( uint32_t ) horizontal_add_s32x4(iSum);
uint32_t absDc = vgetq_lane_s32( m4[0].val[0], 0 );
sad -= absDc;
sad += absDc >> 2;
@@ -612,39 +612,39 @@ static uint32_t xCalcHAD8x8_Neon( const Pel *piOrg, const Pel *piCur, const int
m1[6].val[0] = vaddq_s16( m2[6].val[0], m2[7].val[0] );
m1[7].val[0] = vsubq_s16( m2[6].val[0], m2[7].val[0] ); // 14 bit

- m2[0].val[0] = vzip1q_s16( m1[0].val[0], m1[1].val[0] );
- m2[1].val[0] = vzip1q_s16( m1[2].val[0], m1[3].val[0] );
- m2[2].val[0] = vzip2q_s16( m1[0].val[0], m1[1].val[0] );
- m2[3].val[0] = vzip2q_s16( m1[2].val[0], m1[3].val[0] );
- m2[4].val[0] = vzip1q_s16( m1[4].val[0], m1[5].val[0] );
- m2[5].val[0] = vzip1q_s16( m1[6].val[0], m1[7].val[0] );
- m2[6].val[0] = vzip2q_s16( m1[4].val[0], m1[5].val[0] );
- m2[7].val[0] = vzip2q_s16( m1[6].val[0], m1[7].val[0] );
-
- m1[0].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ) );
- m1[1].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ) );
- m1[2].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ) );
- m1[3].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ) );
- m1[4].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ) );
- m1[5].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ) );
- m1[6].val[0] = vreinterpretq_s16_s32( vzip1q_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ) );
- m1[7].val[0] = vreinterpretq_s16_s32( vzip2q_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ) );
-
- m3[0].val[1] = vmovl_high_s16( m1[0].val[0] );
+ m2[0].val[0] = vzipq_s16( m1[0].val[0], m1[1].val[0] ).val[0];
+ m2[1].val[0] = vzipq_s16( m1[2].val[0], m1[3].val[0] ).val[0];
+ m2[2].val[0] = vzipq_s16( m1[0].val[0], m1[1].val[0] ).val[1];
+ m2[3].val[0] = vzipq_s16( m1[2].val[0], m1[3].val[0] ).val[1];
+ m2[4].val[0] = vzipq_s16( m1[4].val[0], m1[5].val[0] ).val[0];
+ m2[5].val[0] = vzipq_s16( m1[6].val[0], m1[7].val[0] ).val[0];
+ m2[6].val[0] = vzipq_s16( m1[4].val[0], m1[5].val[0] ).val[1];
+ m2[7].val[0] = vzipq_s16( m1[6].val[0], m1[7].val[0] ).val[1];
+
+ m1[0].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ).val[0] );
+ m1[1].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[0].val[0]) , vreinterpretq_s32_s16(m2[1].val[0]) ).val[1] );
+ m1[2].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ).val[0] );
+ m1[3].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[2].val[0]) , vreinterpretq_s32_s16(m2[3].val[0]) ).val[1] );
+ m1[4].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ).val[0] );
+ m1[5].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[4].val[0]) , vreinterpretq_s32_s16(m2[5].val[0]) ).val[1] );
+ m1[6].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ).val[0] );
+ m1[7].val[0] = vreinterpretq_s16_s32( vzipq_s32( vreinterpretq_s32_s16(m2[6].val[0]) , vreinterpretq_s32_s16(m2[7].val[0]) ).val[1] );
+
+ m3[0].val[1] = vmovl_s16( vget_high_s16( m1[0].val[0] ) );
m3[0].val[0] = vmovl_s16( vget_low_s16( m1[0].val[0] ) );
- m3[1].val[1] = vmovl_high_s16( m1[1].val[0] );
+ m3[1].val[1] = vmovl_s16( vget_high_s16( m1[1].val[0] ) );
m3[1].val[0] = vmovl_s16( vget_low_s16( m1[1].val[0] ) );
- m3[2].val[1] = vmovl_high_s16( m1[2].val[0] );
+ m3[2].val[1] = vmovl_s16( vget_high_s16( m1[2].val[0] ) );
m3[2].val[0] = vmovl_s16( vget_low_s16( m1[2].val[0] ) );
- m3[3].val[1] = vmovl_high_s16( m1[3].val[0] );
+ m3[3].val[1] = vmovl_s16( vget_high_s16( m1[3].val[0] ) );
m3[3].val[0] = vmovl_s16( vget_low_s16( m1[3].val[0] ) );
- m3[4].val[1] = vmovl_high_s16( m1[4].val[0] );
+ m3[4].val[1] = vmovl_s16( vget_high_s16( m1[4].val[0] ) );
m3[4].val[0] = vmovl_s16( vget_low_s16( m1[4].val[0] ) );
- m3[5].val[1] = vmovl_high_s16( m1[5].val[0] );
+ m3[5].val[1] = vmovl_s16( vget_high_s16( m1[5].val[0] ) );
m3[5].val[0] = vmovl_s16( vget_low_s16( m1[5].val[0] ) );
- m3[6].val[1] = vmovl_high_s16( m1[6].val[0] );
+ m3[6].val[1] = vmovl_s16( vget_high_s16( m1[6].val[0] ) );
m3[6].val[0] = vmovl_s16( vget_low_s16( m1[6].val[0] ) );
- m3[7].val[1] = vmovl_high_s16( m1[7].val[0] );
+ m3[7].val[1] = vmovl_s16( vget_high_s16( m1[7].val[0] ) );
m3[7].val[0] = vmovl_s16( vget_low_s16( m1[7].val[0] ) );

m4[0].val[0] = m3[0].val[0];
@@ -746,7 +746,7 @@ static uint32_t xCalcHAD8x8_Neon( const Pel *piOrg, const Pel *piCur, const int
m3[4].val[0] = vaddq_s32( m3[4].val[0], m3[6].val[0] );
int32x4_t iSum = vaddq_s32( m3[0].val[0], m3[4].val[0] );

- uint32_t sad = ( uint32_t ) vaddvq_s32(iSum);
+ uint32_t sad = ( uint32_t ) horizontal_add_s32x4(iSum);
uint32_t absDc = vgetq_lane_s32( m4[0].val[0], 0 );
sad -= absDc;
sad += absDc >> 2;
@@ -962,11 +962,11 @@ void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost )
INCY( piCur, iStrideCur );
}

- int32x4_t sum = { vaddlvq_s16( sum0 ), vaddlvq_s16( sum1 ), vaddlvq_s16( sum3 ), vaddlvq_s16( sum4 ) };
+ int32x4_t sum = horizontal_add_long_4d_s16x8( sum0, sum1, sum3, sum4 );

int32x4_t sumTwo;
if( isCalCentrePos )
- sumTwo = vdupq_n_s32( vaddlvq_s16( sum2 ) );
+ sumTwo = vdupq_n_s32( horizontal_add_long_s16x8( sum2 ) );

// vshlq_n_s32 doesn't work because iSubShift is not a const.
sum = vshlq_s32( sum, vdupq_n_s32( iSubShift ) );
@@ -977,9 +977,9 @@ void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost )
if( isCalCentrePos )
sumTwo = vshrq_n_s32( sumTwo, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) );

- vst1q_s32( (int32_t*) &cost[0], vzip1q_s32( sum, vdupq_n_s32(0) ) );
+ vst1q_s32( (int32_t*) &cost[0], vzipq_s32( sum, vdupq_n_s32(0) ).val[0] );
if (isCalCentrePos) cost[2] = (vgetq_lane_s32(sumTwo,0));
- vst1q_s32( (int32_t*) &cost[3], vzip2q_s32( sum, vdupq_n_s32(0) ) );
+ vst1q_s32( (int32_t*) &cost[3], vzipq_s32( sum, vdupq_n_s32(0) ).val[1] );
}

template <ARM_VEXT vext>
@@ -1025,14 +1025,6 @@ void RdCost::_initRdCostARM()
#endif // defined( TARGET_SIMD_X86 )
}

- #else // !REAL_TARGET_AARCH64
-
- template<ARM_VEXT vext>
- void RdCost::_initRdCostARM()
- {}
-
- #endif // REAL_TARGET_AARCH64
-
template void RdCost::_initRdCostARM<SIMDARM>();

#endif // defined( TARGET_SIMD_ARM )
26 changes: 26 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -66,6 +66,15 @@ static inline int horizontal_add_s32x4( const int32x4_t a )
#endif
}

+ static inline int horizontal_add_long_s16x8( const int16x8_t a )
+ {
+ #if REAL_TARGET_AARCH64
+ return vaddlvq_s16( a );
+ #else
+ return horizontal_add_s32x4( vpaddlq_s16( a ) );
+ #endif
+ }
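A tiny self-check like the one below (a hypothetical test snippet, not part of this PR) captures the contract both branches share: the result is the widened sum of all eight lanes, so no intermediate int16 overflow can occur.

// Hypothetical self-check, not part of the PR; assumes neon/sum_neon.h (above) is included.
#include <arm_neon.h>
#include <assert.h>

static void check_horizontal_add_long_s16x8( void )
{
  const int16_t in[8] = { 32767, 32767, -32768, -1, 2, 3, -4, 5 };
  int32_t ref = 0;
  for( int i = 0; i < 8; i++ )
    ref += in[i];                                       // reference sum accumulated in 32 bits
  assert( horizontal_add_long_s16x8( vld1q_s16( in ) ) == ref );
}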

static inline int32x4_t horizontal_add_4d_s32x4( const int32x4_t v0, const int32x4_t v1, const int32x4_t v2,
const int32x4_t v3 )
{
@@ -83,6 +92,23 @@ static inline int32x4_t horizontal_add_4d_s32x4( const int32x4_t v0, const int32
#endif
}

+ static inline int32x4_t horizontal_add_long_4d_s16x8( const int16x8_t v0, const int16x8_t v1, const int16x8_t v2,
+                                                        const int16x8_t v3 )
+ {
+ return horizontal_add_4d_s32x4( vpaddlq_s16( v0 ), vpaddlq_s16( v1 ), vpaddlq_s16( v2 ), vpaddlq_s16( v3 ) );
+ }
+
+ static inline int16x8_t pairwise_add_s16x8( const int16x8_t a, const int16x8_t b )
+ {
+ #if REAL_TARGET_AARCH64
+ return vpaddq_s16( a, b );
+ #else
+ int16x4_t lo = vpadd_s16( vget_low_s16( a ), vget_low_s16( b ) );
+ int16x4_t hi = vpadd_s16( vget_high_s16( a ), vget_high_s16( b ) );
+ return vcombine_s16( lo, hi );
+ #endif
+ }
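A worked example of the AArch32 fallback, using arbitrary lane values rather than anything from the PR, also records how its lane order relates to the AArch64 path: the same eight pairwise sums are produced, but grouped by vector halves instead of by source operand.

// Worked example (arbitrary values, not taken from the PR):
//   a  = { a0, a1, a2, a3, a4, a5, a6, a7 }
//   b  = { b0, b1, b2, b3, b4, b5, b6, b7 }
//   lo = vpadd_s16( vget_low_s16( a ),  vget_low_s16( b ) )  = { a0+a1, a2+a3, b0+b1, b2+b3 }
//   hi = vpadd_s16( vget_high_s16( a ), vget_high_s16( b ) ) = { a4+a5, a6+a7, b4+b5, b6+b7 }
//   vcombine_s16( lo, hi ) = { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 }
// AArch64's vpaddq_s16( a, b ) returns the same sums ordered
//   { a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7 },
// so the two paths agree on the set of pairwise sums but not on lane placement.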

} // namespace vvenc

#endif