Skip to content

Commit

Permalink
Import of vvenc PR #444. Stride types adjusted to vvdec's needs.
Browse files Browse the repository at this point in the history
original message:

Add Neon implementations of MCIF simdFilter{4,8}xX_N8 (#444)

Mostly a copy of the existing approach for simdFilter16xX_N8_neon.

Also rename the helper functions used by the existing
simdFilter16xX_N8_neon kernel to avoid name clashes with the new two
kernels introduced by this commit.

Running a video encoding job on a Neoverse V2 machine using the
--preset=fast setting shows a ~1.5% improvement in reported FPS.
  • Loading branch information
georges-arm authored and Proudsalsa committed Oct 23, 2024
1 parent a21a274 commit 5cad92a
Showing 1 changed file with 237 additions and 50 deletions.
287 changes: 237 additions & 50 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,63 +133,51 @@ static void simdInterpolateN2_2D_neon( const ClpRng& clpRng, const Pel* src, con
}
}

// Horizontal 8-tap FIR producing four filtered samples of one row.
// Four overlapping 8-sample windows (starting at src+0..src+3) are each
// multiplied by the eight taps in `ch`; a pairwise-add tree reduces every
// product vector to a single 32-bit lane. The four sums are offset by
// voffset1, shifted right by shift1st (invshift1st holds -shift1st, applied
// via vshlq_s32) and saturating-narrowed to int16.
static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
  // One overlapping 8-sample window per output pixel.
  int16x8_t vsrca0 = vld1q_s16( src + 0 );
  int16x8_t vsrca1 = vld1q_s16( src + 1 );
  int16x8_t vsrca2 = vld1q_s16( src + 2 );
  int16x8_t vsrca3 = vld1q_s16( src + 3 );

  // Widening multiply: low half of each window by the low four taps.
  int32x4_t a0 = vmull_s16( vget_low_s16( vsrca0 ), vget_low_s16( ch ) );
  int32x4_t a1 = vmull_s16( vget_low_s16( vsrca1 ), vget_low_s16( ch ) );
  int32x4_t a2 = vmull_s16( vget_low_s16( vsrca2 ), vget_low_s16( ch ) );
  int32x4_t a3 = vmull_s16( vget_low_s16( vsrca3 ), vget_low_s16( ch ) );

  // Accumulate the high half of each window against the high four taps.
  a0 = vmlal_s16( a0, vget_high_s16( vsrca0 ), vget_high_s16( ch ) );
  a1 = vmlal_s16( a1, vget_high_s16( vsrca1 ), vget_high_s16( ch ) );
  a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
  a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );

  // Pairwise-add tree: lane i of vsuma becomes the full horizontal sum of a{i}.
  int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
  vsuma = vaddq_s32( vsuma, voffset1 );
  vsuma = vshlq_s32( vsuma, invshift1st ); // negative shift count => arithmetic right shift
  return vqmovn_s32( vsuma );              // saturating narrow to 16 bit
}
// Horizontal 8-tap FIR producing eight filtered samples of one row, built
// from two independent 4-wide passes over the lower and upper half of the row.
static int16x8_t filter8xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
  return vcombine_s16( filter4xX_N8_neon( src + 0, ch, voffset1, invshift1st ),
                       filter4xX_N8_neon( src + 4, ch, voffset1, invshift1st ) );
}

// Horizontal 8-tap FIR producing sixteen filtered samples of one row,
// computed as two independent 8-wide passes.
static int16x8x2_t filter16xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
  int16x8_t a = filter8xX_N8_neon( src + 0, ch, voffset1, invshift1st );
  int16x8_t b = filter8xX_N8_neon( src + 8, ch, voffset1, invshift1st );
  // Standard C++ aggregate initialization; the former `( int16x8x2_t ){ a, b }`
  // compound literal is a C construct that C++ compilers accept only as a
  // GNU extension.
  return { { a, b } };
}

// Separable 2-D 8-tap interpolation for 4-wide blocks: a horizontal pass
// (filter4xX_N8_neon) feeds an 8-row vertical pipeline. When isLast, the
// vertical result is shifted by shift2nd and clipped to [clpRng.min(),
// clpRng.max()]; otherwise it is kept at intermediate precision via a
// saturating shift by IF_FILTER_PREC.
template<bool isLast>
static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
  OFFSET( src, srcStride, -3, -3 ); // rewind to the first tap of the 8-tap window

  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
  const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
  const int shift1st = IF_FILTER_PREC - headRoom;
  const int shift2nd = IF_FILTER_PREC + headRoom;

  // NOTE(review): offset1st/offset2nd setup restored from diff context elided
  // in this view; it mirrors the sibling 8xX/16xX kernels verbatim — confirm
  // against the full file.
  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
  int offset2nd;
  if( isLast )
  {
    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
  }
  else
  {
    offset2nd = 0;
  }
  const int32x4_t voffset1 = vdupq_n_s32( offset1st );

  // 4-wide output => 64-bit vectors suffice for the clipping bounds.
  const int16x4_t vibdimin = vdup_n_s16( clpRng.min() );
  const int16x4_t vibdimax = vdup_n_s16( clpRng.max() );

  int16x8_t ch = vld1q_s16( coeffH );
  int16x8_t cv = vld1q_s16( coeffV );

  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

  // Prime the vertical pipeline with the first seven horizontally filtered
  // rows; the eighth row of each window is produced inside the loop.
  int16x4_t vsrcv0 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv1 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv2 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv3 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv4 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv5 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x4_t vsrcv6 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;

  do
  {
    int16x4_t vsrcv7 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
    src += srcStride;

    // Vertical 8-tap pass: accumulate the eight pipelined rows against the
    // corresponding lanes of coeffV.
    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
    vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );

    int16x4_t vsum01;
    if( isLast ) // clip to the output bit depth
    {
      vsum01 = vqmovn_s32( vshlq_s32( vsum0, invshift2nd ) );
      vsum01 = vmin_s16( vibdimax, vmax_s16( vibdimin, vsum01 ) );
    }
    else
    {
      vsum01 = vqshrn_n_s32( vsum0, IF_FILTER_PREC );
    }

    // Slide the eight-row window down by one row.
    vsrcv0 = vsrcv1;
    vsrcv1 = vsrcv2;
    vsrcv2 = vsrcv3;
    vsrcv3 = vsrcv4;
    vsrcv4 = vsrcv5;
    vsrcv5 = vsrcv6;
    vsrcv6 = vsrcv7;

    vst1_s16( dst, vsum01 );
    dst += dstStride;
  } while( --height != 0 );
}

// Separable 2-D 8-tap interpolation for 8-wide blocks: a horizontal pass
// (filter8xX_N8_neon) feeds an 8-row vertical pipeline, processed as low
// (vsum0) and high (vsum1) 4-lane halves. When isLast, the result is shifted
// by shift2nd and clipped to [clpRng.min(), clpRng.max()]; otherwise it is
// kept at intermediate precision via a saturating shift by IF_FILTER_PREC.
template<bool isLast>
static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
  OFFSET( src, srcStride, -3, -3 ); // rewind to the first tap of the 8-tap window

  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
  const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
  const int shift1st = IF_FILTER_PREC - headRoom;
  const int shift2nd = IF_FILTER_PREC + headRoom;

  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
  int offset2nd;
  if( isLast )
  {
    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
  }
  else
  {
    offset2nd = 0;
  }
  const int32x4_t voffset1 = vdupq_n_s32( offset1st );

  const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
  // NOTE(review): vibdimax/ch/cv lines restored from diff context elided in
  // this view; they mirror the 16xX kernel verbatim — confirm against the
  // full file.
  const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );

  int16x8_t ch = vld1q_s16( coeffH );
  int16x8_t cv = vld1q_s16( coeffV );

  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

  // Prime the vertical pipeline with the first seven horizontally filtered
  // rows; the eighth row of each window is produced inside the loop.
  int16x8_t vsrcv0 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv1 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv2 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv3 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv4 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv5 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;
  int16x8_t vsrcv6 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
  src += srcStride;

  do
  {
    int16x8_t vsrcv7 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
    src += srcStride;

    // Vertical 8-tap pass over the low and high 4-lane halves of the row.
    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
    int32x4_t vsum1 = vdupq_n_s32( offset2nd );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );

    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );

    int16x8_t vsum01;
    if( isLast ) // clip to the output bit depth
    {
      vsum0 = vshlq_s32( vsum0, invshift2nd );
      vsum1 = vshlq_s32( vsum1, invshift2nd );

      vsum01 = vcombine_s16( vqmovn_s32( vsum0 ), vqmovn_s32( vsum1 ) );
      vsum01 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum01 ) );
    }
    else
    {
      vsum01 = vcombine_s16( vqshrn_n_s32( vsum0, IF_FILTER_PREC ), vqshrn_n_s32( vsum1, IF_FILTER_PREC ) );
    }

    // Slide the eight-row window down by one row.
    vsrcv0 = vsrcv1;
    vsrcv1 = vsrcv2;
    vsrcv2 = vsrcv3;
    vsrcv3 = vsrcv4;
    vsrcv4 = vsrcv5;
    vsrcv5 = vsrcv6;
    vsrcv6 = vsrcv7;

    vst1q_s16( dst, vsum01 );
    dst += dstStride;
  } while( --height != 0 );
}

template<bool isLast>
static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
OFFSET( src, srcStride, -3, -3 );

// With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
// negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
const int shift1st = IF_FILTER_PREC - headRoom;
const int shift2nd = IF_FILTER_PREC + headRoom;

const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
int offset2nd;
if( isLast )
{
offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
}
else
{
offset2nd = 0;
}
const int32x4_t voffset1 = vdupq_n_s32( offset1st );

const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );

int16x8_t ch = vld1q_s16( coeffH );
int16x8_t cv = vld1q_s16( coeffV );

int32x4_t invshift1st = vdupq_n_s32( -shift1st );
int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

int16x8x2_t vsrcv0 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv1 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv2 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv3 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv4 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv5 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv6 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

do
{
int16x8x2_t vsrcv7 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
Expand Down Expand Up @@ -318,6 +499,12 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
template<>
void InterpolationFilter::_initInterpolationFilterARM<NEON>()
{
m_filter4x4[ 0 ][ 0 ] = simdFilter4xX_N8_neon<false>;
m_filter4x4[ 0 ][ 1 ] = simdFilter4xX_N8_neon<true>;

m_filter8x8[ 0 ][ 0 ] = simdFilter8xX_N8_neon<false>;
m_filter8x8[ 0 ][ 1 ] = simdFilter8xX_N8_neon<true>;

m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8_neon<false>;
m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon<true>;

Expand Down

0 comments on commit 5cad92a

Please sign in to comment.