From 5cad92aa93fd00657abb51dc81caaadb480e2fd9 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 21 Oct 2024 16:05:57 +0100
Subject: [PATCH] Import PR#444 of vvenc. Stride types adjusted to vvdec needs.

original message:

Add Neon implementations of MCIF simdFilter{4,8}xX_N8 (#444)

Mostly a copy of the existing approach for simdFilter16xX_N8_neon. Also
rename the helper functions used by the existing simdFilter16xX_N8_neon
kernel to avoid name clashes with the two new kernels introduced by this
commit.

Running a video encoding job on a Neoverse V2 machine using the
--preset=fast setting shows a ~1.5% improvement in reported FPS.
---
 .../arm/neon/InterpolationFilter_neon.cpp | 287 +++++++++++++++---
 1 file changed, 237 insertions(+), 50 deletions(-)

diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
index 25a23e0d..22abd608 100644
--- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
+++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -133,63 +133,51 @@ static void simdInterpolateN2_2D_neon( const ClpRng& clpRng, const Pel* src, con
   }
 }
 
-static int16x8_t simdFilter16xX_N8_half( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
 {
-  int16x8_t vsrca00 = vld1q_s16( src + 0 );
-  int16x8_t vsrca01 = vld1q_s16( src + 1 );
-  int16x8_t vsrca10 = vld1q_s16( src + 2 );
-  int16x8_t vsrca11 = vld1q_s16( src + 3 );
-  int16x8_t vsrcb00 = vld1q_s16( src + 4 );
-  int16x8_t vsrcb01 = vld1q_s16( src + 5 );
-  int16x8_t vsrcb10 = vld1q_s16( src + 6 );
-  int16x8_t vsrcb11 = vld1q_s16( src + 7 );
-
-  int32x4_t a0 = vmull_s16( vget_low_s16( vsrca00 ), vget_low_s16( ch ) );
-  int32x4_t a1 = vmull_s16( vget_low_s16( vsrca01 ), vget_low_s16( ch ) );
-  int32x4_t a2 = vmull_s16( vget_low_s16( vsrca10 ), vget_low_s16( ch ) );
-  int32x4_t a3 = vmull_s16( vget_low_s16( vsrca11 ), vget_low_s16( ch ) );
-
-  int32x4_t b0 = vmull_s16( vget_low_s16( vsrcb00 ), vget_low_s16( ch ) );
-  int32x4_t b1 = vmull_s16( vget_low_s16( vsrcb01 ), vget_low_s16( ch ) );
-  int32x4_t b2 = vmull_s16( vget_low_s16( vsrcb10 ), vget_low_s16( ch ) );
-  int32x4_t b3 = vmull_s16( vget_low_s16( vsrcb11 ), vget_low_s16( ch ) );
-
-  a0 = vmlal_s16( a0, vget_high_s16( vsrca00 ), vget_high_s16( ch ) );
-  a1 = vmlal_s16( a1, vget_high_s16( vsrca01 ), vget_high_s16( ch ) );
-  a2 = vmlal_s16( a2, vget_high_s16( vsrca10 ), vget_high_s16( ch ) );
-  a3 = vmlal_s16( a3, vget_high_s16( vsrca11 ), vget_high_s16( ch ) );
-
-  b0 = vmlal_s16( b0, vget_high_s16( vsrcb00 ), vget_high_s16( ch ) );
-  b1 = vmlal_s16( b1, vget_high_s16( vsrcb01 ), vget_high_s16( ch ) );
-  b2 = vmlal_s16( b2, vget_high_s16( vsrcb10 ), vget_high_s16( ch ) );
-  b3 = vmlal_s16( b3, vget_high_s16( vsrcb11 ), vget_high_s16( ch ) );
+  int16x8_t vsrca0 = vld1q_s16( src + 0 );
+  int16x8_t vsrca1 = vld1q_s16( src + 1 );
+  int16x8_t vsrca2 = vld1q_s16( src + 2 );
+  int16x8_t vsrca3 = vld1q_s16( src + 3 );
 
-  int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
-  int32x4_t vsumb = vpaddq_s32( vpaddq_s32( b0, b1 ), vpaddq_s32( b2, b3 ) );
+  int32x4_t a0 = vmull_s16( vget_low_s16( vsrca0 ), vget_low_s16( ch ) );
+  int32x4_t a1 = vmull_s16( vget_low_s16( vsrca1 ), vget_low_s16( ch ) );
+  int32x4_t a2 = vmull_s16( vget_low_s16( vsrca2 ), vget_low_s16( ch ) );
+  int32x4_t a3 = vmull_s16( vget_low_s16( vsrca3 ), vget_low_s16( ch ) );
 
-  vsuma = vaddq_s32( vsuma, voffset1 );
-  vsumb = vaddq_s32( vsumb, voffset1 );
+  a0 = vmlal_s16( a0, vget_high_s16( vsrca0 ), vget_high_s16( ch ) );
+  a1 = vmlal_s16( a1, vget_high_s16( vsrca1 ), vget_high_s16( ch ) );
+  a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
+  a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );
 
-  vsuma = vshlq_s32( vsuma, invshift1st );
-  vsumb = vshlq_s32( vsumb, invshift1st );
+  int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
+  vsuma = vaddq_s32( vsuma, voffset1 );
+  vsuma = vshlq_s32( vsuma, invshift1st );
+  return vqmovn_s32( vsuma );
+}
 
-  return vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
+static int16x8_t filter8xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+{
+  int16x4_t lo = filter4xX_N8_neon( src + 0, ch, voffset1, invshift1st );
+  int16x4_t hi = filter4xX_N8_neon( src + 4, ch, voffset1, invshift1st );
+  return vcombine_s16( lo, hi );
 }
 
-static int16x8x2_t simdFilter16xX_N8_step( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+static int16x8x2_t filter16xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
 {
-  int16x8_t a = simdFilter16xX_N8_half( src + 0, ch, voffset1, invshift1st );
-  int16x8_t b = simdFilter16xX_N8_half( src + 8, ch, voffset1, invshift1st );
+  int16x8_t a = filter8xX_N8_neon( src + 0, ch, voffset1, invshift1st );
+  int16x8_t b = filter8xX_N8_neon( src + 8, ch, voffset1, invshift1st );
   return ( int16x8x2_t ){ a, b };
 }
 
 template<bool isLast>
-static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV )
+static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
 {
   OFFSET( src, srcStride, -3, -3 );
 
-  // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
-  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
   const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
   const int shift1st = IF_FILTER_PREC - headRoom;
   const int shift2nd = IF_FILTER_PREC + headRoom;
@@ -204,7 +192,93 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
   {
     offset2nd = 0;
   }
+  const int32x4_t voffset1 = vdupq_n_s32( offset1st );
+
+  const int16x4_t vibdimin = vdup_n_s16( clpRng.min() );
+  const int16x4_t vibdimax = vdup_n_s16( clpRng.max() );
+
+  int16x8_t ch = vld1q_s16( coeffH );
+  int16x8_t cv = vld1q_s16( coeffV );
+
+  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
+  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
+
+  int16x4_t vsrcv0 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv1 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv2 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv3 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv4 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv5 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv6 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+
+  do
+  {
+    int16x4_t vsrcv7 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+    src += srcStride;
+
+    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );
+
+    int16x4_t vsum01;
+    if( isLast ) // clip
+    {
+      vsum01 = vqmovn_s32( vshlq_s32( vsum0, invshift2nd ) );
+      vsum01 = vmin_s16( vibdimax, vmax_s16( vibdimin, vsum01 ) );
+    }
+    else
+    {
+      vsum01 = vqshrn_n_s32( vsum0, IF_FILTER_PREC );
+    }
+
+    vsrcv0 = vsrcv1;
+    vsrcv1 = vsrcv2;
+    vsrcv2 = vsrcv3;
+    vsrcv3 = vsrcv4;
+    vsrcv4 = vsrcv5;
+    vsrcv5 = vsrcv6;
+    vsrcv6 = vsrcv7;
+
+    vst1_s16( dst, vsum01 );
+    dst += dstStride;
+  } while( --height != 0 );
+}
+
+template<bool isLast>
+static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
+{
+  OFFSET( src, srcStride, -3, -3 );
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
+  const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  const int shift1st = IF_FILTER_PREC - headRoom;
+  const int shift2nd = IF_FILTER_PREC + headRoom;
+
+  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
+  int offset2nd;
+  if( isLast )
+  {
+    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
+  }
+  else
+  {
+    offset2nd = 0;
+  }
   const int32x4_t voffset1 = vdupq_n_s32( offset1st );
 
   const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
@@ -216,24 +290,131 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
   int32x4_t invshift1st = vdupq_n_s32( -shift1st );
   int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
 
-  int16x8x2_t vsrcv0 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv0 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv1 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv1 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv2 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv2 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv3 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv3 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv4 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv4 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv5 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv5 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv6 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv6 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
 
   do
   {
-    int16x8x2_t vsrcv7 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+    int16x8_t vsrcv7 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
+    src += srcStride;
+
+    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
+    int32x4_t vsum1 = vdupq_n_s32( offset2nd );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );
+
+    int16x8_t vsum01;
+    if( isLast ) // clip
+    {
+      vsum0 = vshlq_s32( vsum0, invshift2nd );
+      vsum1 = vshlq_s32( vsum1, invshift2nd );
+
+      vsum01 = vcombine_s16( vqmovn_s32( vsum0 ), vqmovn_s32( vsum1 ) );
+      vsum01 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum01 ) );
+    }
+    else
+    {
+      vsum01 = vcombine_s16( vqshrn_n_s32( vsum0, IF_FILTER_PREC ), vqshrn_n_s32( vsum1, IF_FILTER_PREC ) );
+    }
+
+    vsrcv0 = vsrcv1;
+    vsrcv1 = vsrcv2;
+    vsrcv2 = vsrcv3;
+    vsrcv3 = vsrcv4;
+    vsrcv4 = vsrcv5;
+    vsrcv5 = vsrcv6;
+    vsrcv6 = vsrcv7;
+
+    vst1q_s16( dst, vsum01 );
+    dst += dstStride;
+  } while( --height != 0 );
+}
+
+template<bool isLast>
+static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                    int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
+{
+  OFFSET( src, srcStride, -3, -3 );
+
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
+  const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  const int shift1st = IF_FILTER_PREC - headRoom;
+  const int shift2nd = IF_FILTER_PREC + headRoom;
+
+  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
+  int offset2nd;
+  if( isLast )
+  {
+    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
+  }
+  else
+  {
+    offset2nd = 0;
+  }
+  const int32x4_t voffset1 = vdupq_n_s32( offset1st );
+
+  const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+
+  int16x8_t ch = vld1q_s16( coeffH );
+  int16x8_t cv = vld1q_s16( coeffV );
+
+  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
+  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
+
+  int16x8x2_t vsrcv0 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv1 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv2 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv3 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv4 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv5 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv6 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+
+  do
+  {
+    int16x8x2_t vsrcv7 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
     src += srcStride;
 
     int32x4_t vsum0 = vdupq_n_s32( offset2nd );
@@ -318,6 +499,12 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
 template<>
 void InterpolationFilter::_initInterpolationFilterARM<NEON>()
 {
+  m_filter4x4[ 0 ][ 0 ] = simdFilter4xX_N8_neon<false>;
+  m_filter4x4[ 0 ][ 1 ] = simdFilter4xX_N8_neon<true>;
+
+  m_filter8x8[ 0 ][ 0 ] = simdFilter8xX_N8_neon<false>;
+  m_filter8x8[ 0 ][ 1 ] = simdFilter8xX_N8_neon<true>;
+
   m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8_neon<false>;
   m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon<true>;
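--
Note (not part of the patch): the three kernels above share one separable structure.
An 8-tap horizontal pass produces a narrowed row of intermediate values per source
row; the 8-tap vertical pass then combines a rolling window of eight such rows
(vsrcv0..vsrcv7), sliding the window down one row per output row so only one new
horizontal pass is computed per iteration. The scalar model below computes a single
output sample the same way. It is a sketch only: the helper name filterOnePel and
its clipMin/clipMax parameters are inventions of this note, and the IF_* values are
assumed from the usual VVC definitions (IF_INTERNAL_OFFS = 1 << ( IF_INTERNAL_PREC - 1 )).

#include <algorithm>
#include <cstddef>
#include <cstdint>

constexpr int IF_FILTER_PREC   = 6;
constexpr int IF_INTERNAL_PREC = 14;
constexpr int IF_INTERNAL_OFFS = 1 << ( IF_INTERNAL_PREC - 1 );

// One output sample at (0, 0): 8-tap horizontal filter over rows -3..+4,
// then an 8-tap vertical filter over the eight intermediate values.
static int16_t filterOnePel( const int16_t* src, std::ptrdiff_t srcStride,
                             const int16_t* cH, const int16_t* cV,
                             int bd, bool isLast, int16_t clipMin, int16_t clipMax )
{
  const int headRoom  = std::max( 2, IF_INTERNAL_PREC - bd );
  const int shift1st  = IF_FILTER_PREC - headRoom;
  const int shift2nd  = IF_FILTER_PREC + headRoom;
  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );

  int32_t mid[ 8 ]; // one horizontally filtered value per input row
  for( int r = 0; r < 8; r++ )
  {
    int32_t sum = offset1st;
    for( int t = 0; t < 8; t++ )
    {
      sum += src[ ( r - 3 ) * srcStride + ( t - 3 ) ] * cH[ t ];
    }
    mid[ r ] = sum >> shift1st; // the kernels additionally saturate to int16 here (vqmovn)
  }

  int32_t sum = 0;
  for( int r = 0; r < 8; r++ )
  {
    sum += mid[ r ] * cV[ r ];
  }

  if( !isLast )
  {
    // Intermediate output stays at IF_INTERNAL_PREC precision (vqshrn_n in the kernels).
    return int16_t( sum >> IF_FILTER_PREC );
  }
  const int32_t offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
  const int32_t out       = ( sum + offset2nd ) >> shift2nd;
  return int16_t( std::min<int32_t>( clipMax, std::max<int32_t>( clipMin, out ) ) );
}

// E.g. a 10-bit final pass would use bd = 10, clipMin = 0, clipMax = 1023.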