From d3c1e1a2fb4456e7b58768351a933fd09f3484b4 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 5 Sep 2024 13:42:51 +0000
Subject: [PATCH] Add new AArch64 Neon implementations for TCoeffOps kernels

Add new Neon implementations for the TCoeffOps kernels fastInvCore and
fastFwdCore_2D. The fastFwdCore_1D kernel does not appear in a perf
profile for --preset=fast, so it is ignored for this commit.

Running a video encoding job on a Neoverse V2 machine using the
--preset=fast setting shows a ~0.8% improvement in reported FPS.
---
 source/Lib/CommonLib/TrQuant_EMT.h           |   8 +-
 source/Lib/CommonLib/arm/InitARM.cpp         |  11 +
 source/Lib/CommonLib/arm/neon/Trafo_neon.cpp | 282 +++++++++++++++++++
 source/Lib/vvenc/vvencimpl.cpp               |  25 +-
 4 files changed, 314 insertions(+), 12 deletions(-)
 create mode 100644 source/Lib/CommonLib/arm/neon/Trafo_neon.cpp

diff --git a/source/Lib/CommonLib/TrQuant_EMT.h b/source/Lib/CommonLib/TrQuant_EMT.h
index 797734d0c..177e6266b 100644
--- a/source/Lib/CommonLib/TrQuant_EMT.h
+++ b/source/Lib/CommonLib/TrQuant_EMT.h
@@ -53,6 +53,7 @@ POSSIBILITY OF SUCH DAMAGE.
 namespace vvenc
 {
 using namespace x86_simd;
+using namespace arm_simd;
 
 #if ENABLE_SIMD_TRAFO
 struct TCoeffOps
@@ -64,7 +65,12 @@ struct TCoeffOps
   template<X86_VEXT vext>
   void _initTCoeffOpsX86();
 #endif
-
+#if defined( TARGET_SIMD_ARM )
+  void initTCoeffOpsARM();
+  template<ARM_VEXT vext>
+  void _initTCoeffOpsARM();
+#endif
+
   void( *cpyResi8 )  ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height );
   void( *cpyResi4 )  ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height );
   void( *cpyCoeff8 ) ( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height );
diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp
index c51e03db3..362d9ff23 100644
--- a/source/Lib/CommonLib/arm/InitARM.cpp
+++ b/source/Lib/CommonLib/arm/InitARM.cpp
@@ -110,6 +110,17 @@ void MCTF::initMCTF_ARM()
 }
 #endif // ENABLE_SIMD_OPT_MCTF
 
+#if ENABLE_SIMD_TRAFO
+void TCoeffOps::initTCoeffOpsARM()
+{
+  auto vext = read_arm_extension_flags();
+  if( vext >= NEON )
+  {
+    _initTCoeffOpsARM<NEON>();
+  }
+}
+#endif
+
 #endif // TARGET_SIMD_ARM
 
 } // namespace
diff --git a/source/Lib/CommonLib/arm/neon/Trafo_neon.cpp b/source/Lib/CommonLib/arm/neon/Trafo_neon.cpp
new file mode 100644
index 000000000..8a886d154
--- /dev/null
+++ b/source/Lib/CommonLib/arm/neon/Trafo_neon.cpp
@@ -0,0 +1,282 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+     * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+     * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+     * Neither the name of the copyright holder nor the names of its
+     contributors may be used to endorse or promote products derived from this
+     software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/**
+ * \file  Trafo_neon.cpp
+ * \brief Neon implementation of TCoeffOps for AArch64.
+ */
+
+// ====================================================================================================================
+// Includes
+// ====================================================================================================================
+#include "CommonDefARM.h"
+#include "CommonLib/CommonDef.h"
+#include "TrQuant.h"
+#include "TrQuant_EMT.h"
+#include "sum_neon.h"
+
+//! \ingroup CommonLib
+//! \{
+
+#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_TRAFO
+
+namespace vvenc
+{
+
+static inline int64_t shift_and_round( int64_t x, int shift )
+{
+  return ( x + ( 1 << ( shift - 1 ) ) ) >> shift;
+}
+
+// fastInvCore_neon computes one pass of the inverse transform: each of the
+// reducedLines lines of src is multiplied by the first `rows` rows of the
+// transform matrix `it`, accumulating into dst.
+template<unsigned trSize>
+static void fastInvCore_neon( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines,
+                              unsigned reducedLines, unsigned rows )
+{
+  static_assert( trSize % 4 == 0, "trSize should be a multiple of four" );
+  CHECK( rows % 4 != 0, "rows should be a multiple of four" );
+  CHECK( rows == 0, "rows should be non-zero" );
+
+  // Main loop: process four lines and four output columns per iteration.
+  unsigned i = 0;
+  for( ; i != ( reducedLines & ~3U ); i += 4 )
+  {
+    unsigned j = 0;
+    do
+    {
+      const TCoeff* srci      = src + i;
+      const TMatrixCoeff* itj = it + j;
+      TCoeff* dstij           = dst + i * trSize + j;
+
+      int32x4_t d0 = vld1q_s32( dstij + 0 * trSize );
+      int32x4_t d1 = vld1q_s32( dstij + 1 * trSize );
+      int32x4_t d2 = vld1q_s32( dstij + 2 * trSize );
+      int32x4_t d3 = vld1q_s32( dstij + 3 * trSize );
+
+      unsigned k = rows;
+      do
+      {
+        // Source values fit into 16 bits here, so narrow them before the
+        // widening multiply-accumulate against four matrix coefficients.
+        int16x4_t s = vmovn_s32( vld1q_s32( srci ) );
+        int16x4_t c = vld1_s16( itj );
+
+        d0 = vmlal_lane_s16( d0, c, s, 0 );
+        d1 = vmlal_lane_s16( d1, c, s, 1 );
+        d2 = vmlal_lane_s16( d2, c, s, 2 );
+        d3 = vmlal_lane_s16( d3, c, s, 3 );
+
+        srci += lines;
+        itj += trSize;
+      } while( --k != 0 );
+
+      vst1q_s32( dstij + 0 * trSize, d0 );
+      vst1q_s32( dstij + 1 * trSize, d1 );
+      vst1q_s32( dstij + 2 * trSize, d2 );
+      vst1q_s32( dstij + 3 * trSize, d3 );
+
+      j += 4;
+    } while( j != trSize );
+  }
+
+  // Scalar tail for the remaining reducedLines % 4 lines.
+  for( ; i != reducedLines; ++i )
+  {
+    unsigned j = 0;
+    do
+    {
+      const TCoeff* srci      = src + i;
+      const TMatrixCoeff* itj = it + j;
+      TCoeff* dstij           = dst + i * trSize + j;
+
+      int32_t d0 = *dstij;
+      unsigned k = rows;
+      do
+      {
+        d0 += *srci * *itj;
+
+        srci += lines;
+        itj += trSize;
+      } while( --k != 0 );
+
+      *dstij = d0;
+    } while( ++j != trSize );
+  }
+}
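+
+/* fastFwdCore_neon computes one pass of the forward transform as a matrix
+ * product. For each of the reducedLine input lines i and each of the first
+ * cutoff output rows j:
+ *
+ *   dst[ j * line + i ] = ( sum_k( src[ i * trSize + k ] * tc[ j * trSize + k ] )
+ *                           + ( 1 << ( shift - 1 ) ) ) >> shift
+ *
+ * The main loop below produces a 4x4 tile of outputs per iteration so that
+ * every vector load of src and tc is shared by four multiply-accumulates.
+ */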
+template<unsigned trSize>
+static void fastFwdCore_neon( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line,
+                              unsigned reducedLine, unsigned cutoff, int shift )
+{
+  static_assert( trSize % 4 == 0, "trSize should be a multiple of four" );
+  CHECK( cutoff % 4 != 0, "cutoff should be a multiple of four" );
+  CHECK( cutoff == 0, "cutoff should be non-zero" );
+  CHECK( shift == 0, "shift must be at least one" );
+
+  unsigned i = 0;
+  for( ; i < ( reducedLine & ~3U ); i += 4 )
+  {
+    unsigned j = 0;
+    do
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci      = src + i * trSize;
+
+      // sumXY accumulates the dot product of input line i+X with row j+Y.
+      int32x4_t sum00 = vdupq_n_s32( 0 );
+      int32x4_t sum01 = vdupq_n_s32( 0 );
+      int32x4_t sum02 = vdupq_n_s32( 0 );
+      int32x4_t sum03 = vdupq_n_s32( 0 );
+      int32x4_t sum10 = vdupq_n_s32( 0 );
+      int32x4_t sum11 = vdupq_n_s32( 0 );
+      int32x4_t sum12 = vdupq_n_s32( 0 );
+      int32x4_t sum13 = vdupq_n_s32( 0 );
+      int32x4_t sum20 = vdupq_n_s32( 0 );
+      int32x4_t sum21 = vdupq_n_s32( 0 );
+      int32x4_t sum22 = vdupq_n_s32( 0 );
+      int32x4_t sum23 = vdupq_n_s32( 0 );
+      int32x4_t sum30 = vdupq_n_s32( 0 );
+      int32x4_t sum31 = vdupq_n_s32( 0 );
+      int32x4_t sum32 = vdupq_n_s32( 0 );
+      int32x4_t sum33 = vdupq_n_s32( 0 );
+      for( unsigned k = 0; k < trSize; k += 4 )
+      {
+        int16x4_t s0 = vmovn_s32( vld1q_s32( srci + 0 * trSize ) );
+        int16x4_t s1 = vmovn_s32( vld1q_s32( srci + 1 * trSize ) );
+        int16x4_t s2 = vmovn_s32( vld1q_s32( srci + 2 * trSize ) );
+        int16x4_t s3 = vmovn_s32( vld1q_s32( srci + 3 * trSize ) );
+        int16x4_t c0 = vld1_s16( tcj + 0 * trSize );
+        int16x4_t c1 = vld1_s16( tcj + 1 * trSize );
+        int16x4_t c2 = vld1_s16( tcj + 2 * trSize );
+        int16x4_t c3 = vld1_s16( tcj + 3 * trSize );
+
+        sum00 = vmlal_s16( sum00, s0, c0 );
+        sum01 = vmlal_s16( sum01, s0, c1 );
+        sum02 = vmlal_s16( sum02, s0, c2 );
+        sum03 = vmlal_s16( sum03, s0, c3 );
+        sum10 = vmlal_s16( sum10, s1, c0 );
+        sum11 = vmlal_s16( sum11, s1, c1 );
+        sum12 = vmlal_s16( sum12, s1, c2 );
+        sum13 = vmlal_s16( sum13, s1, c3 );
+        sum20 = vmlal_s16( sum20, s2, c0 );
+        sum21 = vmlal_s16( sum21, s2, c1 );
+        sum22 = vmlal_s16( sum22, s2, c2 );
+        sum23 = vmlal_s16( sum23, s2, c3 );
+        sum30 = vmlal_s16( sum30, s3, c0 );
+        sum31 = vmlal_s16( sum31, s3, c1 );
+        sum32 = vmlal_s16( sum32, s3, c2 );
+        sum33 = vmlal_s16( sum33, s3, c3 );
+
+        srci += 4;
+        tcj += 4;
+      }
+      // Reduce each group of four accumulators to a single vector of sums.
+      int32x4_t sum0 = horizontal_add_4d_s32x4( sum00, sum10, sum20, sum30 );
+      int32x4_t sum1 = horizontal_add_4d_s32x4( sum01, sum11, sum21, sum31 );
+      int32x4_t sum2 = horizontal_add_4d_s32x4( sum02, sum12, sum22, sum32 );
+      int32x4_t sum3 = horizontal_add_4d_s32x4( sum03, sum13, sum23, sum33 );
+
+      // vrshlq_s32 with a negative shift amount is a rounding shift right.
+      sum0 = vrshlq_s32( sum0, vdupq_n_s32( -shift ) );
+      sum1 = vrshlq_s32( sum1, vdupq_n_s32( -shift ) );
+      sum2 = vrshlq_s32( sum2, vdupq_n_s32( -shift ) );
+      sum3 = vrshlq_s32( sum3, vdupq_n_s32( -shift ) );
+
+      TCoeff* dstij = dst + j * line + i;
+      vst1q_s32( dstij + 0 * line, sum0 );
+      vst1q_s32( dstij + 1 * line, sum1 );
+      vst1q_s32( dstij + 2 * line, sum2 );
+      vst1q_s32( dstij + 3 * line, sum3 );
+
+      j += 4;
+    } while( j != cutoff );
+  }
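+  /* Tail: process the remaining reducedLine % 4 input lines one at a time,
+   * still computing four output rows per iteration since cutoff is a
+   * multiple of four. */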
+  for( ; i < reducedLine; ++i )
+  {
+    unsigned j = 0;
+    do
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci      = src + i * trSize;
+
+      int32x4_t sum00 = vdupq_n_s32( 0 );
+      int32x4_t sum01 = vdupq_n_s32( 0 );
+      int32x4_t sum02 = vdupq_n_s32( 0 );
+      int32x4_t sum03 = vdupq_n_s32( 0 );
+      for( unsigned k = 0; k < trSize; k += 4 )
+      {
+        int16x4_t s0 = vmovn_s32( vld1q_s32( srci ) );
+        int16x4_t c0 = vld1_s16( tcj + 0 * trSize );
+        int16x4_t c1 = vld1_s16( tcj + 1 * trSize );
+        int16x4_t c2 = vld1_s16( tcj + 2 * trSize );
+        int16x4_t c3 = vld1_s16( tcj + 3 * trSize );
+
+        sum00 = vmlal_s16( sum00, s0, c0 );
+        sum01 = vmlal_s16( sum01, s0, c1 );
+        sum02 = vmlal_s16( sum02, s0, c2 );
+        sum03 = vmlal_s16( sum03, s0, c3 );
+
+        srci += 4;
+        tcj += 4;
+      }
+      TCoeff* dstij = dst + j * line + i;
+
+      dstij[ 0 * line ] = shift_and_round( horizontal_add_s32x4( sum00 ), shift );
+      dstij[ 1 * line ] = shift_and_round( horizontal_add_s32x4( sum01 ), shift );
+      dstij[ 2 * line ] = shift_and_round( horizontal_add_s32x4( sum02 ), shift );
+      dstij[ 3 * line ] = shift_and_round( horizontal_add_s32x4( sum03 ), shift );
+
+      j += 4;
+    } while( j != cutoff );
+  }
+}
+
+template<>
+void TCoeffOps::_initTCoeffOpsARM<NEON>()
+{
+  fastInvCore[ 0 ]    = fastInvCore_neon<4>;
+  fastInvCore[ 1 ]    = fastInvCore_neon<8>;
+  fastInvCore[ 2 ]    = fastInvCore_neon<16>;
+  fastInvCore[ 3 ]    = fastInvCore_neon<32>;
+  fastInvCore[ 4 ]    = fastInvCore_neon<64>;
+  fastFwdCore_2D[ 0 ] = fastFwdCore_neon<4>;
+  fastFwdCore_2D[ 1 ] = fastFwdCore_neon<8>;
+  fastFwdCore_2D[ 2 ] = fastFwdCore_neon<16>;
+  fastFwdCore_2D[ 3 ] = fastFwdCore_neon<32>;
+  fastFwdCore_2D[ 4 ] = fastFwdCore_neon<64>;
+}
+
+} // namespace vvenc
+
+#endif
+//! \}
diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp
index ca09b6dff..7a75f3b21 100644
--- a/source/Lib/vvenc/vvencimpl.cpp
+++ b/source/Lib/vvenc/vvencimpl.cpp
@@ -54,25 +54,25 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "vvenc/version.h"
 
 #include "CommonLib/CommonDef.h"
-#include "CommonLib/Picture.h"
 #include "CommonLib/Nal.h"
+#include "CommonLib/Picture.h"
 #include "EncoderLib/EncGOP.h"
-
 #include "EncoderLib/EncLib.h"
-#if defined( TARGET_SIMD_X86 )
-# include "CommonLib/x86/CommonDefX86.h"
-# if ENABLE_SIMD_TRAFO
-#  include "CommonLib/TrQuant_EMT.h"
-# endif // ENABLE_SIMD_TRAFO
-#endif // TARGET_SIMD_X86
+
+#if ENABLE_SIMD_TRAFO
+#include "CommonLib/TrQuant_EMT.h"
+#endif // ENABLE_SIMD_TRAFO
 
 #if defined( __linux__ )
-# include <malloc.h>
+#include <malloc.h>
 #endif
 
+#if defined( TARGET_SIMD_X86 )
+#include "CommonLib/x86/CommonDefX86.h"
+#endif // defined( TARGET_SIMD_X86 )
 #if defined( TARGET_SIMD_ARM )
-# include "CommonLib/arm/CommonDefARM.h"
-#endif
+#include "CommonLib/arm/CommonDefARM.h"
+#endif // defined( TARGET_SIMD_ARM )
 
 #if _DEBUG
 #define HANDLE_EXCEPTION 0
@@ -852,6 +852,9 @@ const char* VVEncImpl::setSIMDExtension( const char* simdId )
 #if defined( TARGET_SIMD_X86 )
   g_tCoeffOps.initTCoeffOpsX86();
 #endif
+#if defined( TARGET_SIMD_ARM )
+  g_tCoeffOps.initTCoeffOpsARM();
+#endif
 #endif // ENABLE_SIMD_TRAFO
 
 #if defined( TARGET_SIMD_ARM )