Add AArch64 Neon implementation of MCTF motionErrorLumaFrac8 (#425)

* Add AArch64 Neon implementation of MCTF motionErrorLumaFrac8

  Add a new Arm specialization for MCTF including initialization, and wire up the `m_motionErrorLumaFrac8[1]` member to a new Neon kernel.

  Running a video encoding job on a Neoverse V2 machine using the --preset=fast setting, this shows a ~5.2% improvement in reported FPS.

* Add myself to AUTHORS.md

Change-Id: I5a05c399fb9d9559060481680e0f11463a8165c0
fraunhoferhhi · Oct 7, 2024 · 49cb3f5 · 49cb3f5
1 parent 069013e
commit 49cb3f5
Show file tree

Hide file tree

Showing 5 changed files with 192 additions and 1 deletion.
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -14,4 +14,5 @@
 * Jens Güther, , Fraunhofer HHI
 * Florian Eisenreich, , Fraunhofer HHI
 * Hossein Pejman, , École de technologie supérieure (ÉTS)
-* Vignesh V Menon, , Fraunhofer HHI
+* Vignesh V Menon, , Fraunhofer HHI
+* George Steed, @georges-arm, Arm
diff --git a/source/Lib/CommonLib/MCTF.cpp b/source/Lib/CommonLib/MCTF.cpp
@@ -569,6 +569,9 @@ MCTF::MCTF()
 #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_MCTF
   initMCTF_X86();
 #endif
+#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCTF
+  initMCTF_ARM();
+#endif
 
 }
 

diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h
@@ -142,6 +142,12 @@ class MCTF : public EncStage
   void _initMCTF_X86();
 #endif
 
+#ifdef TARGET_SIMD_ARM
+  void initMCTF_ARM();
+  template <ARM_VEXT vext>
+  void _initMCTF_ARM();
+#endif
+
   int ( *m_motionErrorLumaIntX )( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int besterror );
   int ( *m_motionErrorLumaInt8 )( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride, const int w, const int h, const int besterror );
 

diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp
@@ -56,6 +56,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "CommonLib/IntraPrediction.h"
 #include "CommonLib/LoopFilter.h"
 #include "CommonLib/Picture.h"
+#include "CommonLib/MCTF.h"
 
 #include "CommonLib/AdaptiveLoopFilter.h"
 #include "CommonLib/SampleAdaptiveOffset.h"
@@ -110,6 +111,21 @@ void RdCost::initRdCostARM()
 }
 #endif
 
+#if ENABLE_SIMD_OPT_MCTF
+// Runtime dispatch for the Arm MCTF kernels: query the extension flags once and,
+// when they report NEON, run the NEON specialization of the initializer.
+void MCTF::initMCTF_ARM()
+{
+  const auto vext = read_arm_extension_flags();
+
+  if( vext == NEON )
+  {
+    _initMCTF_ARM<NEON>();
+  }
+  // Any other reported value: nothing is installed here.
+}
+#endif  // ENABLE_SIMD_OPT_MCTF
+
 #endif   // TARGET_SIMD_ARM
 
 }   // namespace
diff --git a/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp b/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp
@@ -0,0 +1,165 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+     * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+     * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+     * Neither the name of the copyright holder nor the names of its
+     contributors may be used to endorse or promote products derived from this
+     software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/**
+ * \file MCTF_neon.cpp
+ * \brief Neon implementation of MCTF for AArch64.
+ */
+// ====================================================================================================================
+// Includes
+// ====================================================================================================================
+
+#include "MCTF.h"
+
+#include <arm_neon.h>
+
+//! \ingroup CommonLib
+//! \{
+
+#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCTF
+
+namespace vvenc
+{
+
+/**
+ * \brief Apply a 4-tap horizontal filter to produce eight adjacent output samples of one row.
+ *
+ * Output sample i is the dot product of four taps with rowStart[i..i+3], rounded and
+ * shifted right by 6 bits, saturated at zero and clamped to maxSampleValue.
+ *
+ * \param xf             filter taps; the low half is applied to output samples 0..3 and the
+ *                       high half to output samples 4..7
+ * \param rowStart       first input sample; rowStart[0] .. rowStart[10] are read
+ * \param maxSampleValue upper clamp for the filtered samples
+ * \return the eight filtered and clamped samples
+ */
+static int16x8_t motionErrorLumaFrac_loRes_step( const int16x8_t xf, const Pel* rowStart, const Pel maxSampleValue )
+{
+  // Four overlapping 8-sample loads; rowNM holds the window start for output samples N (low half)
+  // and M (high half).
+  int16x8_t row04 = vld1q_s16( rowStart + 0 );
+  int16x8_t row15 = vld1q_s16( rowStart + 1 );
+  int16x8_t row26 = vld1q_s16( rowStart + 2 );
+  int16x8_t row37 = vld1q_s16( rowStart + 3 );
+
+  // Per-tap products for each of the eight output samples (four 32-bit products per sample).
+  int32x4_t sum0 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row04 ) );
+  int32x4_t sum4 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row04 ) );
+  int32x4_t sum1 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row15 ) );
+  int32x4_t sum5 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row15 ) );
+  int32x4_t sum2 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row26 ) );
+  int32x4_t sum6 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row26 ) );
+  int32x4_t sum3 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row37 ) );
+  int32x4_t sum7 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row37 ) );
+
+  // Two rounds of pairwise adds reduce each group of four products to one sum per output sample,
+  // leaving the sums in sample order.
+  int32x4_t sum01 = vpaddq_s32( sum0, sum1 );
+  int32x4_t sum23 = vpaddq_s32( sum2, sum3 );
+  int32x4_t sum45 = vpaddq_s32( sum4, sum5 );
+  int32x4_t sum67 = vpaddq_s32( sum6, sum7 );
+  int32x4_t sum0123 = vpaddq_s32( sum01, sum23 );
+  int32x4_t sum4567 = vpaddq_s32( sum45, sum67 );
+
+  // Rounding shift by 6 with unsigned saturation: negative sums become 0.
+  uint16x8_t sum = vcombine_u16( vqrshrun_n_s32( sum0123, 6 ), vqrshrun_n_s32( sum4567, 6 ) );
+
+  // Clamp while the lanes are still unsigned. Reinterpreting first and using a signed min would let
+  // any lane above 32767 wrap to a negative value and slip past the clamp.
+  return vreinterpretq_s16_u16( vminq_u16( sum, vdupq_n_u16( maxSampleValue ) ) );
+}
+
+/**
+ * \brief Sum of squared differences between a block of `org` and a separably filtered block of `buf`.
+ *
+ * Each group of eight columns is filtered horizontally with the four taps in xFilter and then
+ * vertically with the four taps in yFilter. Both stages round, shift right by 6 and clamp to
+ * [0, (1 << bitDepth) - 1]. The squared differences against `org` are accumulated, and the
+ * function returns as soon as the running total exceeds besterror.
+ *
+ * \param org        top-left sample of the original block
+ * \param origStride distance between rows of org, in samples
+ * \param buf        top-left sample of the block to filter; rows -1 .. h+1 and columns -1 .. w+1 are read
+ * \param buffStride distance between rows of buf, in samples
+ * \param w          block width; must be a multiple of 8 (checked), and the loop runs at least once
+ * \param h          block height; the row loop runs at least once and stops when the row index equals h
+ * \param xFilter    four horizontal filter taps
+ * \param yFilter    four vertical filter taps
+ * \param bitDepth   sample bit depth used to derive the clamp value
+ * \param besterror  early-termination threshold
+ * \return the accumulated squared error (a partial sum if the threshold was exceeded)
+ */
+int motionErrorLumaFrac_loRes_neon( const Pel* org, const ptrdiff_t origStride, const Pel* buf,
+                                    const ptrdiff_t buffStride, const int w, const int h, const int16_t* xFilter,
+                                    const int16_t* yFilter, const int bitDepth, const int besterror )
+{
+  CHECK( w & 7, "SIMD blockSize needs to be a multiple of 8" );
+
+  const Pel maxSampleValue = ( 1 << bitDepth ) - 1;
+  const uint16x8_t clampMax = vdupq_n_u16( maxSampleValue );
+
+  // The four horizontal taps are loaded as one 64-bit lane and replicated to both vector halves.
+  const int16x8_t hTaps = vreinterpretq_s16_u64( vld1q_dup_u64( reinterpret_cast<const uint64_t*>( xFilter ) ) );
+  const int16x4_t vTaps = vld1_s16( yFilter );
+
+  int error = 0;
+  int x = 0;
+  do
+  {
+    // Horizontal filter window for this column group starts one sample to the left.
+    const Pel* const colBase = buf + x - 1;
+
+    // Prime the vertical window with the horizontally filtered rows -1, 0 and +1.
+    int16x8_t hRowA = motionErrorLumaFrac_loRes_step( hTaps, colBase - buffStride, maxSampleValue );
+    int16x8_t hRowB = motionErrorLumaFrac_loRes_step( hTaps, colBase, maxSampleValue );
+    int16x8_t hRowC = motionErrorLumaFrac_loRes_step( hTaps, colBase + buffStride, maxSampleValue );
+
+    int y = 0;
+    do
+    {
+      // Newest row of the vertical window: two rows below the current output row.
+      const int16x8_t hRowD =
+        motionErrorLumaFrac_loRes_step( hTaps, colBase + ( y + 2 ) * buffStride, maxSampleValue );
+
+      // Vertical 4-tap filter on the low four columns.
+      int32x4_t vAccLo = vmull_lane_s16( vget_low_s16( hRowA ), vTaps, 0 );
+      vAccLo = vmlal_lane_s16( vAccLo, vget_low_s16( hRowB ), vTaps, 1 );
+      vAccLo = vmlal_lane_s16( vAccLo, vget_low_s16( hRowC ), vTaps, 2 );
+      vAccLo = vmlal_lane_s16( vAccLo, vget_low_s16( hRowD ), vTaps, 3 );
+
+      // Vertical 4-tap filter on the high four columns.
+      int32x4_t vAccHi = vmull_lane_s16( vget_high_s16( hRowA ), vTaps, 0 );
+      vAccHi = vmlal_lane_s16( vAccHi, vget_high_s16( hRowB ), vTaps, 1 );
+      vAccHi = vmlal_lane_s16( vAccHi, vget_high_s16( hRowC ), vTaps, 2 );
+      vAccHi = vmlal_lane_s16( vAccHi, vget_high_s16( hRowD ), vTaps, 3 );
+
+      // Round, shift by 6 with unsigned saturation, then clamp to the maximum sample value.
+      const uint16x8_t rounded = vcombine_u16( vqrshrun_n_s32( vAccLo, 6 ), vqrshrun_n_s32( vAccHi, 6 ) );
+      const int16x8_t filtered = vreinterpretq_s16_u16( vminq_u16( rounded, clampMax ) );
+
+      const int16x8_t target = vld1q_s16( org + y * origStride + x );
+      const int16x8_t absDiff = vabdq_s16( filtered, target );
+
+      // Square the eight differences and fold them into four 32-bit lanes.
+      const int16x4_t absLo = vget_low_s16( absDiff );
+      const int16x4_t absHi = vget_high_s16( absDiff );
+      const int32x4_t squares = vmlal_s16( vmull_s16( absLo, absLo ), absHi, absHi );
+
+      error += vaddvq_s32( squares );
+      if( error > besterror )
+      {
+        // Already worse than the best candidate: stop early with the partial sum.
+        return error;
+      }
+
+      // Slide the vertical window down by one row.
+      hRowA = hRowB;
+      hRowB = hRowC;
+      hRowC = hRowD;
+
+      ++y;
+    } while( y != h );
+
+    x += 8;
+  } while( x != w );
+
+  return error;
+}
+
+// NEON specialization of the MCTF initializer: stores the Neon kernel
+// motionErrorLumaFrac_loRes_neon in entry 1 of m_motionErrorLumaFrac8.
+template<>
+void MCTF::_initMCTF_ARM<NEON>()
+{
+  m_motionErrorLumaFrac8[1] = &motionErrorLumaFrac_loRes_neon;
+}
+
+} // namespace vvenc
+#endif
+//! \}