Fix compilation for 32-bit Arm platforms
The existing Neon code uses some intrinsics that are only available on
AArch64, so adjust these implementations to also work on Armv7 platforms.
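
For example, vmlal_laneq_s16 indexes a lane of a full 128-bit coefficient
vector and exists only on AArch64; on Armv7 the same multiply-accumulate is
written with vmlal_lane_s16 on one 64-bit half of that vector. A minimal
sketch (function and variable names are hypothetical, not part of the diff):

    #include <arm_neon.h>

    int32x4_t mla_with_coeff5( int32x4_t acc, int16x4_t samples, int16x8_t coeffs )
    {
      // AArch64 only: indexes lane 5 of the 128-bit (8 x s16) coefficient vector.
      // acc = vmlal_laneq_s16( acc, samples, coeffs, 5 );

      // Armv7-compatible: the same operation, indexing lane 1 of the high half.
      return vmlal_lane_s16( acc, samples, vget_high_s16( coeffs ), 1 );
    }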

Also introduce a new REAL_TARGET_AARCH64 macro to distinguish between
32-bit and 64-bit in new and existing Arm code.
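
A minimal sketch of the pattern, mirroring the horizontal_add_s32x4 helper
introduced by the new sum_neon.h header below (and assuming the
REAL_TARGET_AARCH64 definition from CommonDef.h is visible): the macro keeps
the AArch64-only vaddvq_s32 across-vector add on 64-bit targets and falls
back to pairwise additions on Armv7.

    #include <arm_neon.h>

    static inline int horizontal_add_s32x4( const int32x4_t a )
    {
    #if REAL_TARGET_AARCH64
      return vaddvq_s32( a );                // across-vector add, AArch64 only
    #else
      const int64x2_t b = vpaddlq_s32( a );  // pairwise widen-add: four s32 -> two s64
      const int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ),
                                    vreinterpret_s32_s64( vget_high_s64( b ) ) );
      return vget_lane_s32( c, 0 );
    #endif
    }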

Since these implementations require that Neon is available, also add
-mfpu=neon to the compile line for these files. This necessitates
removing the SIMD implementation files from LTO in the same way that is
already done for the x86 kernels.
georges-arm committed Oct 24, 2024
1 parent 3f2dcf4 commit 45b8121
Showing 5 changed files with 176 additions and 80 deletions.
3 changes: 3 additions & 0 deletions source/Lib/CommonLib/CommonDef.h
@@ -56,6 +56,9 @@ POSSIBILITY OF SUCH DAMAGE.
#if defined( __x86_64__ ) || defined( _M_X64 ) || defined( __i386__ ) || defined( __i386 ) || defined( _M_IX86 )
# define REAL_TARGET_X86 1
#elif defined( __aarch64__ ) || defined( _M_ARM64 ) || defined( __arm__ ) || defined( _M_ARM )
# if defined( __aarch64__ ) || defined( _M_ARM64 )
# define REAL_TARGET_AARCH64 1
# endif
# define REAL_TARGET_ARM 1
#elif defined( __wasm__ ) || defined( __wasm32__ )
# define REAL_TARGET_WASM 1
131 changes: 66 additions & 65 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -48,9 +48,10 @@ POSSIBILITY OF SUCH DAMAGE.
// Includes
// ====================================================================================================================

#include "../InterpolationFilter.h"
#include "CommonDefARM.h"
#include "CommonLib/CommonDef.h"
#include "../InterpolationFilter.h"
#include "sum_neon.h"

//! \ingroup CommonLib
//! \{
@@ -150,7 +151,7 @@ static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voff
a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );

int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
int32x4_t vsuma = horizontal_add_4d_s32x4( a0, a1, a2, a3 );
vsuma = vaddq_s32( vsuma, voffset1 );
vsuma = vshlq_s32( vsuma, invshift1st );
return vqmovn_s32( vsuma );
@@ -224,14 +225,14 @@ static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv0, vget_low_s16( cv ), 0 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv1, vget_low_s16( cv ), 1 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv2, vget_low_s16( cv ), 2 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv3, vget_low_s16( cv ), 3 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv4, vget_high_s16( cv ), 0 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv5, vget_high_s16( cv ), 1 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv6, vget_high_s16( cv ), 2 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv7, vget_high_s16( cv ), 3 );

int16x4_t vsum01;
if( isLast ) // clip
@@ -313,29 +314,29 @@ static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src
int32x4_t vsum0 = vdupq_n_s32( offset2nd );
int32x4_t vsum1 = vdupq_n_s32( offset2nd );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0 ), vget_low_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0 ), vget_low_s16( cv ), 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1 ), vget_low_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1 ), vget_low_s16( cv ), 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2 ), vget_low_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2 ), vget_low_s16( cv ), 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3 ), vget_low_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3 ), vget_low_s16( cv ), 3 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4 ), vget_high_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4 ), vget_high_s16( cv ), 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5 ), vget_high_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5 ), vget_high_s16( cv ), 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6 ), vget_high_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6 ), vget_high_s16( cv ), 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7 ), vget_high_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7 ), vget_high_s16( cv ), 3 );

int16x8_t vsum01;
if( isLast ) // clip
@@ -422,45 +423,45 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, int sr
int32x4_t vsum2 = vdupq_n_s32( offset2nd );
int32x4_t vsum3 = vdupq_n_s32( offset2nd );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), cv, 0 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), cv, 0 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), cv, 0 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), cv, 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), cv, 1 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), cv, 1 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), cv, 1 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), cv, 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), cv, 2 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), cv, 2 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), cv, 2 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), cv, 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), cv, 3 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), cv, 3 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), cv, 3 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), cv, 3 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), cv, 4 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), cv, 4 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), cv, 4 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), cv, 4 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), cv, 5 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), cv, 5 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), cv, 5 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), cv, 5 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), cv, 6 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), cv, 6 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), cv, 6 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), cv, 6 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), cv, 7 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), cv, 7 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), cv, 7 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 );

int16x8_t vsum01, vsum23;
if( isLast ) // clip
12 changes: 4 additions & 8 deletions source/Lib/CommonLib/arm/neon/MCTF_neon.cpp
@@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE.
// ====================================================================================================================

#include "MCTF.h"
#include "sum_neon.h"

#include <arm_neon.h>

@@ -75,13 +76,8 @@ static int16x8_t motionErrorLumaFrac_loRes_step( const int16x8_t xf, const Pel*
int32x4_t sum3 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row37 ) );
int32x4_t sum7 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row37 ) );

int32x4_t sum01 = vpaddq_s32( sum0, sum1 );
int32x4_t sum23 = vpaddq_s32( sum2, sum3 );
int32x4_t sum45 = vpaddq_s32( sum4, sum5 );
int32x4_t sum67 = vpaddq_s32( sum6, sum7 );
int32x4_t sum0123 = vpaddq_s32( sum01, sum23 );
int32x4_t sum4567 = vpaddq_s32( sum45, sum67 );

int32x4_t sum0123 = horizontal_add_4d_s32x4( sum0, sum1, sum2, sum3 );
int32x4_t sum4567 = horizontal_add_4d_s32x4( sum4, sum5, sum6, sum7 );
uint16x8_t sum = vcombine_u16( vqrshrun_n_s32( sum0123, 6 ), vqrshrun_n_s32( sum4567, 6 ) );

return vminq_s16( vreinterpretq_s16_u16( sum ), vdupq_n_s16( maxSampleValue ) );
@@ -138,7 +134,7 @@ int motionErrorLumaFrac_loRes_neon( const Pel* org, const ptrdiff_t origStride,
int32x4_t diff2 = vmull_s16( vget_low_s16( diff ), vget_low_s16( diff ) );
diff2 = vmlal_s16( diff2, vget_high_s16( diff ), vget_high_s16( diff ) );

error += vaddvq_s32( diff2 );
error += horizontal_add_s32x4( diff2 );
if( error > besterror )
{
return error;
88 changes: 88 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -0,0 +1,88 @@
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.
The Clear BSD License
Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------------------- */

/** \file sum_neon.h
\brief Helper functions for adding across vectors
*/

#pragma once

#include "CommonDef.h"

#if defined( TARGET_SIMD_ARM )

#include <arm_neon.h>

namespace vvenc
{

static inline int horizontal_add_s32x4( const int32x4_t a )
{
#if REAL_TARGET_AARCH64
return vaddvq_s32( a );
#else
const int64x2_t b = vpaddlq_s32( a );
const int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ), vreinterpret_s32_s64( vget_high_s64( b ) ) );
return vget_lane_s32( c, 0 );
#endif
}

static inline int32x4_t horizontal_add_4d_s32x4( const int32x4_t v0, const int32x4_t v1, const int32x4_t v2,
const int32x4_t v3 )
{
#if REAL_TARGET_AARCH64
int32x4_t v01 = vpaddq_s32( v0, v1 );
int32x4_t v23 = vpaddq_s32( v2, v3 );
return vpaddq_s32( v01, v23 );
#else
int32x4_t res = vdupq_n_s32( 0 );
res = vsetq_lane_s32( horizontal_add_s32x4( v0 ), res, 0 );
res = vsetq_lane_s32( horizontal_add_s32x4( v1 ), res, 1 );
res = vsetq_lane_s32( horizontal_add_s32x4( v2 ), res, 2 );
res = vsetq_lane_s32( horizontal_add_s32x4( v3 ), res, 3 );
return res;
#endif
}

} // namespace vvenc

#endif
22 changes: 15 additions & 7 deletions source/Lib/vvenc/CMakeLists.txt
@@ -119,7 +119,8 @@ if( VVENC_ENABLE_X86_SIMD )
endif()

add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} )
# disble LTO for the files compiled with special architecture flags

# Disble LTO for the files compiled with special architecture flags.
set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES
INTERPROCEDURAL_OPTIMIZATION OFF
INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
@@ -133,13 +134,20 @@ if( VVENC_ENABLE_ARM_SIMD )
# set needed compile definitions
set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON )

if(( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" ) OR ( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" ))
# Neon is mandatory in AArch64, so no additional compile flags needed here.
else()
set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "-mfpu=neon" )
endif()

add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} )
# NEON is enabled by default for all files, so don't need to disable LTO
# set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES
# INTERPROCEDURAL_OPTIMIZATION OFF
# INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
# INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF
# INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF )

# Disble LTO for the files compiled with special architecture flags.
set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES
INTERPROCEDURAL_OPTIMIZATION OFF
INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF
INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF )

set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib )
endif()
