From 45b8121d7053a9830aaf45bf7d52dcb4f19dc514 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 23 Oct 2024 15:18:24 +0100 Subject: [PATCH] Fix compilation for 32-bit Arm platforms The existing Neon code makes use of some intrinsics which are only available on AArch64, so adjust these implementations to also work on Armv7 platforms. Also introduce a new REAL_TARGET_AARCH64 macro to distinguish between 32-bit and 64-bit in new and existing Arm code. Since these implementations require that Neon is available, also add -mfpu=neon to the compile line for these files. This necessitates removing the SIMD implementation files from LTO in the same way that is already done for the x86 kernels. --- source/Lib/CommonLib/CommonDef.h | 3 + .../arm/neon/InterpolationFilter_neon.cpp | 131 +++++++++--------- source/Lib/CommonLib/arm/neon/MCTF_neon.cpp | 12 +- source/Lib/CommonLib/arm/neon/sum_neon.h | 88 ++++++++++++ source/Lib/vvenc/CMakeLists.txt | 22 ++- 5 files changed, 176 insertions(+), 80 deletions(-) create mode 100644 source/Lib/CommonLib/arm/neon/sum_neon.h diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index c70f810e4..2e500878e 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -56,6 +56,9 @@ POSSIBILITY OF SUCH DAMAGE. #if defined( __x86_64__ ) || defined( _M_X64 ) || defined( __i386__ ) || defined( __i386 ) || defined( _M_IX86 ) # define REAL_TARGET_X86 1 #elif defined( __aarch64__ ) || defined( _M_ARM64 ) || defined( __arm__ ) || defined( _M_ARM ) +# if defined( __aarch64__ ) || defined( _M_ARM64 ) +# define REAL_TARGET_AARCH64 1 +# endif # define REAL_TARGET_ARM 1 #elif defined( __wasm__ ) || defined( __wasm32__ ) # define REAL_TARGET_WASM 1 diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp index dc7178d78..3d7af09f5 100644 --- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp +++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp @@ -48,9 +48,10 @@ POSSIBILITY OF SUCH DAMAGE. // Includes // ==================================================================================================================== +#include "../InterpolationFilter.h" #include "CommonDefARM.h" #include "CommonLib/CommonDef.h" -#include "../InterpolationFilter.h" +#include "sum_neon.h" //! \ingroup CommonLib //! 
\{ @@ -150,7 +151,7 @@ static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voff a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) ); a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) ); - int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) ); + int32x4_t vsuma = horizontal_add_4d_s32x4( a0, a1, a2, a3 ); vsuma = vaddq_s32( vsuma, voffset1 ); vsuma = vshlq_s32( vsuma, invshift1st ); return vqmovn_s32( vsuma ); @@ -224,14 +225,14 @@ static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src src += srcStride; int32x4_t vsum0 = vdupq_n_s32( offset2nd ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 ); - vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv0, vget_low_s16( cv ), 0 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv1, vget_low_s16( cv ), 1 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv2, vget_low_s16( cv ), 2 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv3, vget_low_s16( cv ), 3 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv4, vget_high_s16( cv ), 0 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv5, vget_high_s16( cv ), 1 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv6, vget_high_s16( cv ), 2 ); + vsum0 = vmlal_lane_s16( vsum0, vsrcv7, vget_high_s16( cv ), 3 ); int16x4_t vsum01; if( isLast ) // clip @@ -313,29 +314,29 @@ static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src int32x4_t vsum0 = vdupq_n_s32( offset2nd ); int32x4_t vsum1 = vdupq_n_s32( offset2nd ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0 ), vget_low_s16( cv ), 0 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0 ), vget_low_s16( cv ), 0 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1 ), vget_low_s16( cv ), 1 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1 ), vget_low_s16( cv ), 1 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2 ), vget_low_s16( cv ), 2 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2 ), vget_low_s16( cv ), 2 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3 ), vget_low_s16( cv ), 3 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3 ), vget_low_s16( cv ), 3 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4 ), vget_high_s16( cv ), 0 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4 ), vget_high_s16( cv ), 0 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5 ), vget_high_s16( cv ), 1 ); + vsum1 = 
vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5 ), vget_high_s16( cv ), 1 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6 ), vget_high_s16( cv ), 2 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6 ), vget_high_s16( cv ), 2 ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7 ), vget_high_s16( cv ), 3 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7 ), vget_high_s16( cv ), 3 ); int16x8_t vsum01; if( isLast ) // clip @@ -422,45 +423,45 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, int sr int32x4_t vsum2 = vdupq_n_s32( offset2nd ); int32x4_t vsum3 = vdupq_n_s32( offset2nd ); - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), cv, 0 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), cv, 0 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), cv, 0 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), cv, 0 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), cv, 1 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), cv, 1 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), cv, 1 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), cv, 1 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), cv, 2 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), cv, 2 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), cv, 2 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), cv, 2 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), cv, 3 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), cv, 3 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), cv, 3 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), cv, 3 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), cv, 4 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), cv, 4 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), cv, 4 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), cv, 4 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), cv, 5 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), cv, 5 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), cv, 5 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), cv, 5 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), cv, 6 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), cv, 6 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), cv, 6 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), cv, 6 ); - - vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), cv, 7 ); - vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), cv, 7 ); - vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), cv, 7 ); - vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), cv, 7 ); + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 ); + 
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 ); + + vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 ); + vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 ); + vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 ); + vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 ); int16x8_t vsum01, vsum23; if( isLast ) // clip diff --git a/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp b/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp index 3e63dadd1..8036e907d 100644 --- a/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp +++ b/source/Lib/CommonLib/arm/neon/MCTF_neon.cpp @@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE. 
// ==================================================================================================================== #include "MCTF.h" +#include "sum_neon.h" #include <arm_neon.h> @@ -75,13 +76,8 @@ static int16x8_t motionErrorLumaFrac_loRes_step( const int16x8_t xf, const Pel* int32x4_t sum3 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row37 ) ); int32x4_t sum7 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row37 ) ); - int32x4_t sum01 = vpaddq_s32( sum0, sum1 ); - int32x4_t sum23 = vpaddq_s32( sum2, sum3 ); - int32x4_t sum45 = vpaddq_s32( sum4, sum5 ); - int32x4_t sum67 = vpaddq_s32( sum6, sum7 ); - int32x4_t sum0123 = vpaddq_s32( sum01, sum23 ); - int32x4_t sum4567 = vpaddq_s32( sum45, sum67 ); - + int32x4_t sum0123 = horizontal_add_4d_s32x4( sum0, sum1, sum2, sum3 ); + int32x4_t sum4567 = horizontal_add_4d_s32x4( sum4, sum5, sum6, sum7 ); uint16x8_t sum = vcombine_u16( vqrshrun_n_s32( sum0123, 6 ), vqrshrun_n_s32( sum4567, 6 ) ); return vminq_s16( vreinterpretq_s16_u16( sum ), vdupq_n_s16( maxSampleValue ) ); @@ -138,7 +134,7 @@ int motionErrorLumaFrac_loRes_neon( const Pel* org, const ptrdiff_t origStride, int32x4_t diff2 = vmull_s16( vget_low_s16( diff ), vget_low_s16( diff ) ); diff2 = vmlal_s16( diff2, vget_high_s16( diff ), vget_high_s16( diff ) ); - error += vaddvq_s32( diff2 ); + error += horizontal_add_s32x4( diff2 ); if( error > besterror ) { return error; diff --git a/source/Lib/CommonLib/arm/neon/sum_neon.h b/source/Lib/CommonLib/arm/neon/sum_neon.h new file mode 100644 index 000000000..2bfd5951e --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/sum_neon.h @@ -0,0 +1,88 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** \file sum_neon.h + \brief Helper functions for adding across vectors +*/ + +#pragma once + +#include "CommonDef.h" + +#if defined( TARGET_SIMD_ARM ) + +#include <arm_neon.h> + +namespace vvenc +{ + +static inline int horizontal_add_s32x4( const int32x4_t a ) +{ +#if REAL_TARGET_AARCH64 + return vaddvq_s32( a ); +#else + const int64x2_t b = vpaddlq_s32( a ); + const int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ), vreinterpret_s32_s64( vget_high_s64( b ) ) ); + return vget_lane_s32( c, 0 ); +#endif +} + +static inline int32x4_t horizontal_add_4d_s32x4( const int32x4_t v0, const int32x4_t v1, const int32x4_t v2, + const int32x4_t v3 ) +{ +#if REAL_TARGET_AARCH64 + int32x4_t v01 = vpaddq_s32( v0, v1 ); + int32x4_t v23 = vpaddq_s32( v2, v3 ); + return vpaddq_s32( v01, v23 ); +#else + int32x4_t res = vdupq_n_s32( 0 ); + res = vsetq_lane_s32( horizontal_add_s32x4( v0 ), res, 0 ); + res = vsetq_lane_s32( horizontal_add_s32x4( v1 ), res, 1 ); + res = vsetq_lane_s32( horizontal_add_s32x4( v2 ), res, 2 ); + res = vsetq_lane_s32( horizontal_add_s32x4( v3 ), res, 3 ); + return res; +#endif +} + +} // namespace vvenc + +#endif diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt index 4e56768bc..057760652 100644 --- a/source/Lib/vvenc/CMakeLists.txt +++ b/source/Lib/vvenc/CMakeLists.txt @@ -119,7 +119,8 @@ if( VVENC_ENABLE_X86_SIMD ) endif() add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} ) - # disble LTO for the files compiled with special architecture flags + + # Disable LTO for the files compiled with special architecture flags. set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES INTERPROCEDURAL_OPTIMIZATION OFF INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF @@ -133,13 +134,20 @@ if( VVENC_ENABLE_ARM_SIMD ) # set needed compile definitions set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON ) + if(( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" ) OR ( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" )) + # Neon is mandatory in AArch64, so no additional compile flags needed here. + else() + set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "-mfpu=neon" ) + endif() + add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} ) - # NEON is enabled by default for all files, so don't need to disable LTO - # set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES - # INTERPROCEDURAL_OPTIMIZATION OFF - # INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF - # INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF - # INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF ) + + # Disable LTO for the files compiled with special architecture flags. 
+ set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES + INTERPROCEDURAL_OPTIMIZATION OFF + INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF + INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF + INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF ) set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib ) endif()
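The pattern introduced in sum_neon.h, guarding the AArch64-only intrinsic with REAL_TARGET_AARCH64 and falling back to Armv7-compatible Neon, can be exercised in isolation. The snippet below is an illustrative sketch only, not part of the patch: it defines the macro locally in the same way the CommonDef.h change does, and the helper name horizontal_add_s32x4_sketch is hypothetical.

// Standalone sketch of the REAL_TARGET_AARCH64 fallback pattern used in sum_neon.h.
// Not part of the patch; the macro is defined locally here for the example only.
#include <arm_neon.h>
#include <cstdio>

#if defined( __aarch64__ ) || defined( _M_ARM64 )
#define REAL_TARGET_AARCH64 1
#endif

static inline int horizontal_add_s32x4_sketch( int32x4_t a )
{
#if REAL_TARGET_AARCH64
  return vaddvq_s32( a );            // AArch64: single across-vector add
#else
  // Armv7 Neon: pairwise widening add, then fold the two 64-bit halves.
  int64x2_t b = vpaddlq_s32( a );
  int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ),
                          vreinterpret_s32_s64( vget_high_s64( b ) ) );
  return vget_lane_s32( c, 0 );
#endif
}

int main()
{
  const int32_t data[ 4 ] = { 1, 2, 3, 4 };
  printf( "sum = %d\n", horizontal_add_s32x4_sketch( vld1q_s32( data ) ) ); // expect 10
  return 0;
}

On a 32-bit Arm toolchain this needs the same flag the CMake change adds for the Neon kernels (e.g. -mfpu=neon, typically alongside -march=armv7-a); on AArch64 no extra flags are required, which is why the new CMake branch only sets -mfpu=neon for non-AArch64 targets.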