Fix compilation for 32-bit Arm platforms
The existing Neon code uses some intrinsics that are only available on
AArch64, so adjust these implementations to also work on Armv7 platforms.
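
For example, vmlal_laneq_s16 indexes a lane of a full 128-bit coefficient
vector and exists only on AArch64; on Armv7 the same multiply-accumulate is
written with vmlal_lane_s16 on one 64-bit half of that vector. A minimal
sketch (function and variable names are hypothetical, not part of the diff):

    #include <arm_neon.h>

    int32x4_t mla_with_coeff5( int32x4_t acc, int16x4_t samples, int16x8_t coeffs )
    {
      // AArch64 only: indexes lane 5 of the 128-bit (8 x s16) coefficient vector.
      // acc = vmlal_laneq_s16( acc, samples, coeffs, 5 );

      // Armv7-compatible: the same operation, indexing lane 1 of the high half.
      return vmlal_lane_s16( acc, samples, vget_high_s16( coeffs ), 1 );
    }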

Also introduce a new REAL_TARGET_AARCH64 macro to distinguish between
32-bit and 64-bit in new and existing Arm code.
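
A minimal sketch of the pattern, mirroring the horizontal_add_s32x4 helper
introduced by the new sum_neon.h header below (and assuming the
REAL_TARGET_AARCH64 definition from CommonDef.h is visible): the macro keeps
the AArch64-only vaddvq_s32 across-vector add on 64-bit targets and falls
back to pairwise additions on Armv7.

    #include <arm_neon.h>

    static inline int horizontal_add_s32x4( const int32x4_t a )
    {
    #if REAL_TARGET_AARCH64
      return vaddvq_s32( a );                // across-vector add, AArch64 only
    #else
      const int64x2_t b = vpaddlq_s32( a );  // pairwise widen-add: four s32 -> two s64
      const int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ),
                                    vreinterpret_s32_s64( vget_high_s64( b ) ) );
      return vget_lane_s32( c, 0 );
    #endif
    }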

Since these implementations require that Neon is available, also add
-mfpu=neon to the compile line for these files. This necessitates
removing the SIMD implementation files from LTO in the same way that is
already done for the x86 kernels.
georges-arm committed Oct 24, 2024
1 parent 3f2dcf4 commit 45b8121
Showing 5 changed files with 176 additions and 80 deletions.
3 changes: 3 additions & 0 deletions source/Lib/CommonLib/CommonDef.h
@@ -56,6 +56,9 @@ POSSIBILITY OF SUCH DAMAGE.
#if defined( __x86_64__ ) || defined( _M_X64 ) || defined( __i386__ ) || defined( __i386 ) || defined( _M_IX86 )
# define REAL_TARGET_X86 1
#elif defined( __aarch64__ ) || defined( _M_ARM64 ) || defined( __arm__ ) || defined( _M_ARM )
# if defined( __aarch64__ ) || defined( _M_ARM64 )
# define REAL_TARGET_AARCH64 1
# endif
# define REAL_TARGET_ARM 1
#elif defined( __wasm__ ) || defined( __wasm32__ )
# define REAL_TARGET_WASM 1
131 changes: 66 additions & 65 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -48,9 +48,10 @@ POSSIBILITY OF SUCH DAMAGE.
// Includes
// ====================================================================================================================

#include "../InterpolationFilter.h"
#include "CommonDefARM.h"
#include "CommonLib/CommonDef.h"
#include "../InterpolationFilter.h"
#include "sum_neon.h"

//! \ingroup CommonLib
//! \{
@@ -150,7 +151,7 @@ static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voff
a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );

int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
int32x4_t vsuma = horizontal_add_4d_s32x4( a0, a1, a2, a3 );
vsuma = vaddq_s32( vsuma, voffset1 );
vsuma = vshlq_s32( vsuma, invshift1st );
return vqmovn_s32( vsuma );
@@ -224,14 +225,14 @@ static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv0, vget_low_s16( cv ), 0 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv1, vget_low_s16( cv ), 1 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv2, vget_low_s16( cv ), 2 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv3, vget_low_s16( cv ), 3 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv4, vget_high_s16( cv ), 0 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv5, vget_high_s16( cv ), 1 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv6, vget_high_s16( cv ), 2 );
vsum0 = vmlal_lane_s16( vsum0, vsrcv7, vget_high_s16( cv ), 3 );

int16x4_t vsum01;
if( isLast ) // clip
@@ -313,29 +314,29 @@ static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, int src
int32x4_t vsum0 = vdupq_n_s32( offset2nd );
int32x4_t vsum1 = vdupq_n_s32( offset2nd );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0 ), vget_low_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0 ), vget_low_s16( cv ), 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1 ), vget_low_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1 ), vget_low_s16( cv ), 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2 ), vget_low_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2 ), vget_low_s16( cv ), 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3 ), vget_low_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3 ), vget_low_s16( cv ), 3 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4 ), vget_high_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4 ), vget_high_s16( cv ), 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5 ), vget_high_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5 ), vget_high_s16( cv ), 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6 ), vget_high_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6 ), vget_high_s16( cv ), 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7 ), vget_high_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7 ), vget_high_s16( cv ), 3 );

int16x8_t vsum01;
if( isLast ) // clip
@@ -422,45 +423,45 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, int sr
int32x4_t vsum2 = vdupq_n_s32( offset2nd );
int32x4_t vsum3 = vdupq_n_s32( offset2nd );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), cv, 0 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), cv, 0 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), cv, 0 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), cv, 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), cv, 1 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), cv, 1 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), cv, 1 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), cv, 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), cv, 2 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), cv, 2 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), cv, 2 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), cv, 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), cv, 3 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), cv, 3 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), cv, 3 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), cv, 3 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), cv, 4 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), cv, 4 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), cv, 4 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), cv, 4 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), cv, 5 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), cv, 5 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), cv, 5 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), cv, 5 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), cv, 6 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), cv, 6 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), cv, 6 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), cv, 6 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), cv, 7 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), cv, 7 );
vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), cv, 7 );
vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), cv, 7 );
vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), vget_low_s16( cv ), 0 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), vget_low_s16( cv ), 0 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), vget_low_s16( cv ), 1 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), vget_low_s16( cv ), 1 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), vget_low_s16( cv ), 2 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), vget_low_s16( cv ), 2 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), vget_low_s16( cv ), 3 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), vget_low_s16( cv ), 3 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), vget_high_s16( cv ), 0 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), vget_high_s16( cv ), 0 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), vget_high_s16( cv ), 1 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), vget_high_s16( cv ), 1 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), vget_high_s16( cv ), 2 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), vget_high_s16( cv ), 2 );

vsum0 = vmlal_lane_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 );
vsum1 = vmlal_lane_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), vget_high_s16( cv ), 3 );
vsum2 = vmlal_lane_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 );
vsum3 = vmlal_lane_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), vget_high_s16( cv ), 3 );

int16x8_t vsum01, vsum23;
if( isLast ) // clip
12 changes: 4 additions & 8 deletions source/Lib/CommonLib/arm/neon/MCTF_neon.cpp
@@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE.
// ====================================================================================================================

#include "MCTF.h"
#include "sum_neon.h"

#include <arm_neon.h>

@@ -75,13 +76,8 @@ static int16x8_t motionErrorLumaFrac_loRes_step( const int16x8_t xf, const Pel*
int32x4_t sum3 = vmull_s16( vget_low_s16( xf ), vget_low_s16( row37 ) );
int32x4_t sum7 = vmull_s16( vget_high_s16( xf ), vget_high_s16( row37 ) );

int32x4_t sum01 = vpaddq_s32( sum0, sum1 );
int32x4_t sum23 = vpaddq_s32( sum2, sum3 );
int32x4_t sum45 = vpaddq_s32( sum4, sum5 );
int32x4_t sum67 = vpaddq_s32( sum6, sum7 );
int32x4_t sum0123 = vpaddq_s32( sum01, sum23 );
int32x4_t sum4567 = vpaddq_s32( sum45, sum67 );

int32x4_t sum0123 = horizontal_add_4d_s32x4( sum0, sum1, sum2, sum3 );
int32x4_t sum4567 = horizontal_add_4d_s32x4( sum4, sum5, sum6, sum7 );
uint16x8_t sum = vcombine_u16( vqrshrun_n_s32( sum0123, 6 ), vqrshrun_n_s32( sum4567, 6 ) );

return vminq_s16( vreinterpretq_s16_u16( sum ), vdupq_n_s16( maxSampleValue ) );
@@ -138,7 +134,7 @@ int motionErrorLumaFrac_loRes_neon( const Pel* org, const ptrdiff_t origStride,
int32x4_t diff2 = vmull_s16( vget_low_s16( diff ), vget_low_s16( diff ) );
diff2 = vmlal_s16( diff2, vget_high_s16( diff ), vget_high_s16( diff ) );

error += vaddvq_s32( diff2 );
error += horizontal_add_s32x4( diff2 );
if( error > besterror )
{
return error;
88 changes: 88 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -0,0 +1,88 @@
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.
The Clear BSD License
Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------------------- */

/** \file sum_neon.h
\brief Helper functions for adding across vectors
*/

#pragma once

#include "CommonDef.h"

#if defined( TARGET_SIMD_ARM )

#include <arm_neon.h>

namespace vvenc
{

static inline int horizontal_add_s32x4( const int32x4_t a )
{
#if REAL_TARGET_AARCH64
return vaddvq_s32( a );
#else
const int64x2_t b = vpaddlq_s32( a );
const int32x2_t c = vadd_s32( vreinterpret_s32_s64( vget_low_s64( b ) ), vreinterpret_s32_s64( vget_high_s64( b ) ) );
return vget_lane_s32( c, 0 );
#endif
}

static inline int32x4_t horizontal_add_4d_s32x4( const int32x4_t v0, const int32x4_t v1, const int32x4_t v2,
const int32x4_t v3 )
{
#if REAL_TARGET_AARCH64
int32x4_t v01 = vpaddq_s32( v0, v1 );
int32x4_t v23 = vpaddq_s32( v2, v3 );
return vpaddq_s32( v01, v23 );
#else
int32x4_t res = vdupq_n_s32( 0 );
res = vsetq_lane_s32( horizontal_add_s32x4( v0 ), res, 0 );
res = vsetq_lane_s32( horizontal_add_s32x4( v1 ), res, 1 );
res = vsetq_lane_s32( horizontal_add_s32x4( v2 ), res, 2 );
res = vsetq_lane_s32( horizontal_add_s32x4( v3 ), res, 3 );
return res;
#endif
}

} // namespace vvenc

#endif
22 changes: 15 additions & 7 deletions source/Lib/vvenc/CMakeLists.txt
@@ -119,7 +119,8 @@ if( VVENC_ENABLE_X86_SIMD )
endif()

add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} )
# disble LTO for the files compiled with special architecture flags

# Disble LTO for the files compiled with special architecture flags.
set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES
INTERPROCEDURAL_OPTIMIZATION OFF
INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
@@ -133,13 +134,20 @@ if( VVENC_ENABLE_ARM_SIMD )
# set needed compile definitions
set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON )

if(( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" ) OR ( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" ))
# Neon is mandatory in AArch64, so no additional compile flags needed here.
else()
set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "-mfpu=neon" )
endif()

add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} )
# NEON is enabled by default for all files, so don't need to disable LTO
# set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES
# INTERPROCEDURAL_OPTIMIZATION OFF
# INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
# INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF
# INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF )

# Disble LTO for the files compiled with special architecture flags.
set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES
INTERPROCEDURAL_OPTIMIZATION OFF
INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF
INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF )

set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib )
endif()
