From 3d091bdf8ffaadf7de9f6ebad6d9a6d465cc88af Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 18 Oct 2024 09:26:52 +0100 Subject: [PATCH] Port PR#431 from vvenc: refactor AArch64 Interpolation Filter 16x16 implementation (#431) * Move InterpolationFilter{ARM.h => _neon.cpp} Since this header is only used in one place and would not share any code with an eventual SVE implementation, simply move it to a .cpp file similar to MCTF.cpp. * Refactor simdFilter16xX_N8_neon The use of the vsrcv temporary array rather than simple local variables meant that LLVM emitted an unnecessary number of load/store instructions in the inner loops. Refactoring this to make the dependency between loop iterations more explicit allows for much nicer generated code. Running a video encoding job on a Neoverse V2 machine using the --preset=fast setting shows a ~1.8% improvement in reported FPS. --- .../CommonLib/arm/InterpolationFilterARM.h | 412 ------------------ .../arm/neon/InterpolationFilter_neon.cpp | 288 +++++++++++- 2 files changed, 287 insertions(+), 413 deletions(-) delete mode 100644 source/Lib/CommonLib/arm/InterpolationFilterARM.h diff --git a/source/Lib/CommonLib/arm/InterpolationFilterARM.h b/source/Lib/CommonLib/arm/InterpolationFilterARM.h deleted file mode 100644 index 06a7980c..00000000 --- a/source/Lib/CommonLib/arm/InterpolationFilterARM.h +++ /dev/null @@ -1,412 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -/** - * \file - * \brief Implementation of InterpolationFilter class - */ -// ==================================================================================================================== -// Includes -// ==================================================================================================================== - -#include "CommonDefARM.h" -#include "CommonLib/CommonDef.h" -#include "../InterpolationFilter.h" - - -namespace vvdec -{ - -#ifdef TARGET_SIMD_ARM -# if __ARM_ARCH >= 8 - -template<ARM_VEXT vext> -static void simdInterpolateN2_2D( const ClpRng& clpRng, - const Pel* src, - const ptrdiff_t srcStride, - Pel* dst, - const ptrdiff_t dstStride, - int width, - int height, - TFilterCoeff const* ch, - TFilterCoeff const* cv ) -{ - // #if EISENREICH_MA - // const int shift1st = IF_FILTER_PREC_BILINEAR - ( IF_INTERNAL_PREC_BILINEAR - clpRng.bd ); - // const int offset1st = 1 << ( shift1st - 1 ); - // - // const int shift2nd = 4; - // const int offset2nd = 1 << ( shift2nd - 1 ); - // - // int16x8_t mmOffset1 = vdupq_n_s16( offset1st ); - // int16x8_t mmOffset2 = vdupq_n_s16( offset2nd ); - // int16x8_t mmCoeffH = vdupq_n_s16( ch[1] ); - // int16x8_t mmCoeffV = vdupq_n_s16( cv[1] ); - // - // for( int row = -1; row < height; row++ ) - // { - // __m128i mmPix = _mm_loadu_si64( ( const __m128i* )( src ) ); - // __m128i mmPix1 = _mm_loadu_si64( ( const __m128i* )( src + 1 ) ); - // __m128i mmFiltered - // = _mm_add_epi16 ( mmOffset1, _mm_slli_epi16( mmPix, 4 ) ); - // mmFiltered = _mm_add_epi16 ( mmFiltered, _mm_mullo_epi16( _mm_sub_epi16( mmPix1, mmPix ), mmCoeffH ) ); - // mmFiltered = _mm_srai_epi16( mmFiltered, shift1st ); - // - // mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); - // } - // - // #else - const int shift1st = IF_FILTER_PREC_BILINEAR - ( IF_INTERNAL_PREC_BILINEAR - clpRng.bd ); - const int offset1st = 1 << ( shift1st - 1 ); - - const int shift2nd = 4; - const int offset2nd = 1 << ( shift2nd - 1 ); - - int16x8_t mmOffset1 = vdupq_n_s16( offset1st ); - int16x8_t mmOffset2 = vdupq_n_s16( offset2nd ); - int16x8_t mmCoeffH = vdupq_n_s16( ch[ 1 ] ); - int16x8_t mmCoeffV = vdupq_n_s16( cv[ 1 ] ); - - int16x8_t mmLastH[ 16 ]; - - int16x8_t mmLast4H; - - // workaround for over-sensitive compilers - mmLastH[ 0 ] = vdupq_n_s16( 0 ); - - int16x8_t shift1inv = vdupq_n_s16( -shift1st ); - int16x8_t shift2inv = vdupq_n_s16( -shift2nd ); - - for( int row = -1; row < height; row++ ) - { - int16x8_t mmPix = vld1q_s16( src ); - int16x8_t mmPix1 = vld1q_s16( src + 1 ); - - int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); - - mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); - mmFiltered = vshlq_s16( mmFiltered, shift1inv ); - - if( row >= 0 ) - { - int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mmLast4H, 16 ); - mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mmLast4H ), mmCoeffV ); - mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); - - vst1q_lane_s64( (int64_t*) dst, (int64x2_t) mmFiltered2, 0 ); - } - - mmLast4H = mmFiltered; - - for( int x = 4; x < width; x += 8 ) - { - int16x8_t mmPix = vld1q_s16( src + x ); - int16x8_t mmPix1 = vld1q_s16( src + x + 1 ); - - int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); - mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); - mmFiltered = vshlq_s16( mmFiltered, shift1inv ); - - int idx = x >> 3; - int16x8_t mLast = mmLastH[ idx ]; - mmLastH[ 
idx ] = mmFiltered; - - if( row >= 0 ) - { - int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mLast, 16 ); - mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mLast ), mmCoeffV ); - mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); - - vst1q_s16( ( dst + x ), mmFiltered2 ); - } - } - if( row >= 0 ) - dst += dstStride; - - src += srcStride; - } -} - -template<ARM_VEXT vext, bool isLast> -void simdFilter16xX_N8( const ClpRng& clpRng, - const Pel* src, - const ptrdiff_t srcStride, - Pel* dst, - const ptrdiff_t dstStride, - int width, - int height, - TFilterCoeff const* coeffH, - TFilterCoeff const* coeffV ) -{ - OFFSET( src, srcStride, -3, -3 ); - - int offset1st, offset2nd; - int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) ); - const int shift1st = IF_FILTER_PREC - headRoom; - int shift2nd = IF_FILTER_PREC; - int extHeight = height + 7; - // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be - // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 - - // shift1st -= headRoom; - offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st ); - - if( isLast ) - { - shift2nd += headRoom; - offset2nd = 1 << ( shift2nd - 1 ); - offset2nd += IF_INTERNAL_OFFS << IF_FILTER_PREC; - } - else - { - offset2nd = 0; - } - const int32x4_t voffset1 = vdupq_n_s32( offset1st ); - - const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() ); - const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() ); - - int64x1x2_t vcoeff0 = vld2_s64( (int64_t*) coeffH ); - // int16x8_t vcoeff1 = vld1q_s16(coeffV); // apparently it's faster to access a pointer - int16x8_t vsum; - -// MSVC requires neon variables to be initialised when their elements are set with vsetq_lane (it outputs a warning). However, this is not the case for datatypes like int16x8x4_t. -#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) - int32x4_t vsuma = vdupq_n_s32(0), vsumb = vdupq_n_s32(0); -#else - int32x4_t vsuma, vsumb; -#endif - - int32x4_t vsrcv[ 2 ][ 9 ]; - - int32x4_t invshift1st = vdupq_n_s32( -shift1st ); - int32x4_t invshift2nd = vdupq_n_s32( -shift2nd ); - - for( int row = 0; row < extHeight; row++ ) - { - -// MSVC requires neon variables to be initialised when their elements are set with vsetq_lane (it outputs a warning). However, this is not the case for datatypes like int16x8x4_t. 
-#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) - int32x4_t vsrc0 = vdupq_n_s32(0), vsrc1 = vdupq_n_s32(0); -#else - int32x4_t vsrc0, vsrc1; -#endif - - int16x4_t vsrca00, vsrca01, vsrca10, vsrca11; - int16x4_t vsrcb00, vsrcb01, vsrcb10, vsrcb11; - - vsrca00 = vld1_s16( &src[ 0 ] ); - vsrca01 = vld1_s16( &src[ 1 ] ); - vsrca10 = vld1_s16( &src[ 2 ] ); - vsrca11 = vld1_s16( &src[ 3 ] ); - - for( int j = 0; j < 2; j++ ) - { - // int16x8_t vsrca0, vsrca1, vsrcb0, vsrcb1; - - vsrcb00 = vld1_s16( &src[ ( j << 3 ) + 4 ] ); - vsrcb01 = vld1_s16( &src[ ( j << 3 ) + 5 ] ); - vsrcb10 = vld1_s16( &src[ ( j << 3 ) + 6 ] ); - vsrcb11 = vld1_s16( &src[ ( j << 3 ) + 7 ] ); - - GCC_WARNING_DISABLE_maybe_uninitialized // when building for aarch64 without LTO gcc complains about vsum{a,b},vsrc{0,1} not being initialized - - vsuma = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsuma, 0 ); - vsuma = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsuma, 1 ); - vsuma = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsuma, 2 ); - vsuma = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsuma, 3 ); - - vsumb = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsumb, 0 ); - vsumb = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsumb, 1 ); - vsumb = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsumb, 2 ); - vsumb = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ), vsumb, 3 ); - - vsrc1 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc1, 0 ); - vsrc1 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc1, 1 ); - vsrc1 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc1, 2 ); - vsrc1 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc1, 3 ); - - vsrca00 = vld1_s16( &src[ ( j << 3 ) + 8 ] ); - vsrca01 = vld1_s16( &src[ ( j << 3 ) + 9 ] ); - vsrca10 = vld1_s16( &src[ ( j << 3 ) + 10 ] ); - vsrca11 = vld1_s16( &src[ ( j << 3 ) + 11 ] ); - - vsrc0 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc0, 0 ); - vsrc0 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc0, 1 ); - vsrc0 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc0, 2 ); - vsrc0 = vsetq_lane_s32( vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ), vsrc0, 3 ); - - GCC_WARNING_RESET - - vsuma = vaddq_s32( vsuma, vsrc1 ); - vsumb = vaddq_s32( vsumb, vsrc0 ); - - vsuma = vaddq_s32( vsuma, voffset1 ); - vsumb = vaddq_s32( vsumb, voffset1 ); - - vsuma = vshlq_s32( vsuma, invshift1st ); - vsumb = vshlq_s32( vsumb, invshift1st ); - // vsum = vsetq_lane_s64( (int64_t) vreinterpret_s64_s16( vqrshrn_n_s32( vsuma, shift1st ) ), vsum, 0); - // vsum = vqrshrn_n_s32( vsumb, shift1st ); - // does not work, because shift1st is not accepted as a constant. 
- - vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); - - if( row < 7 ) - { - vsrcv[ j ][ row + 1 ] = (int32x4_t) vsum; - } - else - { - vsrcv[ j ][ 8 ] = (int32x4_t) vsum; - vsuma = vsumb = vdupq_n_s32( offset2nd ); - - for( int i = 0; i < 8; i += 2 ) - { - vsrc0 = vsrcv[ j ][ i + 1 ]; - vsrc1 = vsrcv[ j ][ i + 2 ]; - int16x4_t vsrc0l = vget_low_s16( (int16x8_t) vsrc0 ); // 0a 0b 0c 0d - int16x4_t vsrc0h = vget_high_s16( (int16x8_t) vsrc0 ); // 0e 0f 0g 0h - int16x4_t vsrc1l = vget_low_s16( (int16x8_t) vsrc1 ); // 1a 1b 1c 1d - int16x4_t vsrc1h = vget_high_s16( (int16x8_t) vsrc1 ); // 1e 1f 1g 1h - vsuma = vmlal_n_s16( vsuma, vsrc0l, coeffV[ i ] ); // 0a * c0 + offset2nd, 0b * c0 + offset2nd - vsuma = vmlal_n_s16( vsuma, vsrc1l, coeffV[ i + 1 ] ); // 1a * c1 + 0a * c0 + offset2nd, 1b * c1 + 0b * c0 + offset2nd - vsumb = vmlal_n_s16( vsumb, vsrc0h, coeffV[ i ] ); - vsumb = vmlal_n_s16( vsumb, vsrc1h, coeffV[ i + 1 ] ); - vsrcv[ j ][ i ] = vsrc0; - vsrcv[ j ][ i + 1 ] = vsrc1; - } - vsuma = vshlq_s32( vsuma, invshift2nd ); - vsumb = vshlq_s32( vsumb, invshift2nd ); - - vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); - - if( isLast ) // clip - { - vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) ); - } - - vst1q_s16( &dst[ j << 3 ], vsum ); - - INCY( dst, j * dstStride ); - } - } - - INCY( src, srcStride ); - } -} - -template<ARM_VEXT vext> -void InterpolationFilter::_initInterpolationFilterARM() -{ - // [taps][bFirst][bLast] - // m_filterHor[0][0][0] = simdFilter; - // m_filterHor[0][0][1] = simdFilter; - // m_filterHor[0][1][0] = simdFilter; - // m_filterHor[0][1][1] = simdFilter; - // - // m_filterHor[1][0][0] = simdFilter; - // m_filterHor[1][0][1] = simdFilter; - // m_filterHor[1][1][0] = simdFilter; - // m_filterHor[1][1][1] = simdFilter; - // - // m_filterHor[2][0][0] = simdFilter; - // m_filterHor[2][0][1] = simdFilter; - // m_filterHor[2][1][0] = simdFilter; - // m_filterHor[2][1][1] = simdFilter; - // - // m_filterVer[0][0][0] = simdFilter; - // m_filterVer[0][0][1] = simdFilter; - // m_filterVer[0][1][0] = simdFilter; - // m_filterVer[0][1][1] = simdFilter; - // - // m_filterVer[1][0][0] = simdFilter; - // m_filterVer[1][0][1] = simdFilter; - // m_filterVer[1][1][0] = simdFilter; - // m_filterVer[1][1][1] = simdFilter; - // - // m_filterVer[2][0][0] = simdFilter; - // m_filterVer[2][0][1] = simdFilter; - // m_filterVer[2][1][0] = simdFilter; - // m_filterVer[2][1][1] = simdFilter; - // - // m_filterCopy[0][0] = simdFilterCopy; - // m_filterCopy[0][1] = simdFilterCopy; - // m_filterCopy[1][0] = simdFilterCopy; - // m_filterCopy[1][1] = simdFilterCopy; - // - // m_filter4x4[0][0] = simdFilter4x4_N6; - // m_filter4x4[0][1] = simdFilter4x4_N6; - // - // m_filter4x4[1][0] = simdFilter4x4_N4; - // m_filter4x4[1][1] = simdFilter4x4_N4; - // - // m_filter8x8[0][0] = simdFilter8xX_N8; - // m_filter8x8[0][1] = simdFilter8xX_N8; - // - // m_filter8x8[1][0] = simdFilter8xX_N4; - // m_filter8x8[1][1] = simdFilter8xX_N4; - - m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8<vext, false>; - m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8<vext, true>; - - // m_filter16x16[1][0] = simdFilter16xX_N4; - // m_filter16x16[1][1] = simdFilter16xX_N4; - - m_filterN2_2D = simdInterpolateN2_2D<vext>; - - // m_weightedGeoBlk = xWeightedGeoBlk_SSE; -} - -# else // !__ARM_ARCH >= 8 - -template<ARM_VEXT vext> -void InterpolationFilter::_initInterpolationFilterARM() -{} - -# endif // !__ARM_ARCH >= 8 - -template void InterpolationFilter::_initInterpolationFilterARM<SIMDARM>(); - -#endif // #ifdef TARGET_SIMD_ARM - -} // namespace vvdec diff --git 
a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp index e1de9a76..25a23e0d 100644 --- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp +++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp @@ -40,4 +40,290 @@ POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------------------- */ -#include "../InterpolationFilterARM.h" +/** + * \file InterpolationFilter_neon.cpp + * \brief Neon implementation of InterpolationFilter for AArch64. + */ +// ==================================================================================================================== +// Includes +// ==================================================================================================================== + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "../InterpolationFilter.h" + +//! \ingroup CommonLib +//! \{ + +#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF + +namespace vvdec +{ + +static void simdInterpolateN2_2D_neon( const ClpRng& clpRng, const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, TFilterCoeff const *ch, TFilterCoeff const *cv ) +{ + const int shift1st = IF_FILTER_PREC_BILINEAR - ( IF_INTERNAL_PREC_BILINEAR - clpRng.bd ); + const int offset1st = 1 << ( shift1st - 1 ); + + const int shift2nd = 4; + const int offset2nd = 1 << ( shift2nd - 1 ); + + int16x8_t mmOffset1 = vdupq_n_s16( offset1st ); + int16x8_t mmOffset2 = vdupq_n_s16( offset2nd ); + int16x8_t mmCoeffH = vdupq_n_s16( ch[ 1 ] ); + int16x8_t mmCoeffV = vdupq_n_s16( cv[ 1 ] ); + + int16x8_t mmLastH[ 16 ]; + + int16x8_t mmLast4H; + + // workaround for over-sensitive compilers + mmLastH[ 0 ] = vdupq_n_s16( 0 ); + + int16x8_t shift1inv = vdupq_n_s16( -shift1st ); + int16x8_t shift2inv = vdupq_n_s16( -shift2nd ); + + for( int row = -1; row < height; row++ ) + { + int16x8_t mmPix = vld1q_s16( src ); + int16x8_t mmPix1 = vld1q_s16( src + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mmLast4H, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mmLast4H ), mmCoeffV ); + mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); + + vst1q_lane_s64( (int64_t*) dst, (int64x2_t) mmFiltered2, 0 ); + } + + mmLast4H = mmFiltered; + + for( int x = 4; x < width; x += 8 ) + { + int16x8_t mmPix = vld1q_s16( src + x ); + int16x8_t mmPix1 = vld1q_s16( src + x + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + int idx = x >> 3; + int16x8_t mLast = mmLastH[ idx ]; + mmLastH[ idx ] = mmFiltered; + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mLast, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mLast ), mmCoeffV ); + mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); + + vst1q_s16( ( dst + x ), mmFiltered2 ); + } + } + if( row >= 0 ) + dst += dstStride; + + src += srcStride; + } +} + +static int16x8_t simdFilter16xX_N8_half( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st ) +{ + int16x8_t vsrca00 = vld1q_s16( src + 0 ); + int16x8_t vsrca01 = vld1q_s16( src + 1 ); + int16x8_t vsrca10 = 
vld1q_s16( src + 2 ); + int16x8_t vsrca11 = vld1q_s16( src + 3 ); + int16x8_t vsrcb00 = vld1q_s16( src + 4 ); + int16x8_t vsrcb01 = vld1q_s16( src + 5 ); + int16x8_t vsrcb10 = vld1q_s16( src + 6 ); + int16x8_t vsrcb11 = vld1q_s16( src + 7 ); + + int32x4_t a0 = vmull_s16( vget_low_s16( vsrca00 ), vget_low_s16( ch ) ); + int32x4_t a1 = vmull_s16( vget_low_s16( vsrca01 ), vget_low_s16( ch ) ); + int32x4_t a2 = vmull_s16( vget_low_s16( vsrca10 ), vget_low_s16( ch ) ); + int32x4_t a3 = vmull_s16( vget_low_s16( vsrca11 ), vget_low_s16( ch ) ); + + int32x4_t b0 = vmull_s16( vget_low_s16( vsrcb00 ), vget_low_s16( ch ) ); + int32x4_t b1 = vmull_s16( vget_low_s16( vsrcb01 ), vget_low_s16( ch ) ); + int32x4_t b2 = vmull_s16( vget_low_s16( vsrcb10 ), vget_low_s16( ch ) ); + int32x4_t b3 = vmull_s16( vget_low_s16( vsrcb11 ), vget_low_s16( ch ) ); + + a0 = vmlal_s16( a0, vget_high_s16( vsrca00 ), vget_high_s16( ch ) ); + a1 = vmlal_s16( a1, vget_high_s16( vsrca01 ), vget_high_s16( ch ) ); + a2 = vmlal_s16( a2, vget_high_s16( vsrca10 ), vget_high_s16( ch ) ); + a3 = vmlal_s16( a3, vget_high_s16( vsrca11 ), vget_high_s16( ch ) ); + + b0 = vmlal_s16( b0, vget_high_s16( vsrcb00 ), vget_high_s16( ch ) ); + b1 = vmlal_s16( b1, vget_high_s16( vsrcb01 ), vget_high_s16( ch ) ); + b2 = vmlal_s16( b2, vget_high_s16( vsrcb10 ), vget_high_s16( ch ) ); + b3 = vmlal_s16( b3, vget_high_s16( vsrcb11 ), vget_high_s16( ch ) ); + + int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) ); + int32x4_t vsumb = vpaddq_s32( vpaddq_s32( b0, b1 ), vpaddq_s32( b2, b3 ) ); + + vsuma = vaddq_s32( vsuma, voffset1 ); + vsumb = vaddq_s32( vsumb, voffset1 ); + + vsuma = vshlq_s32( vsuma, invshift1st ); + vsumb = vshlq_s32( vsumb, invshift1st ); + + return vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) ); +} + +static int16x8x2_t simdFilter16xX_N8_step( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st ) +{ + int16x8_t a = simdFilter16xX_N8_half( src + 0, ch, voffset1, invshift1st ); + int16x8_t b = simdFilter16xX_N8_half( src + 8, ch, voffset1, invshift1st ); + return ( int16x8x2_t ){ a, b }; +} + +template<bool isLast> +static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV ) +{ + OFFSET( src, srcStride, -3, -3 ); + + // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be + // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 + const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) ); + const int shift1st = IF_FILTER_PREC - headRoom; + const int shift2nd = IF_FILTER_PREC + headRoom; + + const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st ); + int offset2nd; + if( isLast ) + { + offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC ); + } + else + { + offset2nd = 0; + } + + const int32x4_t voffset1 = vdupq_n_s32( offset1st ); + + const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() ); + const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() ); + + int16x8_t ch = vld1q_s16( coeffH ); + int16x8_t cv = vld1q_s16( coeffV ); + + int32x4_t invshift1st = vdupq_n_s32( -shift1st ); + int32x4_t invshift2nd = vdupq_n_s32( -shift2nd ); + + int16x8x2_t vsrcv0 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + int16x8x2_t vsrcv1 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += 
srcStride; + int16x8x2_t vsrcv2 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + int16x8x2_t vsrcv3 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + int16x8x2_t vsrcv4 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + int16x8x2_t vsrcv5 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + int16x8x2_t vsrcv6 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + + do + { + int16x8x2_t vsrcv7 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st ); + src += srcStride; + + int32x4_t vsum0 = vdupq_n_s32( offset2nd ); + int32x4_t vsum1 = vdupq_n_s32( offset2nd ); + int32x4_t vsum2 = vdupq_n_s32( offset2nd ); + int32x4_t vsum3 = vdupq_n_s32( offset2nd ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0.val[ 0 ] ), cv, 0 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0.val[ 0 ] ), cv, 0 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv0.val[ 1 ] ), cv, 0 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv0.val[ 1 ] ), cv, 0 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1.val[ 0 ] ), cv, 1 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1.val[ 0 ] ), cv, 1 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv1.val[ 1 ] ), cv, 1 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv1.val[ 1 ] ), cv, 1 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2.val[ 0 ] ), cv, 2 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2.val[ 0 ] ), cv, 2 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv2.val[ 1 ] ), cv, 2 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv2.val[ 1 ] ), cv, 2 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3.val[ 0 ] ), cv, 3 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3.val[ 0 ] ), cv, 3 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv3.val[ 1 ] ), cv, 3 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv3.val[ 1 ] ), cv, 3 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4.val[ 0 ] ), cv, 4 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4.val[ 0 ] ), cv, 4 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv4.val[ 1 ] ), cv, 4 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv4.val[ 1 ] ), cv, 4 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5.val[ 0 ] ), cv, 5 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5.val[ 0 ] ), cv, 5 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv5.val[ 1 ] ), cv, 5 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv5.val[ 1 ] ), cv, 5 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6.val[ 0 ] ), cv, 6 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6.val[ 0 ] ), cv, 6 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv6.val[ 1 ] ), cv, 6 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv6.val[ 1 ] ), cv, 6 ); + + vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7.val[ 0 ] ), cv, 7 ); + vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7.val[ 0 ] ), cv, 7 ); + vsum2 = vmlal_laneq_s16( vsum2, vget_low_s16( vsrcv7.val[ 1 ] ), cv, 7 ); + vsum3 = vmlal_laneq_s16( vsum3, vget_high_s16( vsrcv7.val[ 1 ] ), cv, 7 ); + + int16x8_t vsum01, vsum23; + if( isLast ) // clip + { + vsum0 = vshlq_s32( vsum0, invshift2nd ); + vsum1 = vshlq_s32( vsum1, invshift2nd ); + vsum2 = vshlq_s32( vsum2, invshift2nd ); + vsum3 = vshlq_s32( vsum3, invshift2nd ); + + vsum01 = vcombine_s16( vqmovn_s32( vsum0 ), 
vqmovn_s32( vsum1 ) ); + vsum23 = vcombine_s16( vqmovn_s32( vsum2 ), vqmovn_s32( vsum3 ) ); + + vsum01 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum01 ) ); + vsum23 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum23 ) ); + } + else + { + vsum01 = vcombine_s16( vqshrn_n_s32( vsum0, IF_FILTER_PREC ), vqshrn_n_s32( vsum1, IF_FILTER_PREC ) ); + vsum23 = vcombine_s16( vqshrn_n_s32( vsum2, IF_FILTER_PREC ), vqshrn_n_s32( vsum3, IF_FILTER_PREC ) ); + } + + vsrcv0 = vsrcv1; + vsrcv1 = vsrcv2; + vsrcv2 = vsrcv3; + vsrcv3 = vsrcv4; + vsrcv4 = vsrcv5; + vsrcv5 = vsrcv6; + vsrcv6 = vsrcv7; + + vst1q_s16( dst + 0, vsum01 ); + vst1q_s16( dst + 8, vsum23 ); + dst += dstStride; + } while( --height != 0 ); +} + +template<> +void InterpolationFilter::_initInterpolationFilterARM<NEON>() +{ + m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8_neon<false>; + m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon<true>; + + m_filterN2_2D = simdInterpolateN2_2D_neon; +} + +} // namespace vvdec +#endif +//! \}
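
Note for reviewers (illustrative only, not part of the applied patch): the heart of the refactor is replacing the indexed vsrcv[ 2 ][ 9 ] row buffer of the old implementation with the named locals vsrcv0..vsrcv7, which are rotated at the end of each loop iteration. The sketch below shows that rolling-register pattern in isolation, reduced to a hypothetical 4-tap vertical filter over eight lanes. The function name vfilter4_rolling, the tap count, and the fixed rounding shift of 6 are assumptions made for this example; only the pattern itself mirrors the patch.

#include <arm_neon.h>
#include <stddef.h>

// Hypothetical example, not vvdec code: an 8-lane, 4-tap vertical filter that
// keeps its sliding window of rows in named locals instead of an indexed array.
static void vfilter4_rolling( const int16_t* src, ptrdiff_t srcStride, int16_t* dst, ptrdiff_t dstStride, int height, const int16_t coeff[ 4 ] )
{
  // Pre-load the first taps-1 rows of the window. Loop-carried values that are
  // plain locals can stay in Neon registers across iterations instead of being
  // spilled to a stack-allocated array.
  int16x8_t r0 = vld1q_s16( src ); src += srcStride;
  int16x8_t r1 = vld1q_s16( src ); src += srcStride;
  int16x8_t r2 = vld1q_s16( src ); src += srcStride;

  do
  {
    int16x8_t r3 = vld1q_s16( src );
    src += srcStride;

    // Widening multiply-accumulate of the four taps, low and high halves.
    int32x4_t lo = vmull_n_s16( vget_low_s16( r0 ), coeff[ 0 ] );
    int32x4_t hi = vmull_n_s16( vget_high_s16( r0 ), coeff[ 0 ] );
    lo = vmlal_n_s16( lo, vget_low_s16( r1 ), coeff[ 1 ] );
    hi = vmlal_n_s16( hi, vget_high_s16( r1 ), coeff[ 1 ] );
    lo = vmlal_n_s16( lo, vget_low_s16( r2 ), coeff[ 2 ] );
    hi = vmlal_n_s16( hi, vget_high_s16( r2 ), coeff[ 2 ] );
    lo = vmlal_n_s16( lo, vget_low_s16( r3 ), coeff[ 3 ] );
    hi = vmlal_n_s16( hi, vget_high_s16( r3 ), coeff[ 3 ] );

    // Round, narrow and store. A real kernel applies offset/shift/clip as
    // simdFilter16xX_N8_neon does; the shift of 6 is only a placeholder.
    vst1q_s16( dst, vcombine_s16( vqrshrn_n_s32( lo, 6 ), vqrshrn_n_s32( hi, 6 ) ) );
    dst += dstStride;

    // Slide the window. These assignments are register moves the compiler can
    // rename away, whereas the old vsrcv[ i ] = vsrcv[ i + 1 ] array shuffle
    // forced load/store traffic through memory on every row.
    r0 = r1;
    r1 = r2;
    r2 = r3;
  } while( --height != 0 );
}

Rotating locals rather than shuffling an array is what lets the compiler keep the whole eight-row window of simdFilter16xX_N8_neon in registers; the per-row load/store traffic this removes is the source of the ~1.8% FPS improvement quoted in the commit message.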