From ac10055ff13dbc0f4035a4bf0b6b94cccdd9bc3d Mon Sep 17 00:00:00 2001 From: Adam Wieckowski <70575289+adamjw24@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:04:02 +0100 Subject: [PATCH] IFP and RC harmonization, fixes for ARM, early GCC 11 (#350) --- .github/workflows/Build.yml | 2 +- .gitlab-ci-internal.yml | 2 + CMakeLists.txt | 29 +++- Makefile | 4 + cmake/modules/vvencCompilerSupport.cmake | 18 ++ include/vvenc/vvencCfg.h | 14 +- source/Lib/CommonLib/CommonDef.h | 2 - source/Lib/CommonLib/DepQuant.cpp | 6 +- source/Lib/CommonLib/DepQuant.h | 4 +- source/Lib/CommonLib/InterPrediction.cpp | 16 +- source/Lib/CommonLib/InterPrediction.h | 6 +- source/Lib/CommonLib/Picture.cpp | 5 + source/Lib/CommonLib/Picture.h | 2 + source/Lib/CommonLib/Slice.cpp | 19 ++- source/Lib/CommonLib/Slice.h | 3 +- source/Lib/CommonLib/TimeProfiler.h | 1 + source/Lib/CommonLib/TrQuant.cpp | 2 +- source/Lib/CommonLib/TypeDef.h | 2 + source/Lib/CommonLib/UnitTools.cpp | 10 +- source/Lib/CommonLib/UnitTools.h | 5 +- source/Lib/CommonLib/arm/BufferARM.h | 18 +- source/Lib/CommonLib/arm/RdCostARM.h | 103 +----------- source/Lib/CommonLib/x86/CommonDefX86.cpp | 4 +- source/Lib/CommonLib/x86/DepQuantX86.h | 10 -- source/Lib/CommonLib/x86/FixMissingIntrin.h | 2 + .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 14 +- source/Lib/EncoderLib/EncCu.cpp | 28 ++-- source/Lib/EncoderLib/EncGOP.cpp | 158 ++++++++++++++---- source/Lib/EncoderLib/EncGOP.h | 2 + source/Lib/EncoderLib/EncLib.cpp | 8 + source/Lib/EncoderLib/EncSlice.cpp | 26 +-- source/Lib/EncoderLib/GOPCfg.cpp | 18 +- source/Lib/EncoderLib/GOPCfg.h | 10 +- source/Lib/EncoderLib/InterSearch.cpp | 70 ++++---- source/Lib/EncoderLib/InterSearch.h | 6 +- source/Lib/EncoderLib/PreProcess.cpp | 12 +- source/Lib/apputils/VVEncAppCfg.h | 24 ++- source/Lib/vvenc/vvencCfg.cpp | 71 +++++++- source/Lib/vvenc/vvencimpl.cpp | 6 + 39 files changed, 462 insertions(+), 280 deletions(-) diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml index 708003b69..05d41e438 100644 --- a/.github/workflows/Build.yml +++ b/.github/workflows/Build.yml @@ -73,6 +73,6 @@ jobs: run: | mkdir build cd build - cmake .. -DCMAKE_BUILD_TYPE=Release -A "${{ matrix.config.msvc_arch }}" + cmake .. -DCMAKE_BUILD_TYPE=Release -DVVENC_OVERRIDE_COMPILER_CHECK=ON -A "${{ matrix.config.msvc_arch }}" cmake --build . --config Release shell: cmd diff --git a/.gitlab-ci-internal.yml b/.gitlab-ci-internal.yml index 4e918ebf1..9f94dac8d 100644 --- a/.gitlab-ci-internal.yml +++ b/.gitlab-ci-internal.yml @@ -214,6 +214,7 @@ test_vc193x_Win32: extends: .build_test_msvc_template variables: MSVC_ARCH: Win32 + CONFIG_OPTIONS: "-DVVENC_OVERRIDE_COMPILER_CHECK=1" tags: - vc193x @@ -221,6 +222,7 @@ test_vc193x: extends: .build_test_msvc_template variables: MSVC_ARCH: x64 + CONFIG_OPTIONS: "-DVVENC_OVERRIDE_COMPILER_CHECK=1" tags: - vc193x diff --git a/CMakeLists.txt b/CMakeLists.txt index 5565dcb4c..85e6a783e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,10 +31,31 @@ if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm" endif() # we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86 -set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" ) +set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" ) set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" ) include( vvencCompilerSupport ) +check_problematic_compiler( VVENC_PROBLEMATIC_COMPILER "MSVC" 19.38 "" ) + +if( VVENC_PROBLEMATIC_COMPILER ) + set( VVENC_OVERRIDE_COMPILER_CHECK OFF CACHE BOOL "Build with known problematic compiler version" ) + + if( VVENC_OVERRIDE_COMPILER_CHECK ) + set( VVENC_PROBLEMATIC_COMPILER_MSG_TYPE WARNING ) + set( VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE + "The performance will not be optimal due to workarounds." ) + else() + set( VVENC_PROBLEMATIC_COMPILER_MSG_TYPE FATAL_ERROR ) + set( VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE + "Set -DVVENC_OVERRIDE_COMPILER_CHECK=ON to build with this compiler anyways, which enables workarounds impacting performance.") + endif() + + message( ${VVENC_PROBLEMATIC_COMPILER_MSG_TYPE} + "Binaries compiled with ${CMAKE_CXX_COMPILER} version ${CMAKE_CXX_COMPILER_VERSION} are known not to behave as intended. " + "The problematic version range is ${VVENC_PROBLEMATIC_COMPILER_VERSION_RANGE}. Please consider using a different compiler.\n" + ${VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE} ) + +endif() # enable sse4.1 build for all source files for gcc and clang if( VVENC_ENABLE_X86_SIMD ) @@ -81,14 +102,14 @@ endif() # enable install target set( VVENC_ENABLE_INSTALL ON CACHE BOOL "Enable or disable install target" ) -# enable postfix +# enable postfix set( VVENC_ENABLE_BUILD_TYPE_POSTFIX OFF CACHE BOOL "Enable or disable build type postfix for apps and libs" ) set( VVENC_ENABLE_LINK_TIME_OPT ON CACHE BOOL "Enable link time optimization for release and profile builds" ) set( VVENC_ENABLE_THIRDPARTY_JSON ON CACHE BOOL "Enable use of thirdparty json library" ) -set( VVENC_INSTALL_FULLFEATURE_APP OFF CACHE BOOL "Install the full-feature app: vvencFFapp" ) +set( VVENC_INSTALL_FULLFEATURE_APP OFF CACHE BOOL "Install the full-feature app: vvencFFapp" ) if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) @@ -300,7 +321,7 @@ if( VVENC_ENABLE_INSTALL ) set( CMAKE_INSTALL_RPATH ${RPATH_BASE} ${RPATH_BASE}/${RPATH_REL_DIR} ) message( STATUS "CMAKE_INSTALL_RPATH=${CMAKE_INSTALL_RPATH}" ) - endif() + endif() endif() diff --git a/Makefile b/Makefile index d0da05c8e..3cb0353d0 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,10 @@ ifneq ($(install-ffapp),) CONFIG_OPTIONS += -DVVENC_INSTALL_FULLFEATURE_APP=$(install-ffapp) endif +ifneq ($(override-compiler-check),) +CONFIG_OPTIONS += -DVVENC_OVERRIDE_COMPILER_CHECK=$(override-compiler-check) +endif + ifeq ($(j),) # Query cmake for the number of cores NUM_JOBS := $(shell cmake -P cmake/modules/vvencNumCores.cmake) diff --git a/cmake/modules/vvencCompilerSupport.cmake b/cmake/modules/vvencCompilerSupport.cmake index 4d54f90ea..c6e4d170f 100644 --- a/cmake/modules/vvencCompilerSupport.cmake +++ b/cmake/modules/vvencCompilerSupport.cmake @@ -83,3 +83,21 @@ function( _emscripten_enable_wasm_simd128 ) set( CMAKE_REQUIRED_FLAGS -msimd128 PARENT_SCOPE ) endif() endfunction() + +function( check_problematic_compiler output_var compiler_id first_bad_version first_fixed_version ) + if( CMAKE_CXX_COMPILER_ID STREQUAL "${compiler_id}" + AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "${first_bad_version}" + AND ( + NOT "${first_fixed_version}" + OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "${first_fixed_version}" ) ) + + set( ${output_var} TRUE PARENT_SCOPE ) + + if( "${first_fixed_version}" ) + set( ${output_var}_VERSION_RANGE "(${first_bad_version}...${first_fixed_version}]" PARENT_SCOPE ) + else() + set( ${output_var}_VERSION_RANGE "(${first_bad_version}...)" PARENT_SCOPE ) + endif() + + endif() +endfunction() diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h index c1f46ced9..c54e6bc85 100644 --- a/include/vvenc/vvencCfg.h +++ b/include/vvenc/vvencCfg.h @@ -430,7 +430,7 @@ typedef struct vvenc_config int m_framesToBeEncoded; // number of encoded frames (default: 0, all) int m_inputBitDepth[ 2 ]; // bit-depth of input pictures (2d array for luma,chroma) - int m_numThreads; // number of worker threads ( if <0: <720p 4threads, else 8threads (limited to available cores)) + int m_numThreads; // number of worker threads ( if <0: <720p 4threads, <5K 2880p 8threads, else 12threads (limited to available cores)) int m_QP; // QP value of key-picture (0-63, default: 32) int m_RCTargetBitrate; // target bitrate in bps (default: 0 (RC disabled)) @@ -761,7 +761,7 @@ typedef struct vvenc_config bool m_picReordering; bool m_reservedFlag; bool m_poc0idr; - int8_t m_fppLinesSynchro; + int8_t m_ifpLines; bool m_blockImportanceMapping; bool m_saoScc; bool m_addGOP32refPics; @@ -774,8 +774,14 @@ typedef struct vvenc_config // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate). // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate. // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier - int m_forceScc; - double m_reservedDouble[9]; + int8_t m_forceScc; + bool m_ifp; + + int8_t m_reservedInt8[2]; + + int m_minIntraDist; + int m_reservedInt; + double m_reservedDouble[8]; // internal state variables bool m_configDone; // state variable, Private context used for internal data ( do not change ) diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index b804a2e49..e5f5ea64d 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -501,8 +501,6 @@ static constexpr uint8_t MAX_TMP_BUFS = 6; static constexpr int QPA_MAX_NOISE_LEVELS = 8; -static constexpr int FPPLS_ALF_DERIVE_LINES = 1; ///< number of CTU lines for ALF filter derivation -static constexpr int FPPLS_CCALF_DERIVE_LINES = 1; ///< number of CTU lines for CCALF filter derivation // ==================================================================================================================== diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index a77964072..76b7e99de 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -1436,7 +1436,7 @@ void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffB const uint32_t log2TrHeight = Log2(height); const bool isLfnstApplied = tu.cu->lfnstIdx > 0 && (CU::isSepTree(*tu.cu) ? true : isLuma(compID)); const bool enableScalingLists = getUseScalingList(width, height, (tu.mtsIdx[compID] == MTS_SKIP), isLfnstApplied); - static_cast(p)->quant( tu, pSrc, compID, cQP, Quant::m_dLambda, ctx, uiAbsSum, enableScalingLists, Quant::getQuantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) ); + p->quant( tu, pSrc, compID, cQP, Quant::m_dLambda, ctx, uiAbsSum, enableScalingLists, Quant::getQuantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) ); } else { @@ -1460,7 +1460,7 @@ void DepQuant::dequant( const TransformUnit& tu, CoeffBuf& dstCoeff, const Compo const uint32_t log2TrHeight = Log2(height); const bool isLfnstApplied = tu.cu->lfnstIdx > 0 && (CU::isSepTree(*tu.cu) ? true : isLuma(compID)); const bool enableScalingLists = getUseScalingList(width, height, (tu.mtsIdx[compID] == MTS_SKIP), isLfnstApplied); - static_cast(p)->dequant( tu, dstCoeff, compID, cQP, enableScalingLists, Quant::getDequantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) ); + p->dequant( tu, dstCoeff, compID, cQP, enableScalingLists, Quant::getDequantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) ); } else { @@ -1472,7 +1472,7 @@ void DepQuant::init( int rdoq, bool useRDOQTS, int thrVal ) { QuantRDOQ2::init( rdoq, useRDOQTS, thrVal ); - static_cast(p)->init( thrVal ); + p->init( thrVal ); } } // namespace vvenc diff --git a/source/Lib/CommonLib/DepQuant.h b/source/Lib/CommonLib/DepQuant.h index ba8de5339..3cb56318a 100644 --- a/source/Lib/CommonLib/DepQuant.h +++ b/source/Lib/CommonLib/DepQuant.h @@ -241,8 +241,8 @@ class DepQuantImpl public: virtual ~DepQuantImpl() {} virtual void quant ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ) = 0; - virtual void dequant ( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* quantCoeff ); - virtual void init ( int dqTrVal ); + void dequant ( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* quantCoeff ); + void init ( int dqTrVal ); protected: DQIntern::Quantizer m_quant; diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp index 8d9e1c6ee..0f5a55216 100644 --- a/source/Lib/CommonLib/InterPrediction.cpp +++ b/source/Lib/CommonLib/InterPrediction.cpp @@ -246,7 +246,7 @@ void InterPrediction::destroy() m_IBCBuffer.destroy(); } -void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int ctuSize, const int fppLinesSynchro ) +void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int ctuSize, const int ifpLines ) { // if it has been initialised before, but the chroma format has changed, release the memory and start again. if( m_yuvPred[L0].getOrigin( COMP_Y ) != nullptr && m_currChromaFormat != chFormat ) @@ -279,7 +279,7 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int c m_IBCBufferWidth = g_IBCBufferSize / ctuSize; m_IBCBuffer.create(UnitArea(chFormat, Area(0, 0, m_IBCBufferWidth, ctuSize))); } - InterPredInterpolation::m_fppLinesSynchro = fppLinesSynchro; + InterPredInterpolation::m_ifpLines = ifpLines; } // ==================================================================================================================== @@ -615,7 +615,7 @@ InterPredInterpolation::InterPredInterpolation() , m_skipPROF(false) , m_encOnly(false) , m_isBi(false) - , m_fppLinesSynchro(0) + , m_ifpLines(0) { } @@ -727,7 +727,7 @@ void InterPredInterpolation::xPredInterBlk ( const ComponentID compID, const Cod bool wrapRef = false; Mv mv(_mv); - CHECKD( m_fppLinesSynchro && !srcPadBuf && !CU::isMvInRangeFPP( cu[compID].y, cu[compID].height, mv.ver, m_fppLinesSynchro, *cu.cs->pcv, getComponentScaleY(compID, chFmt) ), "xPredInterBlk: CTU line-wise FPP MV restriction failed!\n" ); + CHECKD( m_ifpLines && !srcPadBuf && cu.cs->picture != refPic && !CU::isMvInRangeFPP( cu[compID].y, cu[compID].height, mv.ver, m_ifpLines, *cu.cs->pcv, getComponentScaleY(compID, chFmt) ), "xPredInterBlk: CTU line-wise FPP MV restriction failed!\n" ); if( !isIBC && cu.cs->pcv->wrapArround ) { wrapRef = wrapClipMv( mv, cu.blocks[0].pos(), cu.blocks[0].size(), *cu.cs); @@ -1796,7 +1796,7 @@ void InterPredInterpolation::xPredAffineBlk(const ComponentID compID, const Codi iMvScaleTmpVer = curMv.ver; } - CHECKD( m_fppLinesSynchro && !CU::isMvInRangeFPP( puY + h, blockHeight, iMvScaleTmpVer, m_fppLinesSynchro, *pps.pcv, iScaleY ), "xPredAffineBlk: FPP MV restriction failed!\n" ); + CHECKD( m_ifpLines && !CU::isMvInRangeFPP( puY + h, blockHeight, iMvScaleTmpVer, m_ifpLines, *pps.pcv, iScaleY ), "xPredAffineBlk: FPP MV restriction failed!\n" ); // get the MV in high precision int xFrac, yFrac, xInt, yInt; @@ -1896,10 +1896,10 @@ void InterPredInterpolation::xPredAffineBlk(const ComponentID compID, const Codi } -bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const Mv* _mv, const int fppLinesSynchro, const int mvPrecShift ) +bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const Mv* _mv, const int ifpLines, const int mvPrecShift ) { const PreCalcValues& pcv = *cu.cs->pcv; - if( cu.ly() >= ( ( pcv.heightInCtus - 1 - fppLinesSynchro ) << pcv.maxCUSizeLog2 ) ) + if( cu.ly() >= ( ( pcv.heightInCtus - 1 - ifpLines ) << pcv.maxCUSizeLog2 ) ) return true; const ChromaFormat chFmt = cu.chromaFormat; @@ -1941,7 +1941,7 @@ bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const } const bool subblkMVSpreadOverLimit = InterPrediction::isSubblockVectorSpreadOverLimit(iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY, cu.interDir); - const int yRefMax = ( ( ( cu.ly() >> pcv.maxCUSizeLog2 ) + fppLinesSynchro + 1 ) << pcv.maxCUSizeLog2 ) - 1; + const int yRefMax = ( ( ( cu.ly() >> pcv.maxCUSizeLog2 ) + ifpLines + 1 ) << pcv.maxCUSizeLog2 ) - 1; const int dctifMarginVerBot = 4; auto roundMvVal = [&](int mvVal, int shift) diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index 0c836f355..b50d96603 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -82,7 +82,7 @@ class InterPredInterpolation InterpolationFilter m_if; Pel* m_filteredBlock [LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][MAX_NUM_COMP]; Pel* m_filteredBlockTmp [LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][MAX_NUM_COMP]; - int m_fppLinesSynchro; + int m_ifpLines; int xRightShiftMSB ( int numer, int denom ); void xApplyBDOF ( PelBuf& yuvDst, const ClpRng& clpRng ); @@ -122,7 +122,7 @@ class InterPredInterpolation PelUnitBuf &predDst, PelUnitBuf &predSrc0, PelUnitBuf &predSrc1); static bool isSubblockVectorSpreadOverLimit(int a, int b, int c, int d, int predType); - bool xIsAffineMvInRangeFPP (const CodingUnit& cu, const Mv* _mv, const int fppLinesSynchro, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL); + bool xIsAffineMvInRangeFPP (const CodingUnit& cu, const Mv* _mv, const int ifpLines, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL); }; class DMVR : public InterPredInterpolation @@ -171,7 +171,7 @@ class InterPrediction : public DMVR InterPrediction(); virtual ~InterPrediction(); - void init ( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, const int fppLinesSynchro = 0 ); + void init ( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, const int ifpLines = 0 ); void destroy (); // inter diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp index 730863722..c011dacd1 100644 --- a/source/Lib/CommonLib/Picture.cpp +++ b/source/Lib/CommonLib/Picture.cpp @@ -161,6 +161,7 @@ Picture::Picture() , isFinished ( false ) , isLongTerm ( false ) , isFlush ( false ) + , isInProcessList ( false ) , precedingDRAP ( false ) , gopEntry ( nullptr ) , refCounter ( 0 ) @@ -226,6 +227,7 @@ void Picture::reset() isFinished = false; isLongTerm = false; isFlush = false; + isInProcessList = false; isMeanQPLimited = false; precedingDRAP = false; @@ -236,6 +238,9 @@ void Picture::reset() gopAdaptedQP = 0; actualHeadBits = 0; actualTotalBits = 0; + encRCPic = nullptr; + picApsGlobal = nullptr; + refApsGlobal = nullptr; std::fill_n( m_sharedBufs, (int)NUM_PIC_TYPES, nullptr ); std::fill_n( m_bufsOrigPrev, NUM_QPA_PREV_FRAMES, nullptr ); diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h index a6c052bdb..46a2a54c2 100644 --- a/source/Lib/CommonLib/Picture.h +++ b/source/Lib/CommonLib/Picture.h @@ -114,6 +114,7 @@ struct PicApsGlobal{ int poc; unsigned tid; bool initalized = false; + int refCnt = 0; ParameterSetMap apsMap; PicApsGlobal( int _p ) : poc(_p), tid(MAX_UINT), apsMap( MAX_NUM_APS * MAX_NUM_APS_TYPE ) {} PicApsGlobal( int _p, unsigned _t ) : poc(_p), tid(_t), apsMap( MAX_NUM_APS * MAX_NUM_APS_TYPE ) {} @@ -216,6 +217,7 @@ struct Picture : public UnitArea bool isFinished; bool isLongTerm; bool isFlush; + bool isInProcessList; bool precedingDRAP; // preceding a DRAP picture in decoding order const GOPEntry* gopEntry; diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp index e3643fdbf..5c15f6039 100644 --- a/source/Lib/CommonLib/Slice.cpp +++ b/source/Lib/CommonLib/Slice.cpp @@ -443,7 +443,7 @@ void Slice::updateRefPicCounter( int step ) } } -bool Slice::checkRefPicsReconstructed() const +bool Slice::checkAllRefPicsReconstructed() const { for ( int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++ ) { @@ -460,6 +460,23 @@ bool Slice::checkRefPicsReconstructed() const return true; } +bool Slice::checkAllRefPicsAccessible() const +{ + for ( int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++ ) + { + int numOfActiveRef = numRefIdx[ refList ]; + for ( int i = 0; i < numOfActiveRef; i++ ) + { + if ( ! refPicList[ refList ][ i ]->isInProcessList ) + { + return false; + } + } + } + + return true; +} + void Slice::checkColRefIdx(uint32_t curSliceSegmentIdx, const Picture* pic) const { Slice* curSlice = pic->slices[ curSliceSegmentIdx ]; diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h index 9097a0ade..8e0001979 100644 --- a/source/Lib/CommonLib/Slice.h +++ b/source/Lib/CommonLib/Slice.h @@ -1265,7 +1265,8 @@ class Slice void resetSlicePart(); void constructRefPicList(const PicList& rcListPic, bool extBorder, const bool usingLongTerm = true); void updateRefPicCounter( int step ); - bool checkRefPicsReconstructed() const; + bool checkAllRefPicsAccessible() const; + bool checkAllRefPicsReconstructed() const; void setRefPOCList(); void setSMVDParam(); void checkColRefIdx(uint32_t curSliceSegmentIdx, const Picture* pic) const; diff --git a/source/Lib/CommonLib/TimeProfiler.h b/source/Lib/CommonLib/TimeProfiler.h index c0a50e39d..19955488d 100644 --- a/source/Lib/CommonLib/TimeProfiler.h +++ b/source/Lib/CommonLib/TimeProfiler.h @@ -91,6 +91,7 @@ namespace vvenc { E_( P_INTRA_CHROMA ) \ E_( P_INTRA ) \ E_( P_QUANT ) \ + E_( P_DEQUANT ) \ E_( P_TRAFO ) \ E_( P_RESHAPER ) \ E_( P_DEBLOCK_FILTER ) \ diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp index e75e85063..18f7afd0f 100644 --- a/source/Lib/CommonLib/TrQuant.cpp +++ b/source/Lib/CommonLib/TrQuant.cpp @@ -292,7 +292,7 @@ void TrQuant::xDeQuant(const TransformUnit& tu, const ComponentID &compID, const QpParam &cQP) { - PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_QUANT ); + PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_DEQUANT ); m_quant->dequant( tu, dstCoeff, compID, cQP ); } diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 5ee3fd5f9..e08aaf3e8 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -71,6 +71,8 @@ namespace vvenc { #define FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED 1 // Some compilers fail on particular code fragments, remove this when the compiler is fixed (or new version is used) +#define IFP_RC_DETERMINISTIC 0 // Enables Rate Control deterministic behavior (same results) when using IFP + // ==================================================================================================================== // General settings // ==================================================================================================================== diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp index 3a046355e..f5175df08 100644 --- a/source/Lib/CommonLib/UnitTools.cpp +++ b/source/Lib/CommonLib/UnitTools.cpp @@ -3548,18 +3548,18 @@ bool CU::isMTSAllowed(const CodingUnit &cu, const ComponentID compID) return mtsAllowed; } -bool CU::isMvInRangeFPP( const int yB, const int nH, const int yMv, const int fppLinesSynchro, const PreCalcValues& pcv, const int chromaShift, const int mvPrecShift ) +bool CU::isMvInRangeFPP( const int yB, const int nH, const int yMv, const int ifpLines, const PreCalcValues& pcv, const int chromaShift, const int mvPrecShift ) { //const int dctifMarginVerBot = 4 >> yCompScale; const int ctuLogScale = pcv.maxCUSizeLog2 - chromaShift; - const int yBMax = ( pcv.heightInCtus - 1 - fppLinesSynchro ) << ctuLogScale; - const int yRefMax = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1; + const int yBMax = ( pcv.heightInCtus - 1 - ifpLines ) << ctuLogScale; + const int yRefMax = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1; if( yB < yBMax && ( yB + nH + ( 4 >> chromaShift ) + (yMv >> (mvPrecShift + chromaShift) ) - 1 > yRefMax ) ) return false; return true; } -bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int fppLinesSynchro ) +bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int ifpLines ) { const CMotionBuf mb = cu.getMotionBuf(); const ComponentID compID = COMP_Y; @@ -3584,7 +3584,7 @@ bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int fppLinesSynchro const Mv& mv = mi.mv[i]; const int refMaxPosY = cuBottom + dctifMarginVerBot + (mv.ver >> mvPrecShift); const int refCtuRow = std::min( (int)((refMaxPosY > 0) ? refMaxPosY >> maxCUSizeShift: -1), (int)(cu.cs->pcv->heightInCtus - 1)); - if( refCtuRow > ( curCtuRow + fppLinesSynchro ) ) + if( refCtuRow > ( curCtuRow + ifpLines ) ) return false; } } diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h index 9a13fb36f..26cdc7e09 100644 --- a/source/Lib/CommonLib/UnitTools.h +++ b/source/Lib/CommonLib/UnitTools.h @@ -181,9 +181,8 @@ namespace CU void getIBCMergeCandidates (const CodingUnit& cu, MergeCtx& mrgCtx, const int& mrgCandIdx = -1); void fillIBCMvpCand (CodingUnit& cu, AMVPInfo& amvpInfo); void getIbcMVPsEncOnly (CodingUnit& cu, Mv* mvPred, int& nbPred); - //bool isMvInRangeFPP (const CodingUnit &cu, const Mv& mv, const int fppLinesSynchro, const ComponentID compID = COMP_Y, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL ); - bool isMvInRangeFPP (const int yB, const int nH, const int yMv, const int fppLinesSynchro, const PreCalcValues& pcv, const int yCompScale = 0, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL); - bool isMotionBufInRangeFPP (const CodingUnit& cu, const int fppLinesSynchro); + bool isMvInRangeFPP (const int yB, const int nH, const int yMv, const int ifpLines, const PreCalcValues& pcv, const int yCompScale = 0, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL); + bool isMotionBufInRangeFPP (const CodingUnit& cu, const int ifpLines); } // TU tools diff --git a/source/Lib/CommonLib/arm/BufferARM.h b/source/Lib/CommonLib/arm/BufferARM.h index 1d84911d7..d87e60ae8 100644 --- a/source/Lib/CommonLib/arm/BufferARM.h +++ b/source/Lib/CommonLib/arm/BufferARM.h @@ -65,7 +65,8 @@ namespace vvenc template void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, const Pel* lut ) { - if( ( width & 31 ) == 0 ) + + if( ( width & 31 ) == 0 && ( height & 3 ) == 0 ) { int16x8x4_t xtmp1; int16x8x4_t xtmp2; @@ -219,7 +220,7 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p dst += ( dstStride << 2 ); } } - else if( ( width & 15 ) == 0 ) + else if( ( width & 15 ) == 0 && ( height & 3 ) == 0 ) { int16x8x2_t xtmp1; int16x8x2_t xtmp2; @@ -309,7 +310,7 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p dst += ( dstStride << 2 ); } } - else if( ( width & 7 ) == 0 ) + else if( ( width & 7 ) == 0 && ( height & 3 ) == 0 ) { int16x8_t xtmp1; int16x8_t xtmp2; @@ -366,7 +367,16 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p dst += ( dstStride << 2 ); } } - + else + { +#define RSP_SGNL_OP( ADDR ) dst[ADDR] = lut[src[ADDR]] +#define RSP_SGNL_INC src += srcStride; dst += dstStride; + + SIZE_AWARE_PER_EL_OP( RSP_SGNL_OP, RSP_SGNL_INC ) + +#undef RSP_SGNL_OP +#undef RSP_SGNL_INC + } return; } diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h index f98b94d74..c70605491 100644 --- a/source/Lib/CommonLib/arm/RdCostARM.h +++ b/source/Lib/CommonLib/arm/RdCostARM.h @@ -57,100 +57,6 @@ namespace vvenc #ifdef TARGET_SIMD_ARM #if __ARM_ARCH >= 8 -template -Distortion xGetSAD_MxN_SIMD( const DistParam& rcDtParam ) -{ - if( rcDtParam.bitDepth > 10 ) - return isWdt16 ? RdCost::xGetSAD16( rcDtParam ) : RdCost::xGetSAD8( rcDtParam ); - - // assert( rcDtParam.iCols == iWidth); - const short* pSrc1 = (const short*) rcDtParam.org.buf; - const short* pSrc2 = (const short*) rcDtParam.cur.buf; - const int iRows = rcDtParam.org.height; - const int iSubShift = rcDtParam.subShift; - const ptrdiff_t iStrideSrc1 = rcDtParam.org.stride << iSubShift; - const ptrdiff_t iStrideSrc2 = rcDtParam.cur.stride << iSubShift; - - uint32_t uiSum = 0; - - int16x8_t vsum16 = vdupq_n_s16( 0 ); - - for( int i = 0; i < ( iRows >> 3 ); i++ ) - { - // 0 - int16x8_t vsrc1 = vld1q_s16( pSrc1 ); - int16x8_t vsrc2 = vld1q_s16( pSrc2 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - - if( isWdt16 ) - { - vsrc1 = vld1q_s16( pSrc1 + 8 ); - vsrc2 = vld1q_s16( pSrc2 + 8 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - } - - pSrc1 += iStrideSrc1; - pSrc2 += iStrideSrc2; - - // 1 - vsrc1 = vld1q_s16( pSrc1 ); - vsrc2 = vld1q_s16( pSrc2 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - - if( isWdt16 ) - { - vsrc1 = vld1q_s16( pSrc1 + 8 ); - vsrc2 = vld1q_s16( pSrc2 + 8 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - } - - pSrc1 += iStrideSrc1; - pSrc2 += iStrideSrc2; - - // 2 - vsrc1 = vld1q_s16( pSrc1 ); - vsrc2 = vld1q_s16( pSrc2 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - - if( isWdt16 ) - { - vsrc1 = vld1q_s16( pSrc1 + 8 ); - vsrc2 = vld1q_s16( pSrc2 + 8 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - } - - pSrc1 += iStrideSrc1; - pSrc2 += iStrideSrc2; - - // 3 - vsrc1 = vld1q_s16( pSrc1 ); - vsrc2 = vld1q_s16( pSrc2 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - - if( isWdt16 ) - { - vsrc1 = vld1q_s16( pSrc1 + 8 ); - vsrc2 = vld1q_s16( pSrc2 + 8 ); - - vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); - } - - pSrc1 += iStrideSrc1; - pSrc2 += iStrideSrc2; - } - - uiSum = vaddlvq_s16( vsum16 ); - uiSum <<= iSubShift; - return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); -} - template void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost ) { @@ -221,10 +127,9 @@ void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost ) if( isCalCentrePos ) sumTwo = vshrq_n_s32( sumTwo, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) ); - vst1q_lane_u64( (uint64_t*) &cost[ 0 ], (uint64x2_t) sum, 0 ); - if( isCalCentrePos ) - cost[ 2 ] = vgetq_lane_s32( sumTwo, 0 ); - vst1q_lane_u64( (uint64_t*) &cost[ 3 ], (uint64x2_t) sum, 1 ); + vst1q_s32( (int32_t*) &cost[0], vzip1q_s32( sum, vdupq_n_s32(0) ) ); + if (isCalCentrePos) cost[2] = (vgetq_lane_s32(sumTwo,0)); + vst1q_s32( (int32_t*) &cost[3], vzip2q_s32( sum, vdupq_n_s32(0) ) ); } template @@ -245,8 +150,6 @@ void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, b template void RdCost::_initRdCostARM() { - m_afpDistortFunc[0][DF_SAD8 ] = xGetSAD_MxN_SIMD; - m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_MxN_SIMD; m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD; } diff --git a/source/Lib/CommonLib/x86/CommonDefX86.cpp b/source/Lib/CommonLib/x86/CommonDefX86.cpp index 66e441976..6b666d486 100644 --- a/source/Lib/CommonLib/x86/CommonDefX86.cpp +++ b/source/Lib/CommonLib/x86/CommonDefX86.cpp @@ -266,7 +266,7 @@ X86_VEXT read_x86_extension_flags( X86_VEXT request ) static const X86_VEXT max_supported = _get_x86_extensions(); static X86_VEXT ext_flags = max_supported; #else - static const X86_VEXT max_supported = AVX; // disable AVX2 for non-x86 because the SIMD-Everywhere implementation is buggy + static const X86_VEXT max_supported = AVX2; // disable AVX2 for non-x86 because the SIMD-Everywhere implementation is buggy static X86_VEXT ext_flags = SIMD_EVERYWHERE_EXTENSION_LEVEL; // default to SSE42 for WASM and SIMD-everywhere #endif @@ -276,8 +276,6 @@ X86_VEXT read_x86_extension_flags( X86_VEXT request ) { #ifdef REAL_TARGET_X86 THROW( "requested SIMD level (" << request << ") not supported by current CPU (max " << max_supported << ")." ); -#else - THROW( "requested SIMD level (" << request << ") not supported because the SIMD-Everywhere implementation for AVX2 is buggy." ); #endif } diff --git a/source/Lib/CommonLib/x86/DepQuantX86.h b/source/Lib/CommonLib/x86/DepQuantX86.h index 459170370..77429699a 100644 --- a/source/Lib/CommonLib/x86/DepQuantX86.h +++ b/source/Lib/CommonLib/x86/DepQuantX86.h @@ -115,8 +115,6 @@ namespace DQIntern int cffBitsCtxOffset; bool anyRemRegBinsLt4; - unsigned effWidth; - unsigned effHeight; int initRemRegBins; }; @@ -1157,11 +1155,6 @@ namespace DQIntern { } - void init( int dqTrVal ) - { - m_quant.init( dqTrVal ); - } - void quant( TransformUnit &tu, const CCoeffBuf &srcCoeff, const ComponentID compID, const QpParam &cQP, const double lambda, const Ctx &ctx, TCoeff &absSum, bool enableScalingLists, int *quantCoeff ) { //===== reset / pre-init ===== @@ -1308,8 +1301,6 @@ namespace DQIntern int effectWidth = std::min( 32, effWidth ); int effectHeight = std::min( 32, effHeight ); - m_state_curr.effWidth = effectWidth; - m_state_curr.effHeight = effectHeight; m_state_curr.initRemRegBins = ( effectWidth * effectHeight * MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT ) / 16; m_state_curr.anyRemRegBinsLt4 = true; // for the first coeff use scalar impl., because it check against the init state, which // prohibits some paths @@ -1504,7 +1495,6 @@ namespace DQIntern private: CommonCtx m_commonCtx; - Quantizer m_quant; Decisions m_trellis[MAX_TB_SIZEY * MAX_TB_SIZEY][2]; Rom m_scansRom; diff --git a/source/Lib/CommonLib/x86/FixMissingIntrin.h b/source/Lib/CommonLib/x86/FixMissingIntrin.h index 7a07c7edb..6ef103748 100644 --- a/source/Lib/CommonLib/x86/FixMissingIntrin.h +++ b/source/Lib/CommonLib/x86/FixMissingIntrin.h @@ -83,6 +83,8 @@ static inline __m128i _mm_loadu_si32( const void* p ) { return _mm_cvtsi32_si128( *(int32_t*)p ); } +#elif defined( REAL_TARGET_X86 ) && defined( __GNUC__ ) && !defined( __llvm__ ) && !defined( __INTEL_COMPILER ) && __GNUC__ <= 11 && __GNUC_MINOR__ <= 2 +#define _mm_loadu_si32( p ) _mm_cvtsi32_si128( *(int32_t*)( p ) ) #endif #ifdef MISSING_INTRIN_mm_loadu_si64 diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index d9d0c19eb..b3a0cba2e 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -1001,7 +1001,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW m_CtxCache = &ctxCache; - int alfUnitSize = encCfg.m_fppLinesSynchro ? m_encCfg->m_CTUSize: m_encCfg->m_alfUnitSize; + int alfUnitSize = encCfg.m_ifpLines ? m_encCfg->m_CTUSize: m_encCfg->m_alfUnitSize; initASU( alfUnitSize ); const int numBins = m_encCfg->m_useNonLinearAlfLuma || m_encCfg->m_useNonLinearAlfChroma ? MaxAlfNumClippingValues : 1; @@ -1654,7 +1654,7 @@ void EncAdaptiveLoopFilter::deriveFilter( Picture& pic, CodingStructure& cs, con { return; } - const int numAsus = m_encCfg->m_fppLinesSynchro && numCtus != m_numAsusInPic ? numCtus: m_numAsusInPic; + const int numAsus = m_encCfg->m_ifpLines && numCtus != m_numAsusInPic ? numCtus: m_numAsusInPic; initCABACEstimator( cs.slice ); @@ -1740,7 +1740,7 @@ void EncAdaptiveLoopFilter::deriveFilter( Picture& pic, CodingStructure& cs, con m_CABACEstimator->getCtx() = AlfCtx( ctxStart ); alfEncoderCtb( cs, alfParam, lambdaChromaWeight, numAsus, numCtus ); - if( m_encCfg->m_fppLinesSynchro ) + if( m_encCfg->m_ifpLines ) { reconstructCoeffFixedAPSs( cs, !cs.slice->lumaApsId.empty() && cs.slice->alfEnabled[COMP_Y], cs.slice->chromaApsId >= 0 && (cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr]), true ); @@ -1986,11 +1986,11 @@ void EncAdaptiveLoopFilter::initEncProcess( Slice& slice ) } // NOTE: ALF is here enabled per default. However it can be disabled during filter derivation part. - // In line synchronized FPP mode, it cannot be disabled. + // In lines synchronized IFP mode, it cannot be disabled. slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = slice.sps->alfEnabled; slice.ccAlfCbEnabled = slice.ccAlfCrEnabled = slice.sps->ccalfEnabled; - if( m_encCfg->m_fppLinesSynchro ) + if( m_encCfg->m_ifpLines ) { // CCALF m_ccAlfFilterParam.ccAlfFilterEnabled[0] = slice.ccAlfCbEnabled; @@ -4936,8 +4936,8 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa cs.slice->ccAlfCbApsId = newApsId; cs.slice->ccAlfCrApsId = newApsId; - // in case of FPP line synchro, we always trying to use ALF (with final decision at CTU level) - if (costOff <= costMin && !m_encCfg->m_fppLinesSynchro) + // in case of IFP lines synchro, we always trying to use ALF (with final decision at CTU level) + if (costOff <= costMin && !m_encCfg->m_ifpLines) { memset( cs.slice->alfEnabled, 0, sizeof( cs.slice->alfEnabled ) ); cs.slice->numAps = (0); diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index 8e1e10425..3b962c03f 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -1715,9 +1715,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC continue; } mergeCtx.setMergeInfo( cu, uiMergeCand ); - if( m_pcEncCfg->m_fppLinesSynchro && - ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) || - ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && + ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) || + ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ) ) { // skip candidate @@ -1907,9 +1907,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC continue; } mergeCtx.setMmvdMergeCandiInfo(cu, mmvdMergeCand); - if( m_pcEncCfg->m_fppLinesSynchro && - ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) || - ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && + ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) || + ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ) ) { // skip candidate @@ -2124,9 +2124,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC tempCS->initStructData(encTestMode.qp); continue; } - if( m_pcEncCfg->m_fppLinesSynchro && !m_pcEncCfg->m_useFastMrg && - ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) || - ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && !m_pcEncCfg->m_useFastMrg && + ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) || + ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ) ) { // skip candidate @@ -2392,10 +2392,10 @@ void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bes continue; } - if( m_pcEncCfg->m_fppLinesSynchro ) + if( m_pcEncCfg->m_ifpLines ) { - skipCandFpp[L0][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 0].mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ); - skipCandFpp[L1][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 1].mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ); + skipCandFpp[L0][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 0].mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ); + skipCandFpp[L1][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 1].mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ); if( skipCandFpp[L0][mergeCand] || skipCandFpp[L1][mergeCand] ) continue; } @@ -2638,7 +2638,7 @@ void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bes cu.mmvdMergeIdx = MAX_UINT; CU::spanGeoMotionInfo(cu, mergeCtx, cu.geoSplitDir, cu.geoMergeIdx0, cu.geoMergeIdx1); - if( m_pcEncCfg->m_fppLinesSynchro && + if( m_pcEncCfg->m_ifpLines && ( skipCandFpp[L0][cu.geoMergeIdx0] || skipCandFpp[L1][cu.geoMergeIdx0] || skipCandFpp[L0][cu.geoMergeIdx1] || skipCandFpp[L1][cu.geoMergeIdx1] ) ) { tempCS->initStructData(encTestMode.qp); @@ -4050,7 +4050,7 @@ bool EncCu::xCheckSATDCostAffineMerge(CodingStructure*& tempCS, CodingUnit& cu, CU::spanMotionInfo( cu ); } - if( m_pcEncCfg->m_fppLinesSynchro && ( !( CU::isMotionBufInRangeFPP( cu, m_pcEncCfg->m_fppLinesSynchro ) ) ) ) + if( m_pcEncCfg->m_ifpLines && ( !( CU::isMotionBufInRangeFPP( cu, m_pcEncCfg->m_ifpLines ) ) ) ) { // Do not use this mode continue; diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 78b498021..b1f784412 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -315,7 +315,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList ) const bool lockStepMode = (m_pcEncCfg->m_RCTargetBitrate > 0 || (m_pcEncCfg->m_LookAhead > 0 && !m_isPreAnalysis)) && (m_pcEncCfg->m_maxParallelFrames > 0); // get list of pictures to be encoded and used for RC update - if( m_procList.empty() && ! m_gopEncListInput.empty() ) + if( m_procList.empty() && (!m_gopEncListInput.empty() || !m_rcInputReorderList.empty()) ) { xGetProcessingLists( m_procList, m_rcUpdateList, lockStepMode ); } @@ -344,7 +344,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList ) const VVEncCfg* encCfg = m_pcEncCfg; auto picItr = find_if( m_procList.begin(), m_procList.end(), [encCfg]( auto pic ) { // if ALF enabled and ALFTempPred is used, ensure that refAps is initialized - return ( encCfg->m_fppLinesSynchro || pic->slices[ 0 ]->checkRefPicsReconstructed() ) + return ( encCfg->m_ifpLines || pic->slices[ 0 ]->checkAllRefPicsReconstructed() ) && ( !encCfg->m_alf || ( !pic->refApsGlobal || pic->refApsGlobal->initalized ) ); } ); const bool nextPicReady = picItr != m_procList.end(); @@ -398,6 +398,11 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList ) } } + if( lockStepMode && m_pcEncCfg->m_ifpLines && !m_rcUpdateList.empty() ) + { + xUpdateRcIfp(); + } + // picture/AU output // // in lock-step mode: @@ -405,7 +410,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList ) // if the next picture to output belongs to the current chunk, do output (evaluation) when all pictures of the chunk are finished if( m_gopEncListOutput.empty() || !m_gopEncListOutput.front()->isReconstructed || - ( lockStepMode && !m_rcUpdateList.empty() && m_gopEncListOutput.front() == m_rcUpdateList.front() && !xLockStepPicsFinished() ) ) + ( lockStepMode && !m_pcEncCfg->m_ifpLines && !m_rcUpdateList.empty() && m_gopEncListOutput.front() == m_rcUpdateList.front() && !xLockStepPicsFinished() ) ) { return; } @@ -420,7 +425,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList ) // update pending RC // first pic has been written to bitstream // therefore we have at least for this picture a valid total bit and head bit count - if( !m_rcUpdateList.empty() && m_rcUpdateList.front() == outPic ) + if( !m_rcUpdateList.empty() && m_rcUpdateList.front() == outPic && (!lockStepMode || !m_pcEncCfg->m_ifpLines) ) { if( m_pcEncCfg->m_RCTargetBitrate > 0 ) { @@ -475,6 +480,8 @@ void EncGOP::xSyncAlfAps( Picture& pic ) if( !refAps ) return; CHECK( !refAps->initalized, "Attempt referencing from an uninitialized APS" ); + pic.refApsGlobal->refCnt--; + CHECK( pic.refApsGlobal->refCnt < 0, "Not expected APS ref. counter\n" ); // copy ref APSs to current picture const ParameterSetMap& src = refAps->apsMap; @@ -1315,8 +1322,11 @@ void EncGOP::xSetupPicAps( Picture* pic ) // additional +2 offset, due two max possible processing delay of two GOPs (Threads=1 mode) if( m_globalApsList.size() > ( std::max( (int)MAX_NUM_APS, m_pcEncCfg->m_GOPSize ) * ( m_pcEncCfg->m_maxParallelFrames + 2 ) ) ) { - delete m_globalApsList.front(); - m_globalApsList.pop_front(); + if( m_globalApsList.front()->refCnt == 0 ) + { + delete m_globalApsList.front(); + m_globalApsList.pop_front(); + } } pic->picApsGlobal = m_globalApsList.back(); @@ -1354,6 +1364,8 @@ void EncGOP::xSetupPicAps( Picture* pic ) curApsItr--; refAps = *curApsItr; } + if( refAps ) + refAps->refCnt++; } //CHECK( !refAps, "Faied to get reference APS" ); @@ -1409,41 +1421,128 @@ void EncGOP::xInitPicsInCodingOrder( const PicList& picList ) CHECK( picList.size() && m_pcEncCfg->m_maxParallelFrames <= 0 && m_gopEncListOutput.size() != 1, "no new picture for encoding found" ); } +void EncGOP::xUpdateRcIfp() +{ + // deterministic behavior: RC update on next finished frame in sliding window coding order, + // evaluate only one finished frame at front of the list that makes place for the next frame + // whose parameters can be set using the finished frame bits info + // + // non-deterministic behavior: RC update on any finished frame + +#if IFP_RC_DETERMINISTIC + if( m_rcUpdateList.front()->isReconstructed && m_rcUpdateList.back()->encRCPic && ( m_rcUpdateList.front()->isFlush || m_rcUpdateList.size() == m_pcEncCfg->m_maxParallelFrames ) ) + { +#endif + for( auto it = m_rcUpdateList.begin(); it != m_rcUpdateList.end(); ) + { + auto pic = *it; + if( pic->isReconstructed ) + { + pic->actualTotalBits = pic->sliceDataStreams[0].getNumberOfWrittenBits(); + pic->refCounter--; + m_pcRateCtrl->updateAfterPicEncRC( pic ); + it = m_rcUpdateList.erase( it ); + } + else + { + ++it; + } +#if IFP_RC_DETERMINISTIC + // in deterministic case, only one frame is allowed to update the RC + break; +#endif + } +#if IFP_RC_DETERMINISTIC + } +#endif +} + +inline void getReorderedProcList( std::list& inputList, std::list& procList, const int maxSize, bool isIFP ) +{ + // deliver frames of the same TID (temporal layer) and from the same GOP + const int procTL = inputList.size() ? inputList.front()->TLayer : -1; + const int gopNum = inputList.size() ? inputList.front()->gopEntry->m_gopNum : -1; + for( auto it = inputList.begin(); it != inputList.end(); ) + { + auto pic = *it; + if( pic->gopEntry->m_gopNum == gopNum + && pic->TLayer == procTL + && ( isIFP ? pic->slices[ 0 ]->checkAllRefPicsAccessible(): pic->slices[ 0 ]->checkAllRefPicsReconstructed() ) ) + { + pic->isInProcessList = true; + procList.push_back ( pic ); + it = inputList.erase( it ); + } + else + { + ++it; + } + if( (int)procList.size() >= maxSize ) + break; + } +} + void EncGOP::xGetProcessingLists( std::list& procList, std::list& rcUpdateList, const bool lockStepMode ) { - // in lockstep mode, process only pics of same temporal layer + // in lockstep mode, frames are reordered in a specific processing order if( lockStepMode ) { - // start new parallel chunk only, if next output picture is not reconstructed - if( rcUpdateList.empty() ) + if( m_pcEncCfg->m_ifpLines ) { - const int procTL = m_gopEncListInput.size() ? m_gopEncListInput.front()->TLayer : -1; - const int gopNum = m_gopEncListInput.size() ? m_gopEncListInput.front()->gopEntry->m_gopNum : -1; - const int minSerialDepth = m_pcEncCfg->m_maxParallelFrames > 2 ? 1 : 2; // up to this temporal layer encode pictures only in serial mode - const int maxSize = procTL <= minSerialDepth ? 1 : m_pcEncCfg->m_maxParallelFrames; - for( auto it = m_gopEncListInput.begin(); it != m_gopEncListInput.end(); ) + // in IFP lockstep mode: + // we need an additional reordering list to ensure causality of the coding order (ref.pics) on irregular GOP structures + // in the first step, the reordered list is filled + // in the second, the frames from reordered list are moved to proc. list up to required update-list size + const int maxUpdateListSize = m_pcEncCfg->m_maxParallelFrames; + if( rcUpdateList.size() < maxUpdateListSize && ( !m_gopEncListInput.empty() || !m_rcInputReorderList.empty())) { - auto pic = *it; - if( pic->gopEntry->m_gopNum == gopNum - && pic->TLayer == procTL - && pic->slices[ 0 ]->checkRefPicsReconstructed() ) - { - procList.push_back ( pic ); - rcUpdateList.push_back( pic ); - it = m_gopEncListInput.erase( it ); - } - else + while( rcUpdateList.size() < maxUpdateListSize && ( !m_gopEncListInput.empty() || !m_rcInputReorderList.empty()) ) { - ++it; + if( !m_rcInputReorderList.empty() ) + { + auto pic = m_rcInputReorderList.front(); + m_rcInputReorderList.pop_front(); + pic->refCounter++; + procList.push_back( pic ); + rcUpdateList.push_back( pic ); + } + else + { + while( m_rcInputReorderList.size() < maxUpdateListSize && !m_gopEncListInput.empty() ) + { + getReorderedProcList( m_gopEncListInput, m_rcInputReorderList, maxUpdateListSize, true ); + } + } } - if( (int)procList.size() >= maxSize ) - break; } } + else if( rcUpdateList.empty() ) + { + // retrieve next lockstep chunk + const int procTL = m_gopEncListInput.size() ? m_gopEncListInput.front()->TLayer : -1; + const int minSerialDepth = m_pcEncCfg->m_maxParallelFrames > 2 ? 1 : 2; // up to this temporal layer encode pictures only in serial mode + const int maxSize = procTL <= minSerialDepth ? 1 : m_pcEncCfg->m_maxParallelFrames; + getReorderedProcList( m_gopEncListInput, procList, maxSize, false ); + std::copy( procList.begin(), procList.end(), std::back_inserter(rcUpdateList) ); + } } else { - procList.splice( procList.end(), m_gopEncListInput ); + if( m_pcEncCfg->m_ifpLines ) + { + // in case of IFP, using the reordered list brings an additional speedup + while( !m_gopEncListInput.empty() ) + { + size_t inputListSize = m_gopEncListInput.size(); + getReorderedProcList( m_gopEncListInput, procList, (int)procList.size() + m_pcEncCfg->m_maxParallelFrames, true ); + CHECK( m_gopEncListInput.size() == inputListSize, "IFP processing list derivation: attempting to run in a deadlock" ); + } + } + else + { + // just pass the input list to processing list + procList.splice( procList.end(), m_gopEncListInput ); + } m_gopEncListInput.clear(); if( ! m_gopEncListOutput.empty() ) rcUpdateList.push_back( m_gopEncListOutput.front() ); @@ -1797,8 +1896,6 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod alfAPS->ccAlfParam.reset(); } } - pic.picApsGlobal = nullptr; - pic.refApsGlobal = nullptr; CHECK( slice->enableDRAPSEI && m_pcEncCfg->m_maxParallelFrames, "Dependent Random Access Point is not supported by Frame Parallel Processing" ); pic.isInitDone = true; @@ -2502,7 +2599,6 @@ void EncGOP::xAddPSNRStats( const Picture* pic, CPelUnitBuf cPicD, AccessUnitLis } } } - const uint32_t uibits = numRBSPBytes * 8; if (m_isPreAnalysis || !m_pcRateCtrl->rcIsFinalPass) diff --git a/source/Lib/EncoderLib/EncGOP.h b/source/Lib/EncoderLib/EncGOP.h index 633182318..27c3dccea 100644 --- a/source/Lib/EncoderLib/EncGOP.h +++ b/source/Lib/EncoderLib/EncGOP.h @@ -147,6 +147,7 @@ class EncGOP : public EncStage std::list m_gopEncListOutput; std::list m_procList; std::list m_rcUpdateList; + std::list m_rcInputReorderList; // used in RC in IFP lines synchro mode std::deque m_globalApsList; std::vector m_globalCtuQpVector; @@ -200,6 +201,7 @@ class EncGOP : public EncStage void xSelectReferencePictureList ( Slice* slice ) const; void xSyncAlfAps ( Picture& pic ); + void xUpdateRcIfp (); void xWritePicture ( Picture& pic, AccessUnitList& au, bool isEncodeLtRef ); int xWriteParameterSets ( Picture& pic, AccessUnitList& accessUnit, HLSWriter& hlsWriter ); int xWritePictureSlices ( Picture& pic, AccessUnitList& accessUnit, HLSWriter& hlsWriter ); diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index c8280ee4c..bf58e6210 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -56,6 +56,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "EncStage.h" #include "PreProcess.h" #include "EncGOP.h" +#include "CommonLib/x86/CommonDefX86.h" //! \ingroup EncoderLib //! \{ @@ -111,6 +112,13 @@ void EncLib::initEncoderLib( const vvenc_config& encCfg ) // copy config parameter const_cast(m_encCfg) = encCfg; +#if defined( REAL_TARGET_X86 ) && defined( _MSC_VER ) && _MSC_VER >= 1938 + if( read_x86_extension_flags() >= x86_simd::AVX2 ) + { + msg.log( VVENC_WARNING, "WARNING: MSVC version >= 17.8 produces invalid AVX2 code, partially disabling AVX2!\n" ); + } + +#endif // setup modified configs for rate control if( m_encCfg.m_RCNumPasses > 1 || m_encCfg.m_LookAhead ) { diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp index 722c3d19d..269596226 100644 --- a/source/Lib/EncoderLib/EncSlice.cpp +++ b/source/Lib/EncoderLib/EncSlice.cpp @@ -892,7 +892,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) const UnitArea& ctuArea = ctuEncParam->ctuArea; const bool wppSyncEnabled = cs.sps->entropyCodingSyncEnabled; const TaskType currState = processStates[ ctuRsAddr ]; - const int syncLines = encSlice->m_pcEncCfg->m_fppLinesSynchro; + const int syncLines = encSlice->m_pcEncCfg->m_ifpLines; DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) ); DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); @@ -910,7 +910,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) // encode case CTU_ENCODE: { - // CTU line-wise frame parallel processing synchronization + // CTU line-wise inter-frame parallel processing synchronization if( syncLines ) { const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ); @@ -1161,18 +1161,18 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat ); // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER; } break; case ALF_DERIVE_FILTER: { - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; if( ctuRsAddr == deriveFilterCtu ) { // ensure statistics from all previous ctu's have been collected - int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, FPPLS_ALF_DERIVE_LINES): pcv.heightInCtus; + int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus; for( int y = 0; y < numCheckLines; y++ ) { for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) @@ -1200,7 +1200,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) if( ctuRsAddr == deriveFilterCtu ) { encSlice->m_pALF->initDerivation( slice ); - encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES: pcv.sizeInCtus ); + encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus ); encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false ); } else if( syncLines ) @@ -1225,7 +1225,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case ALF_RECONSTRUCT: { // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT ) return false; @@ -1277,7 +1277,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat ); // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER; } break; @@ -1285,11 +1285,11 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case CCALF_DERIVE_FILTER: { // synchronization dependencies - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; if( ctuRsAddr == deriveFilterCtu ) { // ensure statistics from all previous ctu's have been collected - int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, FPPLS_CCALF_DERIVE_LINES): pcv.heightInCtus; + int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus; for( int y = 0; y < numCheckLines; y++ ) { for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) @@ -1316,7 +1316,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) { if( ctuRsAddr == deriveFilterCtu ) { - encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES: pcv.sizeInCtus ); + encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus ); } else if( syncLines ) { @@ -1337,7 +1337,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case CCALF_RECONSTRUCT: { // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT ) return false; @@ -1379,7 +1379,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) if(ctuPosY + 1 == pcv.heightInCtus) recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin ); - // for FPP lines synchro, do an additional increment signaling that CTU row is ready + // for IFP lines synchro, do an additional increment signaling that CTU row is ready if( syncLines ) ++(pic->m_tileColsDone->at( ctuPosY )); } diff --git a/source/Lib/EncoderLib/GOPCfg.cpp b/source/Lib/EncoderLib/GOPCfg.cpp index 0923a5e67..087a50556 100644 --- a/source/Lib/EncoderLib/GOPCfg.cpp +++ b/source/Lib/EncoderLib/GOPCfg.cpp @@ -55,7 +55,7 @@ namespace vvenc { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode ) +void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode, int minIntraDist ) { CHECK( gopSize < 1, "gop size has to be greater than 0" ); @@ -119,6 +119,8 @@ void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int go m_cnOffset = 0; m_numTillGop = poc0idr ? 0 : (int)m_gopList->size() - 1; m_numTillIntra = poc0idr ? 0 : (int)m_gopList->size() - 1; + m_minIntraDist = minIntraDist; + m_lastIntraPOC = -1; } void GOPCfg::getNextGopEntry( GOPEntry& gopEntry ) @@ -158,6 +160,7 @@ void GOPCfg::getNextGopEntry( GOPEntry& gopEntry ) gopEntry.m_temporalId = isTl0 ? 0 : 1; gopEntry.m_isStartOfIntra = isStartOfIntra; gopEntry.m_isValid = true; + if( isStartOfIntra ) m_lastIntraPOC = m_nextPoc; // continue with next frame m_nextPoc += 1; @@ -180,6 +183,7 @@ void GOPCfg::getNextGopEntry( GOPEntry& gopEntry ) gopEntry.m_sliceType = 'I'; gopEntry.m_isStartOfIntra = true; gopEntry.m_temporalId = 0; + m_lastIntraPOC = m_nextPoc; } // check for end of current gop @@ -234,6 +238,7 @@ void GOPCfg::startIntraPeriod( GOPEntry& gopEntry ) gopEntry.m_isStartOfIntra = true; gopEntry.m_isStartOfGop = true; gopEntry.m_temporalId = 0; + m_lastIntraPOC = gopEntry.m_POC; // start with first gop list m_gopList = &m_defaultGopLists[ 0 ]; @@ -251,7 +256,7 @@ void GOPCfg::startIntraPeriod( GOPEntry& gopEntry ) } } -void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry ) const +void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry ) { gopEntry.m_isStartOfGop = true; if( gopEntry.m_gopNum == 0 && ! gopEntry.m_isStartOfIntra ) @@ -259,6 +264,7 @@ void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry ) const gopEntry.m_isStartOfIntra = true; gopEntry.m_sliceType = 'I'; gopEntry.m_temporalId = 0; + m_lastIntraPOC = gopEntry.m_POC; } } @@ -275,6 +281,14 @@ void GOPCfg::getDefaultRPLLists( RPLList& rpl0, RPLList& rpl1 ) const } } +bool GOPCfg::isSTAallowed( int poc ) const +{ + int intraDistBack = poc - m_lastIntraPOC; + int intraDistForward = m_numTillIntra + 1; + + return ( intraDistBack >= m_minIntraDist && intraDistForward >= m_minIntraDist ); +} + bool GOPCfg::hasNonZeroTemporalId() const { return m_maxTid > 0; diff --git a/source/Lib/EncoderLib/GOPCfg.h b/source/Lib/EncoderLib/GOPCfg.h index 39504273c..d9e9a93e6 100644 --- a/source/Lib/EncoderLib/GOPCfg.h +++ b/source/Lib/EncoderLib/GOPCfg.h @@ -93,6 +93,8 @@ class GOPCfg int m_maxTid; int m_firstPassMode; int m_defaultNumActive[ 2 ]; + int m_minIntraDist; + int m_lastIntraPOC; public: GOPCfg( MsgLog& _m ) @@ -115,6 +117,8 @@ class GOPCfg , m_maxTid ( 0 ) , m_firstPassMode ( 0 ) , m_defaultNumActive{ 0, 0 } + , m_minIntraDist ( -1 ) + , m_lastIntraPOC ( -1 ) { }; @@ -122,17 +126,19 @@ class GOPCfg { }; - void initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode ); + void initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode, int m_minIntraDist ); void getNextGopEntry( GOPEntry& gopEntry ); void startIntraPeriod( GOPEntry& gopEntry ); - void fixStartOfLastGop( GOPEntry& gopEntry ) const; + void fixStartOfLastGop( GOPEntry& gopEntry ); void getDefaultRPLLists( RPLList& rpl0, RPLList& rpl1 ) const; + void setLastIntraSTA( int poc ) { m_lastIntraPOC = poc; } int getMaxTLayer() const { return m_maxTid; } const std::vector& getMaxDecPicBuffering() const { return m_maxDecPicBuffering; } const std::vector& getNumReorderPics() const { return m_numReorderPics; } int getDefaultNumActive( int l ) const { return m_defaultNumActive[ l ]; } + bool isSTAallowed( int poc ) const; bool hasNonZeroTemporalId() const; bool hasLeadingPictures() const; bool isChromaDeltaQPEnabled() const; diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp index a8feb7ca3..9233faef0 100644 --- a/source/Lib/EncoderLib/InterSearch.cpp +++ b/source/Lib/EncoderLib/InterSearch.cpp @@ -217,7 +217,7 @@ InterSearch::~InterSearch() void InterSearch::init( const VVEncCfg& encCfg, TrQuant* pTrQuant, RdCost* pRdCost, EncModeCtrl* pModeCtrl, CodingStructure **pSaveCS ) { - InterPrediction::init( pRdCost, encCfg.m_internChromaFormat, encCfg.m_CTUSize, encCfg.m_fppLinesSynchro ); + InterPrediction::init( pRdCost, encCfg.m_internChromaFormat, encCfg.m_CTUSize, encCfg.m_ifpLines ); m_numBVs = 0; m_pcEncCfg = &encCfg; m_pcTrQuant = pTrQuant; @@ -1198,7 +1198,7 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub xCopyAMVPInfo(&aacAMVPInfo[1][bestBiPRefIdxL1], &amvp[REF_PIC_LIST_1]); aaiMvpIdxBi[1][bestBiPRefIdxL1] = bestBiPMvpL1; cMvPredBi [1][bestBiPRefIdxL1] = amvp[REF_PIC_LIST_1].mvCand[bestBiPMvpL1]; - if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cMvPredBi[1][bestBiPRefIdxL1].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cMvPredBi[1][bestBiPRefIdxL1].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) { // this mvp cannot be used for mv, skip Bi-pred uiCostBi = std::numeric_limits::max(); @@ -1390,10 +1390,10 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub cCurMvField.setMvField( aacAMVPInfo[curRefList][refIdxCur].mvCand[i], refIdxCur ); cTarMvField.setMvField( aacAMVPInfo[tarRefList][refIdxTar].mvCand[j], refIdxTar ); GCC_WARNING_RESET - if( m_pcEncCfg->m_fppLinesSynchro ) + if( m_pcEncCfg->m_ifpLines ) { - xCheckAndClipMvToFppLine( cCurMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ); - xCheckAndClipMvToFppLine( cTarMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ); + xCheckAndClipMvToFppLine( cCurMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv ); + xCheckAndClipMvToFppLine( cTarMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv ); } Distortion cost = xGetSymCost( cu, origBuf, eCurRefList, cCurMvField, cTarMvField, BcwIdx ); if ( cost < costStart ) @@ -1507,9 +1507,9 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub // save results if ( symCost < uiCostBi - && ( !m_pcEncCfg->m_fppLinesSynchro || - ( CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cCurMvField.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) && - CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTarMvField.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) ) + && ( !m_pcEncCfg->m_ifpLines || + ( CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cCurMvField.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) && + CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTarMvField.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ) ) { uiCostBi = symCost; @@ -1826,7 +1826,7 @@ void InterSearch::xEstimateMvPredAMVP( CodingUnit& cu, CPelUnitBuf& origBuf, Ref for( i = 0 ; i < pcAMVPInfo->numCand; i++) { Mv mvCand = pcAMVPInfo->mvCand[i]; - if( m_pcEncCfg->m_fppLinesSynchro ) + if( m_pcEncCfg->m_ifpLines ) xClipMvSearch( mvCand, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, true ); Distortion uiTmpCost = xGetTemplateCost( cu, origBuf, predBuf, mvCand, i, AMVP_MAX_NUM_CANDS, refPicList, iRefIdx ); @@ -2055,7 +2055,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic Mv bestInitMv = (bBi ? rcMv : rcMvPred); Mv cTmpMv = bestInitMv; - xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro ); + xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines ); cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT); m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor; Distortion uiBestSad = m_cDistParam.distFunc(m_cDistParam); @@ -2080,7 +2080,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic if( j < i ) continue; - xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro); + xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines); cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT); m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor; @@ -2136,7 +2136,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic DTRACE(g_trace_ctx, D_ME, " MECost: %6d (%d) MV:%d,%d\n", (int)refPicList, (int)bBi, ruiCost, ruiBits, rcMv.hor << 2, rcMv.ver << 2); } -void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int fppLinesSynchro ) +void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int ifpLines ) { if( pcv.wrapArround ) { @@ -2147,9 +2147,9 @@ void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Siz int iHorMax = ( pcv.lumaWidth + iOffset - ( int ) pos.x - 1 ) << iMvShift; int iHorMin = ( -( int ) pcv.maxCUSize - iOffset - ( int ) pos.x + 1 ) * (1 << iMvShift); - int maxLumaHeight = fppLinesSynchro && ((pos.y >> pcv.maxCUSizeLog2) + fppLinesSynchro + 1 < pcv.heightInCtus) ? + int maxLumaHeight = ifpLines && ((pos.y >> pcv.maxCUSizeLog2) + ifpLines + 1 < pcv.heightInCtus) ? - (((pos.y >> pcv.maxCUSizeLog2) + fppLinesSynchro + 1) << pcv.maxCUSizeLog2 ) - size.height - 4 // 4 samples from DCTIF vertical bottom part + (((pos.y >> pcv.maxCUSizeLog2) + ifpLines + 1) << pcv.maxCUSizeLog2 ) - size.height - 4 // 4 samples from DCTIF vertical bottom part : pcv.lumaHeight + iOffset; @@ -2160,26 +2160,26 @@ void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Siz rcMv.ver = ( std::min( iVerMax, std::max( iVerMin, rcMv.ver ) ) ); } -void InterSearch::xClipMvToFppLine( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv ) +void InterSearch::xClipMvToFppLine( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv ) { const int yCompScale = 0; const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL; const int ctuLogScale = pcv.maxCUSizeLog2 - yCompScale; - const int yRefMax = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1; + const int yRefMax = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1; const int yRefMv = yB + nH + ( 4 >> yCompScale ) + (mv.ver >> mvPrecShift) - 1; CHECKD( yRefMv <= yRefMax, "Not expected" ); mv.ver -= ( yRefMv - yRefMax ) << mvPrecShift; } -void InterSearch::xCheckAndClipMvToFppLine( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv ) +void InterSearch::xCheckAndClipMvToFppLine( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv ) { const int yCompScale = 0; const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL; const int ctuLogScale = pcv.maxCUSizeLog2 - yCompScale; - const int yBMax = ( pcv.heightInCtus - 1 - fppLinesSynchro ) << ctuLogScale; + const int yBMax = ( pcv.heightInCtus - 1 - ifpLines ) << ctuLogScale; if( yB < yBMax ) { - const int yRefMax = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1; + const int yRefMax = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1; const int yRefMv = yB + nH + ( 4 >> yCompScale ) + (mv.ver >> mvPrecShift) - 1; if( yRefMv > yRefMax ) { @@ -2210,7 +2210,7 @@ void InterSearch::xSetSearchRange ( const CodingUnit& cu, else { clipMv( mvTL, cu.lumaPos(), cu.lumaSize(), pcv); - xClipMvSearch( mvBR, cu.lumaPos(), cu.lumaSize(), pcv, m_pcEncCfg->m_fppLinesSynchro ); + xClipMvSearch( mvBR, cu.lumaPos(), cu.lumaSize(), pcv, m_pcEncCfg->m_ifpLines ); } mvTL.divideByPowerOf2( iMvShift ); @@ -2343,7 +2343,7 @@ void InterSearch::xTZSearch( const CodingUnit& cu, const bool bNewZeroNeighbourhoodTest = bExtendedSettings; int iSearchRange = m_iSearchRange; - xClipMvSearch( rcMv, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro ); + xClipMvSearch( rcMv, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, m_pcEncCfg->m_ifpLines ); rcMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_QUARTER); rcMv.divideByPowerOf2(2); @@ -2371,7 +2371,7 @@ void InterSearch::xTZSearch( const CodingUnit& cu, const BlkUniMvInfo* curMvInfo = m_BlkUniMvInfoBuffer->getBlkUniMvInfo(i); Mv cTmpMv = curMvInfo->uniMvs[refPicList][iRefIdxPred]; - xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro); + xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines); cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT); m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor; @@ -2633,9 +2633,9 @@ void InterSearch::xPatternSearchIntRefine(CodingUnit& cu, TZSearchStruct& cStru cTestMv[iMVPIdx] += cBaseMvd[iMVPIdx]; cTestMv[iMVPIdx] += amvpInfo.mvCand[iMVPIdx]; - if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTestMv[iMVPIdx].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTestMv[iMVPIdx].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) { - xClipMvToFppLine( cTestMv[iMVPIdx], cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ); + xClipMvToFppLine( cTestMv[iMVPIdx], cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv ); cTestMv[iMVPIdx].roundTransPrecInternal2AmvrVertical(cu.imv); } @@ -2846,7 +2846,7 @@ Distortion InterSearch::xSymRefineMvSearch( CodingUnit& cu, CPelUnitBuf& origBuf mvOffset <<= nSearchStepShift; MvField mvCand = mvCurCenter, mvPair; mvCand.mv += mvOffset; - if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvCand.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvCand.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) { continue; // Skip this pos } @@ -2865,7 +2865,7 @@ Distortion InterSearch::xSymRefineMvSearch( CodingUnit& cu, CPelUnitBuf& origBuf mvPair.refIdx = rTarMvField.refIdx; mvPair.mv.set( rcMvTarPred.hor - (mvCand.mv.hor - rcMvCurPred.hor), rcMvTarPred.ver - (mvCand.mv.ver - rcMvCurPred.ver) ); - if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvPair.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) + if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvPair.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) { continue; // Skip this pos } @@ -4475,7 +4475,7 @@ void InterSearch::xSymMvdCheckBestMvp( PelUnitBuf predBufA = m_tmpPredStorage[curRefList].getCompactBuf( cu ); const Picture* picRefA = cu.slice->getRefPic(curRefList, cCurMvField.refIdx); Mv mvA = cCurMvField.mv; - xClipMvSearch( mvA, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_fppLinesSynchro ); + xClipMvSearch( mvA, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_ifpLines ); xPredInterBlk( COMP_Y, cu, picRefA, mvA, predBufA, false, cu.slice->clpRngs[ COMP_Y ], false, false ); bufTmp = m_tmpStorageLCU.getBuf( UnitAreaRelative( cu, cu ) ); @@ -4501,7 +4501,7 @@ void InterSearch::xSymMvdCheckBestMvp( PelUnitBuf predBufB = m_tmpPredStorage[tarRefList].getCompactBuf( cu ); const Picture* picRefB = cu.slice->getRefPic(tarRefList, cTarMvField.refIdx); Mv mvB = cTarMvField.mv; - xClipMvSearch( mvB, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_fppLinesSynchro ); + xClipMvSearch( mvB, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_ifpLines ); xPredInterBlk( COMP_Y, cu, picRefB, mvB, predBufB, false, cu.slice->clpRngs[ COMP_Y ], false, false ); // calc distortion @@ -4948,7 +4948,7 @@ void InterSearch::xPredAffineInterSearch( CodingUnit& cu, ::memcpy(tmp.affMVs[1][bestBiPRefIdxL1], pcMvTemp, sizeof(Mv) * 3); iRefIdxBi[1] = bestBiPRefIdxL1; - if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, pcMvTemp, m_pcEncCfg->m_fppLinesSynchro ) ) + if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, pcMvTemp, m_pcEncCfg->m_ifpLines ) ) { // this mvp cannot be used for mv, skip Bi-pred uiCostBi = MAX_DISTORTION; @@ -5020,7 +5020,7 @@ void InterSearch::xPredAffineInterSearch( CodingUnit& cu, // First iterate, get prediction block of opposite direction if (iIter == 0 && !slice.picHeader->mvdL1Zero) { - if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, aacMv[1 - iRefList], m_pcEncCfg->m_fppLinesSynchro ) ) + if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, aacMv[1 - iRefList], m_pcEncCfg->m_ifpLines ) ) { continue; } @@ -5226,7 +5226,7 @@ Distortion InterSearch::xGetAffineTemplateCost(CodingUnit& cu, CPelUnitBuf& orig Mv mv[3]; memcpy(mv, acMvCand, sizeof(mv)); - if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, mv, m_pcEncCfg->m_fppLinesSynchro ) ) + if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, mv, m_pcEncCfg->m_ifpLines ) ) { return MAX_DISTORTION>>1; } @@ -5451,7 +5451,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, { acMvTemp[2].roundAffinePrecInternal2Amvr(cu.imv); } - if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) ) + if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) ) { xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.cs->slice->clpRngs[COMP_Y], refPicList); @@ -5589,7 +5589,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, clipMv(acMvTemp[i], cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv); } - if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) ) + if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) ) { xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList); @@ -5620,7 +5620,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, auto checkCPMVRdCost = [&](Mv ctrlPtMv[3]) { - if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, ctrlPtMv, m_pcEncCfg->m_fppLinesSynchro ) ) + if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, ctrlPtMv, m_pcEncCfg->m_ifpLines ) ) { xPredAffineBlk(COMP_Y, cu, refPic, ctrlPtMv, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList); // get error @@ -5713,7 +5713,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, acMvTemp[j].set(centerMv[j].hor + (testPos[i][0] * (1 << mvShift)), centerMv[j].ver + (testPos[i][1] * (1 << mvShift))); clipMv(acMvTemp[j], cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv); - if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) ) + if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) ) { xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList); diff --git a/source/Lib/EncoderLib/InterSearch.h b/source/Lib/EncoderLib/InterSearch.h index 6e6b4c21f..741fa9282 100644 --- a/source/Lib/EncoderLib/InterSearch.h +++ b/source/Lib/EncoderLib/InterSearch.h @@ -513,10 +513,10 @@ class InterSearch : public InterPrediction, AffineGradientSearch const bool bFastSettings = false ); - void xClipMvSearch ( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int fppLinesSynchro ); + void xClipMvSearch ( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int ifpLines ); - void xClipMvToFppLine ( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv ); - void xCheckAndClipMvToFppLine ( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv ); + void xClipMvToFppLine ( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv ); + void xCheckAndClipMvToFppLine ( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv ); void xSetSearchRange ( const CodingUnit& cu, const Mv& cMvPred, const int iSrchRng, diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp index e7b6a69a2..e111f71d1 100644 --- a/source/Lib/EncoderLib/PreProcess.cpp +++ b/source/Lib/EncoderLib/PreProcess.cpp @@ -75,7 +75,7 @@ PreProcess::~PreProcess() void PreProcess::init( const VVEncCfg& encCfg, bool isFinalPass ) { - m_gopCfg.initGopList( encCfg.m_DecodingRefreshType, encCfg.m_poc0idr, encCfg.m_IntraPeriod, encCfg.m_GOPSize, encCfg.m_leadFrames, encCfg.m_picReordering, encCfg.m_GOPList, encCfg.m_vvencMCTF, encCfg.m_FirstPassMode ); + m_gopCfg.initGopList( encCfg.m_DecodingRefreshType, encCfg.m_poc0idr, encCfg.m_IntraPeriod, encCfg.m_GOPSize, encCfg.m_leadFrames, encCfg.m_picReordering, encCfg.m_GOPList, encCfg.m_vvencMCTF, encCfg.m_FirstPassMode, encCfg.m_minIntraDist ); CHECK( m_gopCfg.getMaxTLayer() != encCfg.m_maxTLayer, "max temporal layer of gop configuration does not match pre-configured value" ); m_encCfg = &encCfg; @@ -389,10 +389,11 @@ void PreProcess::xDetectSTA( Picture* pic, const PicList& picList ) { const Picture* prevTl0 = xGetPrevTl0Pic( pic, picList ); - int picMemorySTA = 0; - bool isSta = false; - - if( prevTl0 && prevTl0->picVisActTL0 > 0 ) + int picMemorySTA = 0; + bool isSta = false; + bool intraAllowed = m_gopCfg.isSTAallowed( pic->poc ); + + if( prevTl0 && prevTl0->picVisActTL0 > 0 && intraAllowed ) { const int scThreshold = ( ( pic->isSccStrong ? 6 : ( pic->isSccWeak ? 5 : 4 ) ) * ( m_isHighRes ? 19 : 15 ) ) >> 2; @@ -412,6 +413,7 @@ void PreProcess::xDetectSTA( Picture* pic, const PicList& picList ) picShared->m_picMemorySTA = picMemorySTA; picShared->m_gopEntry.m_sliceType = 'I'; picShared->m_gopEntry.m_scType = SCT_TL0_SCENE_CUT; + m_gopCfg.setLastIntraSTA( pic->poc ); if( m_encCfg->m_sliceTypeAdapt == 2 ) { diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index 7f28a2936..050318368 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -390,6 +390,14 @@ const std::vector> BitrateOrScaleAbrevToIntMap = { "x", -16 } // negative value: multiplier of target bitrate, with a fixed-point accuracy of 4 bit }; +const std::vector> IfpToValueMap = +{ + { "0", false }, + { "off", false }, + { "1", 1 }, + { "on", 1 }, +}; + //// ==================================================================================================================== //// string <-> enum //// ==================================================================================================================== @@ -547,7 +555,9 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) IStreamToInt8 toSliceTypeAdapt ( &c->m_sliceTypeAdapt ); IStreamToInt8 toSelectiveRDOQ ( &c->m_useSelectiveRDOQ ); - IStreamToInt8 toFppLinesSynchro ( &c->m_fppLinesSynchro ); + IStreamToInt8 toForceScc ( &c->m_forceScc ); + IStreamToInt8 toIfpLines ( &c->m_ifpLines ); + IStreamToEnum toUseIfp ( &c->m_ifp, &IfpToValueMap ); po::Options opts; if( m_easyMode ) @@ -635,7 +645,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("rcstatsfile", m_RCStatsFileName, "rate control statistics file name") ("qp,q", c->m_QP, "quantization parameter, QP (0, 1, .. 63)") ("qpa", toQPA, "enable perceptually motivated QP adaptation based on XPSNR model (0: off, 1: on)", true) - ("threads,t", c->m_numThreads, "number of threads (multithreading; -1: resolution < 720p: 4, >= 720p: 8 threads)") + ("threads,t", c->m_numThreads, "number of threads (multithreading; -1: resolution < 720p: 4, < 5K 2880p: 8, >= 5K 2880p: 12 threads)") + ("ifp", toUseIfp, "inter-frame parallelization(IFP) (0: off, 1: on, with sync. offset of two CTU lines)") ("refreshtype,-rt", toDecRefreshType, "intra refresh type (idr, cra, cra_cre: CRA, constrained RASL picture encoding)") ("refreshsec,-rs", c->m_IntraPeriodSec, "intra period/refresh in seconds") ("intraperiod,-ip", c->m_IntraPeriod, "intra period in frames (0: specify intra period in seconds instead, see -refreshsec)") @@ -646,7 +657,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) { opts.setSubSection("Threading, performance"); opts.addOptions() - ("Threads,t", c->m_numThreads, "Number of threads") + ("Threads,t", c->m_numThreads, "number of threads (multithreading; -1: resolution < 720p: 4, < 5K 2880p: 8, >= 5K 2880p: 12 threads)") ("preset", toPreset, "select preset for specific encoding setting (faster, fast, medium, slow, slower, medium_lowDecEnergy)") ("Tiles", toNumTiles, "Set number of tile columns and rows") ; @@ -672,6 +683,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("MaxBitrate", toMaxRate, "Rate control: approximate maximum instantaneous bitrate [bits/second] (0: no rate cap; least constraint)" ) ("PerceptQPA,-qpa", c->m_usePerceptQPA, "Enable perceptually motivated QP adaptation, XPSNR based (0:off, 1:on)", true) ("STA", toSliceTypeAdapt, "Enable slice type adaptation at GOPSize>8 (-1: auto, 0: off, 1: adapt slice type, 2: adapt NAL unit type)") + ("MinIntraDistance", c->m_minIntraDist, "With STA: set a minimum coded frame distance to the previous intra frame (-1: GOPSize)" ) ; opts.setSubSection("Quantization parameters"); @@ -858,7 +870,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("AddGOP32refPics", c->m_addGOP32refPics, "Use different QP offsets and reference pictures in GOP structure") ("NumRefPics", c->m_numRefPics, "Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)" ) ("NumRefPicsSCC", c->m_numRefPicsSCC, "Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics)" ) - ("ForceSCC", c->m_forceScc, "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" ) + ("ForceSCC", toForceScc, "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" ) ; opts.setSubSection("Low-level QT-BTT partitioning options"); @@ -1071,7 +1083,9 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("TileColumnWidthArray", toTileColumnWidth, "Tile column widths in units of CTUs. Last column width in list will be repeated uniformly to cover any remaining picture width") ("TileRowHeightArray", toTileRowHeight, "Tile row heights in units of CTUs. Last row height in list will be repeated uniformly to cover any remaining picture height") ("TileParallelCtuEnc", c->m_tileParallelCtuEnc, "Allow parallel CTU block search in different tiles") - ("FppLinesSynchro", toFppLinesSynchro, "(experimental) Number of CTU-lines synchronization due to MV restriction for FPP mode") + ("FppLinesSynchro", toIfpLines, "(deprecated) Inter-Frame Parallelization(IFP) explicit CTU-lines synchronization offset (-1: default mode with two lines, 0: off)") + ("IFPLines", toIfpLines, "Inter-Frame Parallelization(IFP) explicit CTU-lines synchronization offset (-1: default mode with two lines, 0: off)") + ("IFP", toUseIfp, "Inter-Frame Parallelization(IFP) (0: off, 1: on, with default setting of IFPLines)") ; opts.setSubSection("Coding tools"); diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index a6e814e62..89cebf119 100644 --- a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -260,6 +260,16 @@ static inline std::string vvenc_getDecodingRefreshTypeStr( int type, bool poc0i return cType; } +static inline int getNumThreadsDefault( vvenc_config *c ) +{ + const int minSize = std::min( c->m_SourceWidth, c->m_SourceHeight ); + if( minSize >= 2880 ) + return 12; + else if( minSize >= 720 ) + return 8; + return 4; +} + VVENC_DECL void vvenc_GOPEntry_default(vvencGOPEntry *GOPEntry ) { GOPEntry->m_POC = -1; @@ -381,6 +391,7 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c ) c->m_usePerceptQPA = false; ///< perceptually motivated input-adaptive QP modification, abbrev. perceptual QP adaptation (QPA) c->m_sliceTypeAdapt = -1; ///< perceptually and objectively motivated slice type adaptation (STA) + c->m_minIntraDist = -1; c->m_RCNumPasses = -1; c->m_RCPass = -1; @@ -652,7 +663,8 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c ) c->m_maxParallelFrames = -1; c->m_ensureWppBitEqual = -1; c->m_tileParallelCtuEnc = true; - c->m_fppLinesSynchro = 0; + c->m_ifpLines = -1; + c->m_ifp = false; c->m_picPartitionFlag = false; memset( c->m_tileColumnWidth, 0, sizeof(c->m_tileColumnWidth) ); @@ -690,6 +702,7 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c ) c->m_forceScc = 0; c->m_reservedFlag = false; + c->m_reservedInt = 0; memset( c->m_reservedDouble, 0, sizeof(c->m_reservedDouble) ); // init default preset @@ -756,6 +769,7 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) vvenc_confirmParameter( c, c->m_leadFrames < 0 || c->m_leadFrames > VVENC_MAX_GOP, "Lead frames exceeds supported range (0 to 64)" ); vvenc_confirmParameter( c, c->m_trailFrames < 0 || c->m_trailFrames > VVENC_MCTF_RANGE, "Trail frames exceeds supported range (0 to 4)" ); vvenc_confirmParameter( c, c->m_sliceTypeAdapt < -1 || c->m_sliceTypeAdapt > 2, "Slice type adaptation (STA) invalid parameter given, range is (-1 .. 2)" ); + vvenc_confirmParameter( c, c->m_minIntraDist < -1, "Minimum intra distance cannot be smaller than -1" ); if( VVENC_RC_OFF == c->m_RCTargetBitrate ) { @@ -796,6 +810,12 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) // vvenc::MsgLog msg(c->m_msgCtx, c->m_msgFnc); +#if !IFP_RC_DETERMINISTIC + if( c->m_RCTargetBitrate != 0 && c->m_ifp ) + { + msg.log( VVENC_WARNING, "Using RC with IFP. Results are non-deterministic!\n" ); + } +#endif if( c->m_FirstPassMode > 2 && c->m_RCTargetBitrate != 0 ) { @@ -906,13 +926,13 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) if( c->m_numThreads < 0 ) { const int numCores = std::thread::hardware_concurrency(); - c->m_numThreads = std::min( c->m_SourceWidth, c->m_SourceHeight ) < 720 ? 4 : 8; + c->m_numThreads = getNumThreadsDefault( c ); c->m_numThreads = std::min( c->m_numThreads, numCores ); } if( c->m_ensureWppBitEqual < 0 ) c->m_ensureWppBitEqual = c->m_numThreads ? 1 : 0 ; if( c->m_useAMaxBT < 0 ) c->m_useAMaxBT = c->m_numThreads ? 0 : 1 ; if( c->m_cabacInitPresent < 0 ) c->m_cabacInitPresent = c->m_numThreads ? 0 : 1 ; - if( c->m_alfTempPred < 0 ) c->m_alfTempPred = c->m_fppLinesSynchro ? 0 : 1 ; + if( c->m_alfTempPred < 0 ) c->m_alfTempPred = c->m_ifp ? 0 : 1 ; if( c->m_saoEncodingRate < 0.0 ) c->m_saoEncodingRate = c->m_numThreads ? 0.0 : 0.75; if( c->m_saoEncodingRateChroma < 0.0 ) c->m_saoEncodingRateChroma = c->m_numThreads ? 0.0 : 0.5 ; if( c->m_maxParallelFrames < 0 ) @@ -920,6 +940,12 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) c->m_maxParallelFrames = std::min( c->m_numThreads, 4 ); } + if( c->m_ifpLines > 0 && !c->m_ifp ) + { + msg.log( VVENC_WARNING, "Given IFPLines=%d, but IFP is not enabled, reseting IFPLines to 0.\n", c->m_ifpLines ); + } + c->m_ifpLines = !c->m_ifp ? 0: (c->m_ifpLines == -1 ? 2: c->m_ifpLines); + if( c->m_alfUnitSize < 0 ) c->m_alfUnitSize = c->m_CTUSize; @@ -1328,6 +1354,20 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) } vvenc_confirmParameter( c, c->m_GOPSize <= 8 && c->m_sliceTypeAdapt > 0, "Slice type adaptation for GOPSize <= 8 not supported" ); + if( c->m_minIntraDist < 0 ) + { + if( c->m_sliceTypeAdapt > 0 ) + { + c->m_minIntraDist = std::min( c->m_GOPSize, c->m_IntraPeriod ); + } + else + { + c->m_minIntraDist = 0; + } + } + vvenc_confirmParameter( c, c->m_minIntraDist > 0 && c->m_sliceTypeAdapt == 0, "STA: Setting a minimal intra distance only works with slice type adaptation enabled" ); + vvenc_confirmParameter( c, c->m_minIntraDist > c->m_IntraPeriod && c->m_sliceTypeAdapt > 0, "STA: Minimal intra distance can not be larger than intra period" ); + // set number of lead / trail frames in segment mode const int staFrames = c->m_sliceTypeAdapt ? c->m_GOPSize : 0; const int mctfFrames = c->m_vvencMCTF.MCTF || c->m_usePerceptQPA ? VVENC_MCTF_RANGE : 0; @@ -2033,9 +2073,18 @@ static bool checkCfgParameter( vvenc_config *c ) vvenc_confirmParameter(c, c->m_traceFile[0] != '\0' && c->m_maxParallelFrames > 1 && c->m_numThreads > 1, "Tracing and frame parallel encoding not supported" ); #endif vvenc_confirmParameter(c, c->m_maxParallelFrames > c->m_GOPSize && c->m_GOPSize != 1, "Max parallel frames should be less then GOP size" ); - vvenc_confirmParameter(c, c->m_fppLinesSynchro && c->m_alfTempPred != 0, "FPP CTU-lines synchro: ALFTempPred is not supported (must be disabled)" ); - vvenc_confirmParameter(c, c->m_fppLinesSynchro && c->m_numTileRows > 1, "FPP CTU-lines synchro: Only single tile row is supported" ); - vvenc_confirmParameter(c, c->m_fppLinesSynchro < 0, "fppLinesSynchro must be >= 0" ); + vvenc_confirmParameter(c, c->m_ifpLines && c->m_alfTempPred != 0, "IFP: ALFTempPred is not supported (must be disabled)" ); + vvenc_confirmParameter(c, c->m_ifpLines && c->m_numTileRows > 1, "IFP: Only single tile row is supported" ); + vvenc_confirmParameter(c, c->m_ifpLines < 0, "IFPLines must be >= 0" ); + vvenc_confirmParameter(c, c->m_ifp && c->m_ifpLines == 0, "IFP requires IFPLines=[-1 or >0]" ); + } + if( c->m_ifpLines ) + { + const int minNumThreadsIfp = getNumThreadsDefault( c ) * 2; + if( c->m_numThreads < minNumThreadsIfp ) + { + msg.log( VVENC_WARNING, "Using IFP at low number of threads (<%d) does not provide more speedup, consider disabling IFP.\n", minNumThreadsIfp ); + } } vvenc_confirmParameter(c, c->m_explicitAPSid < 0 || c->m_explicitAPSid > 7, "ExplicitAPDid out of range [0 .. 7]" ); @@ -2339,7 +2388,7 @@ VVENC_DECL int vvenc_init_default( vvenc_config *c, int width, int height, int f c->m_RCMaxBitrate = 0; // maximum instantaneous bitrate in bps c->m_numThreads = -1; // number of worker threads (-1: auto, 0: off, else set worker threads) - + iRet = vvenc_init_preset( c, preset ); return iRet; } @@ -2980,6 +3029,12 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve } else css << "single-pass"; +#if !IFP_RC_DETERMINISTIC + if( c->m_ifpLines && c->m_numThreads > 1 ) + { + css << " (non-deterministic due to IFP)"; + } +#endif } else { @@ -3220,7 +3275,7 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve css << "\n" << loglvl << "PARALLEL PROCESSING CFG: "; css << "NumThreads:" << c->m_numThreads << " "; css << "MaxParallelFrames:" << c->m_maxParallelFrames << " "; - css << "FppLinesSynchro:" << ( int ) c->m_fppLinesSynchro << " "; + css << "IFP:" << (c->m_ifp ? 1: 0) << " (IFPLines:" << (int)c->m_ifpLines << ")" << " "; if( c->m_picPartitionFlag ) { css << "TileParallelCtuEnc:" << c->m_tileParallelCtuEnc << " "; diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp index 3eb4f5758..d791f8b2e 100644 --- a/source/Lib/vvenc/vvencimpl.cpp +++ b/source/Lib/vvenc/vvencimpl.cpp @@ -70,6 +70,9 @@ POSSIBILITY OF SUCH DAMAGE. # include #endif +#if defined( TARGET_SIMD_ARM ) +# include "CommonLib/arm/CommonDefARM.h" +#endif #if _DEBUG #define HANDLE_EXCEPTION 0 @@ -796,6 +799,9 @@ const char* VVEncImpl::setSIMDExtension( const char* simdId ) try { read_x86_extension_flags( request_ext ); +#if defined( TARGET_SIMD_ARM ) + read_arm_extension_flags( request_ext == x86_simd::UNDEFINED ? arm_simd::UNDEFINED : request_ext != x86_simd::SCALAR ? arm_simd::NEON : arm_simd::SCALAR ); +#endif } catch( Exception& ) {