From d674ee087bfc227a4b19023973c2afaff7f6275c Mon Sep 17 00:00:00 2001 From: Adam Wieckowski <70575289+adamjw24@users.noreply.github.com> Date: Tue, 13 Feb 2024 14:11:41 +0100 Subject: [PATCH] Harmonization IFP, AlfUnitSize, bugfixes, update version (#351) * Harmonization IFP, AlfUnitSize, bugfixes... * Update version to v1.11.0-rc1 --- CMakeLists.txt | 4 +- changelog.txt | 45 ++++++++ source/Lib/CommonLib/x86/DepQuantX86.h | 3 + .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 100 ++++++++++-------- source/Lib/EncoderLib/EncAdaptiveLoopFilter.h | 9 +- source/Lib/EncoderLib/EncCu.cpp | 2 - source/Lib/EncoderLib/EncSlice.cpp | 38 ++++--- source/Lib/EncoderLib/EncSlice.h | 2 + source/Lib/vvenc/vvencCfg.cpp | 6 ++ 9 files changed, 143 insertions(+), 66 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85e6a783e..aeb7e5778 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.13.0 ) endif() # project name -project( vvenc VERSION 1.10.0 ) +project( vvenc VERSION 1.11.0 ) # set alternative version numbering for release candidates -#set( PROJECT_VERSION_RC rc1 ) +set( PROJECT_VERSION_RC rc1 ) if( PROJECT_VERSION_RC ) set( PROJECT_VERSION "${PROJECT_VERSION}-${PROJECT_VERSION_RC}" ) endif() diff --git a/changelog.txt b/changelog.txt index 970efed75..8ed120fe2 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,5 +1,49 @@ +///////////////////////////////////////// +tag 1.11.0-rc1 + +* libvvenc: + - added library parameters: + - vvenc_config::m_forceScc to force a specific SCC detection decision for all frames + (0: do not use, 1/2/3: no SCC/weak SCC/strong SCC) + - vvenc_config::m_ifp to enable inter-frame parallelism + - vvenc_config::m_minIntraDist to define minimal distance between intra frames (in presentation order) + - changed parameters: + - vvenc_config::m_RCMaxBitrate can now be specified as factor of target bitrate, use the macro + VVENC_SET_MAXRATE_FACTOR to set the value (for negative values, the absolute value 
will be interpreted as + a 4-bit fixed-point factor) + - vvenc_config::m_numThreads will now be set by default to 12 for 5K and higher resolutions + - renamed library parameters: + - vvenc_config::m_fppLinesSynchro to vvenc_config::m_ifpLines + - added an affine speedup based on Pejman et al., ICIP 2023. + - added vectorized DQ implementation + - finalized inter-frame parallelization (previously FPP) improving performance during + execution with a high number of threads (harmonized with rate control) + - improved SCC detection by reducing false positives + - overall speedups: + - 6% for fast/slow/slower + - 12% for medium, 25%+ for multi-threaded execution with long intra periods + +* vvencFFapp: + - added parameters: + - IFP: enables the usage of inter-frame parallelism + - IFPLines: defines the synchronization offset in CTU lines + - ForceSCC: forces a specific SCC detection decision + - MinIntraDistance: defines a minimal distance between intra pictures in presentation order + - changed parameters: + - FppLinesSynchro: now deprecated, please use IFP and IFPLines + - NumThreads: if set to default (-1), will now cause the encoder to use 12 threads for 5K and higher resolutions + - MaxRate: if specified with suffix "x", e.g. "2x", the value will be interpreted as multiple of target bitrate + +* vvencapp: + - added parameters: + - ifp: when set, will enable inter-frame parallelism with a default synchronization offset of two CTU lines + - changed parameters: + - threads, t: if set to default (-1), will now cause the encoder to use 12 threads for 5K and higher resolutions + - maxrate, m: if specified with suffix "x", e.g. 
"2x", the value will be interpreted as multiple of target bitrate + ///////////////////////////////////////// tag 1.10.0 + * libvvenc: - added library parameters: - vvenc_config::m_poc0idr to force POC0 to be an IDR (otherwise per default it will now be a RASL picture) @@ -20,6 +64,7 @@ tag 1.10.0 ///////////////////////////////////////// tag 1.9.1 + * libvvenc: - added library parameters: - vvenc_config::m_maxPicWidth, vvenc_config::m_maxPicHeight to signal maximal possible picture size when diff --git a/source/Lib/CommonLib/x86/DepQuantX86.h b/source/Lib/CommonLib/x86/DepQuantX86.h index 77429699a..74e90c2c0 100644 --- a/source/Lib/CommonLib/x86/DepQuantX86.h +++ b/source/Lib/CommonLib/x86/DepQuantX86.h @@ -1291,6 +1291,9 @@ namespace DQIntern } m_state_curr.m_gtxFracBitsArray = RateEstimator::gtxFracBits(); + //memset( m_state_curr.tplAcc, 0, sizeof( m_state_curr.tplAcc ) ); // will be set in updateStates{,EOS} before first access + memset( m_state_curr.sum1st, 0, sizeof( m_state_curr.sum1st ) ); // will be accessed in setRiceParam before updateState{,EOS} + //memset( m_state_curr.absVal, 0, sizeof( m_state_curr.absVal ) ); // will be set in updateStates{,EOS} before first access const int numCtx = isLuma( compID ) ? 21 : 11; const CoeffFracBits* const cffBits = gtxFracBits(); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index b3a0cba2e..4db68aeef 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -1000,9 +1000,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW m_CABACEstimator = &cabacEstimator; m_CtxCache = &ctxCache; - - int alfUnitSize = encCfg.m_ifpLines ? m_encCfg->m_CTUSize: m_encCfg->m_alfUnitSize; - initASU( alfUnitSize ); + initASU( m_encCfg->m_alfUnitSize ); const int numBins = m_encCfg->m_useNonLinearAlfLuma || m_encCfg->m_useNonLinearAlfChroma ? 
MaxAlfNumClippingValues : 1; @@ -1654,7 +1652,9 @@ void EncAdaptiveLoopFilter::deriveFilter( Picture& pic, CodingStructure& cs, con { return; } - const int numAsus = m_encCfg->m_ifpLines && numCtus != m_numAsusInPic ? numCtus: m_numAsusInPic; + const int numCtuLines = numCtus / cs.pcv->widthInCtus; + const int numAsusVer = numCtuLines / m_numCtusInAsuWidth + ( ( numCtuLines % m_numCtusInAsuWidth ) ? 1 : 0 ); + const int numAsus = numAsusVer * m_numAsusInWidth; initCABACEstimator( cs.slice ); @@ -2156,42 +2156,23 @@ void EncAdaptiveLoopFilter::xStoreAlfAsuFilterIdx( CodingStructure& cs, int ctuX } } -double EncAdaptiveLoopFilter::xCodeAlfAsuEnabledFlag( CodingStructure& cs, int ctuIdx, const int compIdx, AlfParam* alfParam, const double ctuLambda ) -{ - m_CABACEstimator->codeAlfCtuEnabledFlag( cs, ctuIdx, compIdx ); - return ctuLambda * FRAC_BITS_SCALE * m_CABACEstimator->getEstFracBits(); -} - -double EncAdaptiveLoopFilter::xCodeAlfAsuLumaFilterIdx( CodingStructure& cs, int asuIdx, int ctuIdx, const double ctuLambda ) -{ - m_CABACEstimator->resetBits(); - m_CABACEstimator->codeAlfCtuFilterIndex( cs, ctuIdx ); - return m_numCtusInAsu[asuIdx] * ctuLambda * FRAC_BITS_SCALE * m_CABACEstimator->getEstFracBits(); -} - -double EncAdaptiveLoopFilter::xCodeAlfAsuAlternative( CodingStructure& cs, int asuIdx, int ctuIdx, const int compIdx, AlfParam* alfParam, const double ctuLambda ) -{ - m_CABACEstimator->codeAlfCtuAlternative( cs, ctuIdx, compIdx, alfParam ); - return m_numCtusInAsu[asuIdx] * ctuLambda * FRAC_BITS_SCALE * m_CABACEstimator->getEstFracBits(); -} - -double xCodeAlfCtuEnabledFlag( CodingStructure& cs, int ctuIdx, const int compIdx, AlfParam* alfParam, CABACWriter* CABACEstimator, const double ctuLambda ) +double EncAdaptiveLoopFilter::xCodeAlfAsuEnabledFlag( CodingStructure& cs, int ctuIdx, const int compIdx, CABACWriter* CABACEstimator, const double ctuLambda ) { CABACEstimator->codeAlfCtuEnabledFlag( cs, ctuIdx, compIdx ); return ctuLambda * FRAC_BITS_SCALE * 
CABACEstimator->getEstFracBits(); } -double xCodeAlfCtuLumaFilterIdx( CodingStructure& cs, int asuIdx, int ctuIdx, CABACWriter* CABACEstimator, const double ctuLambda ) +double EncAdaptiveLoopFilter::xCodeAlfAsuLumaFilterIdx( CodingStructure& cs, int asuIdx, int ctuIdx, CABACWriter* CABACEstimator, const double ctuLambda ) { CABACEstimator->resetBits(); CABACEstimator->codeAlfCtuFilterIndex( cs, ctuIdx ); - return ctuLambda * FRAC_BITS_SCALE * CABACEstimator->getEstFracBits(); + return m_numCtusInAsu[asuIdx] * ctuLambda * FRAC_BITS_SCALE * CABACEstimator->getEstFracBits(); } -double xCodeAlfCtuAlternative( CodingStructure& cs, int asuIdx, int ctuIdx, const int compIdx, AlfParam* alfParam, CABACWriter* CABACEstimator, const double ctuLambda ) +double EncAdaptiveLoopFilter::xCodeAlfAsuAlternative( CodingStructure& cs, int asuIdx, int ctuIdx, const int compIdx, AlfParam* alfParam, CABACWriter* CABACEstimator, const double ctuLambda ) { CABACEstimator->codeAlfCtuAlternative( cs, ctuIdx, compIdx, alfParam ); - return ctuLambda * FRAC_BITS_SCALE * CABACEstimator->getEstFracBits(); + return m_numCtusInAsu[asuIdx] * ctuLambda * FRAC_BITS_SCALE * CABACEstimator->getEstFracBits(); } bool EncAdaptiveLoopFilter::isSkipAlfForFrame( const Picture& pic ) const @@ -2249,10 +2230,10 @@ double EncAdaptiveLoopFilter::deriveCtbAlfEnableFlags( CodingStructure& cs, Chan m_CABACEstimator->resetBits(); m_ctuEnableFlag[compID][ctuIdx] = 1; double costOn = distUnfilterCtu; - costOn += xCodeAlfAsuEnabledFlag( cs, ctuIdx, compID, &m_alfParamTemp, ctuLambda ); + costOn += xCodeAlfAsuEnabledFlag( cs, ctuIdx, compID, m_CABACEstimator, ctuLambda ); if( isLuma( channel ) ) { - costOn += xCodeAlfAsuLumaFilterIdx( cs, asuIdx, ctuIdx, ctuLambda ); + costOn += xCodeAlfAsuLumaFilterIdx( cs, asuIdx, ctuIdx, m_CABACEstimator, ctuLambda ); } ctxTempBest = AlfCtx( m_CABACEstimator->getCtx() ); @@ -2272,7 +2253,7 @@ double EncAdaptiveLoopFilter::deriveCtbAlfEnableFlags( CodingStructure& cs, Chan 
m_CABACEstimator->getCtx() = AlfCtx( ctxTempAltStart ); m_CABACEstimator->resetBits(); m_ctuAlternative[compID][ctuIdx] = altIdx; - double r_altCost = xCodeAlfAsuAlternative( cs, asuIdx, ctuIdx, compID, &m_alfParamTemp, ctuLambda ); + double r_altCost = xCodeAlfAsuAlternative( cs, asuIdx, ctuIdx, compID, &m_alfParamTemp, m_CABACEstimator, ctuLambda ); double altDist = 0.; altDist += doClip ? m_alfCovariance[compID][asuIdx][0].calcErrorForCoeffs( m_filterClippSet[altIdx], m_filterCoeffSet[altIdx], numCoeff, invFactor ) : m_alfCovariance[compID][asuIdx][0].calcErrorForCoeffs( m_filterClippSet[altIdx], m_filterCoeffSet[altIdx], numCoeff, invFactor ); @@ -2292,7 +2273,7 @@ double EncAdaptiveLoopFilter::deriveCtbAlfEnableFlags( CodingStructure& cs, Chan m_CABACEstimator->getCtx() = AlfCtx( ctxTempStart ); m_CABACEstimator->resetBits(); m_ctuEnableFlag[compID][ctuIdx] = 0; - double costOff = distUnfilterCtu + xCodeAlfAsuEnabledFlag( cs, ctuIdx, compID, &m_alfParamTemp, ctuLambda ); + double costOff = distUnfilterCtu + xCodeAlfAsuEnabledFlag( cs, ctuIdx, compID, m_CABACEstimator, ctuLambda ); uint8_t enable = 0; if( costOn < costOff ) @@ -4418,13 +4399,41 @@ void EncAdaptiveLoopFilter::initDistortionCcalf( int numCtus ) } } -void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWriter* CABACEstimator, CtxCache* ctxCache, int asuIdx ) +void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWriter* CABACEstimator, CtxCache* ctxCache, int ctuIdx ) { if( isSkipAlfForFrame( *cs.picture ) ) { return; } + int asuIdx = 0; + if( m_numCtusInAsuHeight > 1 ) + { + // using ASUs + const PreCalcValues& pcv = *cs.pcv; + const int xC = (ctuIdx % pcv.widthInCtus) << pcv.maxCUSizeLog2; + const int yC = (ctuIdx / pcv.widthInCtus) << pcv.maxCUSizeLog2; + + const int wC = std::min( m_maxCUWidth, m_picWidth - xC ); + const int hC = std::min( m_maxCUHeight, m_picHeight - yC ); + + const int xA = xC & ~(m_maxAsuWidth - 1); + const int yA = yC & 
~(m_maxAsuHeight - 1); + const int wA = std::min( m_maxAsuWidth, m_picWidth - xA ); + const int hA = std::min( m_maxAsuHeight, m_picHeight - yA ); + + // bottom-right CTU in ASU? + if(xA + wA != xC + wC || yA + hA != yC + hC) + { + return; + } + asuIdx = ( yA / m_maxAsuHeight ) * m_numAsusInWidth + ( xA / m_maxAsuWidth ); + } + else + { + asuIdx = ctuIdx; + } + int ctuX, ctuY; getAsuCtuXY(asuIdx, ctuX, ctuY); int ctbIdx = ctuY * cs.pcv->widthInCtus + ctuX; @@ -4467,9 +4476,10 @@ void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWrite //rate CABACEstimator->getCtx() = AlfCtx(ctxTempStart); CABACEstimator->resetBits(); - double rateOn = xCodeAlfCtuEnabledFlag(cs, ctbIdx, COMP_Y, nullptr, CABACEstimator, ctuLambda); + double rateOn = xCodeAlfAsuEnabledFlag(cs, ctbIdx, COMP_Y, CABACEstimator, ctuLambda); alfCtbFilterSetIndex[ctbIdx] = filterSetIdx; - rateOn += xCodeAlfCtuLumaFilterIdx(cs, asuIdx, ctbIdx, CABACEstimator, ctuLambda); + rateOn += xCodeAlfAsuLumaFilterIdx(cs, asuIdx, ctbIdx, CABACEstimator, ctuLambda); + //distortion double dist = distUnfilterCtb; @@ -4527,7 +4537,7 @@ void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWrite //rate CABACEstimator->getCtx() = AlfCtx(ctxTempStart); CABACEstimator->resetBits(); - double rateCost = xCodeAlfCtuEnabledFlag(cs, ctbIdx, COMP_Y, nullptr, CABACEstimator, ctuLambda); + double rateCost = xCodeAlfAsuEnabledFlag(cs, ctbIdx, COMP_Y, CABACEstimator, ctuLambda); //cost const double costOff = distUnfilterCtb + rateCost; DTRACE(g_trace_ctx, D_MISC, "\t\t\t costOff=%.2f\n", costOff); @@ -4567,7 +4577,7 @@ void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWrite CABACEstimator->getCtx() = AlfCtx(ctxTempStart); CABACEstimator->resetBits(); //ctb flag - double rateCostOn = xCodeAlfCtuEnabledFlag(cs, ctbIdx, compId, nullptr, CABACEstimator, ctuLambda); + double rateCostOn = xCodeAlfAsuEnabledFlag(cs, ctbIdx, compId, CABACEstimator, ctuLambda); double dist = 
MAX_DOUBLE; int numAlts = alfParamTemp.numAlternativesChroma; ctxTempBest = AlfCtx(CABACEstimator->getCtx()); @@ -4581,7 +4591,7 @@ void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWrite CABACEstimator->getCtx() = AlfCtx(ctxTempAltStart); CABACEstimator->resetBits(); m_ctuAlternative[compId][ctbIdx] = altIdx; - double r_altCost = xCodeAlfCtuAlternative(cs, asuIdx, ctbIdx, compId, &alfParamTemp, CABACEstimator, ctuLambda); + double r_altCost = xCodeAlfAsuAlternative(cs, asuIdx, ctbIdx, compId, &alfParamTemp, CABACEstimator, ctuLambda); //distortion for (int i = 0; i < MAX_NUM_ALF_CHROMA_COEFF; i++) @@ -4612,7 +4622,7 @@ void EncAdaptiveLoopFilter::selectFilterForCTU( CodingStructure& cs, CABACWrite //rate CABACEstimator->getCtx() = AlfCtx(ctxTempStart); CABACEstimator->resetBits(); - double rateCost = xCodeAlfCtuEnabledFlag(cs, ctbIdx, compId, &alfParamTemp, CABACEstimator, ctuLambda); + double rateCost = xCodeAlfAsuEnabledFlag(cs, ctbIdx, compId, CABACEstimator, ctuLambda); //cost const double costOff = distUnfilterCtu + rateCost; DTRACE(g_trace_ctx, D_MISC, "Chroma_%d: \t\t\t costOn =%.2f, costOff =%.2f\n", compId, costOn, costOff ); @@ -4807,9 +4817,9 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa //rate m_CABACEstimator->getCtx() = AlfCtx(ctxTempStart); m_CABACEstimator->resetBits(); - double rateOn = xCodeAlfAsuEnabledFlag( cs, ctbIdx, COMP_Y, &m_alfParamTemp, ctuLambda ); + double rateOn = xCodeAlfAsuEnabledFlag( cs, ctbIdx, COMP_Y, m_CABACEstimator, ctuLambda ); alfCtbFilterSetIndex[ctbIdx] = filterSetIdx; - rateOn += xCodeAlfAsuLumaFilterIdx( cs, asuIdx, ctbIdx, ctuLambda ); + rateOn += xCodeAlfAsuLumaFilterIdx( cs, asuIdx, ctbIdx, m_CABACEstimator, ctuLambda ); //distortion double dist = distUnfilterCtb; @@ -4880,7 +4890,7 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa //rate m_CABACEstimator->getCtx() = AlfCtx(ctxTempStart); m_CABACEstimator->resetBits(); - 
double rateCost = xCodeAlfAsuEnabledFlag( cs, ctbIdx, COMP_Y, &m_alfParamTemp, ctuLambda ); + double rateCost = xCodeAlfAsuEnabledFlag( cs, ctbIdx, COMP_Y, m_CABACEstimator, ctuLambda ); //cost const double costOff = distUnfilterCtb + rateCost; DTRACE( g_trace_ctx, D_MISC, "\t\t\t costOff=%.2f\n", costOff ); @@ -5063,7 +5073,7 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa m_CABACEstimator->getCtx() = AlfCtx(ctxTempStart); m_CABACEstimator->resetBits(); //ctb flag - double rateCostOn = xCodeAlfAsuEnabledFlag( cs, ctbIdx, compId, &m_alfParamTemp, ctuLambda ); + double rateCostOn = xCodeAlfAsuEnabledFlag( cs, ctbIdx, compId, m_CABACEstimator, ctuLambda ); double dist = MAX_DOUBLE; int numAlts = m_alfParamTemp.numAlternativesChroma; ctxTempBest = AlfCtx( m_CABACEstimator->getCtx() ); @@ -5077,7 +5087,7 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa m_CABACEstimator->getCtx() = AlfCtx( ctxTempAltStart ); m_CABACEstimator->resetBits(); m_ctuAlternative[compId][ctbIdx] = altIdx; - double r_altCost = xCodeAlfAsuAlternative( cs, asuIdx, ctbIdx, compId, &m_alfParamTemp, ctuLambda ); + double r_altCost = xCodeAlfAsuAlternative( cs, asuIdx, ctbIdx, compId, &m_alfParamTemp, m_CABACEstimator, ctuLambda ); //distortion for (int i = 0; i < MAX_NUM_ALF_CHROMA_COEFF; i++) @@ -5109,7 +5119,7 @@ void EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa //rate m_CABACEstimator->getCtx() = AlfCtx(ctxTempStart); m_CABACEstimator->resetBits(); - double rateCost = xCodeAlfAsuEnabledFlag( cs, ctbIdx, compId, &m_alfParamTemp, ctuLambda ); + double rateCost = xCodeAlfAsuEnabledFlag( cs, ctbIdx, compId, m_CABACEstimator, ctuLambda ); //cost const double costOff = distUnfilterCtu + rateCost; DTRACE(g_trace_ctx, D_MISC, "Chroma_%d: \t\t\t costOn =%.2f, costOff =%.2f\n", compId, costOn, costOff ); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h 
index 6440b471b..ef979a6ee 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.h @@ -415,7 +415,7 @@ class EncAdaptiveLoopFilter : public AdaptiveLoopFilter void deriveCcAlfFilter ( Picture& pic, CodingStructure& cs, int numCtus ); void applyCcAlfFilterCTU ( CodingStructure& cs, ComponentID compID, const int ctuRsAddr, PelStorage& alfTempCtuBuf ); void deriveFilter ( Picture& pic, CodingStructure& cs, const double* lambdas, const int numCtus ); - void selectFilterForCTU ( CodingStructure& cs, CABACWriter* CABACEstimator, CtxCache* ctxCache, int ctuIdx ); + void selectFilterForCTU ( CodingStructure& cs, CABACWriter* CABACEstimator, CtxCache* ctxCache, int ctuIdx ); void reconstructCTU_MT ( Picture& pic, CodingStructure& cs, const int ctuRsAddr, PelStorage& alfTempCtuBuf ); void reconstructCTU ( Picture& pic, CodingStructure& cs, const CPelUnitBuf& recBuf, const int ctuRsAddr, PelStorage& alfTempCtuBuf ); // void alfReconstructor ( CodingStructure& cs ); @@ -423,13 +423,14 @@ class EncAdaptiveLoopFilter : public AdaptiveLoopFilter void initDerivation ( Slice& slice ); void resetFrameStats ( bool ccAlfEnabled ); bool isSkipAlfForFrame ( const Picture& pic ) const; + int getAsuHeightInCtus () { return m_numCtusInAsuHeight; } private: void xStoreAlfAsuEnabledFlag ( CodingStructure& cs, int ctuX, int ctuY, int ctuIdx, const int compIdx, bool flag ); void xStoreAlfAsuAlternative ( CodingStructure& cs, int ctuX, int ctuY, int ctuIdx, const int compIdx, const uint8_t alt ); void xStoreAlfAsuFilterIdx ( CodingStructure& cs, int ctuX, int ctuY, int ctuIdx, const short fltIdx, short* alfCtbFilterSetIndex ); - double xCodeAlfAsuEnabledFlag ( CodingStructure& cs, int ctuIdx, const int compIdx, AlfParam* alfParam, const double ctuLambda ); - double xCodeAlfAsuAlternative ( CodingStructure& cs, int asuIdx, int ctuIdx, const int compIdx, AlfParam* alfParam, const double ctuLambda ); - double xCodeAlfAsuLumaFilterIdx ( 
CodingStructure& cs, int asuIdx, int ctuIdx, const double ctuLambda ); + double xCodeAlfAsuEnabledFlag ( CodingStructure& cs, int ctuIdx, const int compIdx, CABACWriter* CABACEstimator, const double ctuLambda ); + double xCodeAlfAsuAlternative ( CodingStructure& cs, int asuIdx, int ctuIdx, const int compIdx, AlfParam* alfParam, CABACWriter* CABACEstimator, const double ctuLambda ); + double xCodeAlfAsuLumaFilterIdx ( CodingStructure& cs, int asuIdx, int ctuIdx, CABACWriter* CABACEstimator, const double ctuLambda ); void xGetStatisticsCTU ( Picture& pic, CodingStructure& cs, PelUnitBuf& recYuv, const int xPos, const int yPos, const int asuRsAddr, PelStorage& alfTempCtuBuf ); void alfEncoder ( CodingStructure& cs, AlfParam& alfParam, const ChannelType channel, const double lambdaChromaWeight, const int numAsus, const int numCtus ); diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index 3b962c03f..05b6048a1 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -3105,8 +3105,6 @@ void EncCu::xCheckRDCostInterIMV(CodingStructure *&tempCS, CodingStructure *&bes { bcwLoopNum = 1; } - _CASE( _UNIT_AREA_AT(tempCS->area, 96, 640, 32, 32) && tempCS->slice->poc == 16 ) - _BREAK; for (int i = 1; i <= IMV_HPEL; i++) { diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp index 269596226..10942bd9a 100644 --- a/source/Lib/EncoderLib/EncSlice.cpp +++ b/source/Lib/EncoderLib/EncSlice.cpp @@ -279,6 +279,12 @@ void EncSlice::init( const VVEncCfg& encCfg, } ctuEncParams.resize( sizeInCtus ); setArbitraryWppPattern( *pps.pcv, m_ctuAddrMap, 3 ); + + const unsigned asuHeightInCtus = m_pALF->getAsuHeightInCtus(); + const unsigned numDeriveLines = encCfg.m_ifpLines ? 
+ std::min( ((encCfg.m_ifpLines & (~(asuHeightInCtus - 1))) + asuHeightInCtus), pps.pcv->heightInCtus ) : pps.pcv->heightInCtus; + m_alfDeriveLine = numDeriveLines - 1; + m_alfDeriveCtu = numDeriveLines * pps.pcv->widthInCtus - 1; } @@ -892,7 +898,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) const UnitArea& ctuArea = ctuEncParam->ctuArea; const bool wppSyncEnabled = cs.sps->entropyCodingSyncEnabled; const TaskType currState = processStates[ ctuRsAddr ]; - const int syncLines = encSlice->m_pcEncCfg->m_ifpLines; + const unsigned syncLines = encSlice->m_pcEncCfg->m_ifpLines; DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) ); DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); @@ -914,7 +920,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) if( syncLines ) { const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ); - if( lineStart && !refPicCtuLineReady( slice, ctuPosY + syncLines, pcv ) ) + if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) ) { return false; } @@ -1161,19 +1167,18 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat ); // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER; } break; case ALF_DERIVE_FILTER: { - const unsigned deriveFilterCtu = syncLines ? 
pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; if( ctuRsAddr == deriveFilterCtu ) { // ensure statistics from all previous ctu's have been collected - int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus; - for( int y = 0; y < numCheckLines; y++ ) + for( int y = 0; y <= encSlice->m_alfDeriveLine; y++ ) { for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) { @@ -1200,7 +1205,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) if( ctuRsAddr == deriveFilterCtu ) { encSlice->m_pALF->initDerivation( slice ); - encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus ); + encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 ); encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false ); } else if( syncLines ) @@ -1225,9 +1230,16 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case ALF_RECONSTRUCT: { // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? 
pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT ) return false; + else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 ) + { + const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus(); + const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 ); + if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) + return false; + } if( checkReadyState ) return true; @@ -1277,7 +1289,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat ); // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * std::min(syncLines + 1, pcv.heightInCtus) - 1: pcv.sizeInCtus - 1; processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER; } break; @@ -1285,11 +1297,11 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case CCALF_DERIVE_FILTER: { // synchronization dependencies - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * std::min(syncLines + 1, pcv.heightInCtus) - 1: pcv.sizeInCtus - 1; if( ctuRsAddr == deriveFilterCtu ) { // ensure statistics from all previous ctu's have been collected - int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus; + int numCheckLines = syncLines ? 
std::min(syncLines + 1, pcv.heightInCtus): pcv.heightInCtus; for( int y = 0; y < numCheckLines; y++ ) { for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) @@ -1316,7 +1328,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) { if( ctuRsAddr == deriveFilterCtu ) { - encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus ); + encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * std::min(syncLines + 1, pcv.heightInCtus): pcv.sizeInCtus ); } else if( syncLines ) { @@ -1337,7 +1349,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) case CCALF_RECONSTRUCT: { // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) - const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1; + const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * std::min(syncLines + 1, pcv.heightInCtus) - 1: pcv.sizeInCtus - 1; if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT ) return false; diff --git a/source/Lib/EncoderLib/EncSlice.h b/source/Lib/EncoderLib/EncSlice.h index 862e2872c..35ae9ea6a 100644 --- a/source/Lib/EncoderLib/EncSlice.h +++ b/source/Lib/EncoderLib/EncSlice.h @@ -112,6 +112,8 @@ class EncSlice Ctx m_entropyCodingSyncContextState; ///< context storage for state of contexts at the wavefront/WPP/entropy-coding-sync second CTU of tile-row used for writing std::vector m_syncPicCtx; ///< context storage for state of contexts at the wavefront/WPP/entropy-coding-sync second CTU of tile-row used for estimation SliceType m_encCABACTableIdx; + unsigned m_alfDeriveLine; + unsigned m_alfDeriveCtu; double m_saoDisabledRate[ MAX_NUM_COMP ][ VVENC_MAX_TLAYER ]; bool m_saoEnabled[ MAX_NUM_COMP ]; diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index 89cebf119..391179d0e 100644 --- 
a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -949,6 +949,12 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) if( c->m_alfUnitSize < 0 ) c->m_alfUnitSize = c->m_CTUSize; + if( c->m_ifp && c->m_alfUnitSize != c->m_CTUSize ) + { + msg.log( VVENC_WARNING, "IFP is enabled. For better performance, AlfUnitSize is adjusted to the CTUSize.\n" ); + c->m_alfUnitSize = c->m_CTUSize; + } + // quantization threshold if( c->m_quantThresholdVal < 0 ) {