diff --git a/CMakeLists.txt b/CMakeLists.txt index cb2e5cf95..c032ef1c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.13.0 ) endif() # project name -project( vvenc VERSION 1.11.1 ) +project( vvenc VERSION 1.12.0 ) # set alternative version numbering for release candidates -#set( PROJECT_VERSION_RC rc1 ) +set( PROJECT_VERSION_RC rc1 ) if( PROJECT_VERSION_RC ) set( PROJECT_VERSION "${PROJECT_VERSION}-${PROJECT_VERSION_RC}" ) endif() diff --git a/changelog.txt b/changelog.txt index 72f45f948..ebfcf51cd 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,41 @@ +///////////////////////////////////////// +tag 1.12.0-rc1 + +* libvvenc: + - added parameters: + - vvenc_config::m_mtProfile: defines how aggressively to boost multi-threading (0...3, where 0 is + current multi-threading, and 3 contains all WPP, IFP and tile usage, use -1 for auto behavior + depending on resolution and the number of threads used) + - changed parameters (vvenc_config::): + - m_numTileCols and m_numTileRows from uint32_t to int32_t (-1 for auto behavior, see m_mtProfile) + - m_ifp and m_entropyCodingSynchEnabled from bool to int8_t (-1 for auto behavior, see m_mtProfile) + - enabled capped constant quality factor (CQF) mode + - constant quality mode (vvenc_config::m_QP set and ::m_usePerceptQPA enabled) with rate capping + enabled with an absolute limit (::m_RCMaxBitrate) + - minor (~5%) speedups to faster preset + - improvements to low-delay encoding configuration + - other minor optimizations, cleanups and bugfixes + +* vvencFFapp: + - added parameters: + - MTProfile: set the multi-threading profile (0: current default - 3: use all MT optimizers, use + -1/auto to automatically set depending on resolution and number of threads used) + - changed parameters: + - WaveFrontSynchro, IFP, Tiles: default value set to -1, with the behavior defined by MTProfile + parameter, resolution and number of used threads + +* vvencapp: + - added parameters: + - mtprofile: set the multi-threading profile (0: current default - 3: use all MT optimizers, use + -1/auto to automatically set depending on resolution and number of threads used) + - internal-bitdepth: sets internal encoding bitdepth, i.e. the output bitstream bitdepth + - refreshtype: specifies the refresh-type used at random access points + - decodedpicturehash: enable/disable decoded picture hash (DPH) SEI generation + - changed parameters: + - ifp, tiles: default value set to -1, with the behavior defined by mtprofile parameter, resolution + and number of used threads + - format: now accepts yuv400 and yuv400_10 as parameter for monochrome input + ///////////////////////////////////////// tag 1.11.1 diff --git a/source/Lib/CommonLib/x86/MCTFX86.h b/source/Lib/CommonLib/x86/MCTFX86.h index 6a36acd91..7f8c701e7 100644 --- a/source/Lib/CommonLib/x86/MCTFX86.h +++ b/source/Lib/CommonLib/x86/MCTFX86.h @@ -461,6 +461,7 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride, const int yOffset = -1; const Pel* sourceCol = buf + base + yOffset * buffStride; const Pel* origCol = org; + __m256i verror = _mm256_setzero_si256(); for( int x1 = 0; x1 < w; x1 += 16, sourceCol += 16, origCol += 16 ) { @@ -531,15 +532,7 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride, xsum = _mm256_sub_epi16( xsum, xorg ); xsum = _mm256_madd_epi16( xsum, xsum ); - __m128i - ysum = _mm_add_epi32( _mm256_castsi256_si128( xsum ), _mm256_extracti128_si256( xsum, 1 ) ); - xerror = _mm_hadd_epi32( xerror, ysum ); - error = _mm_cvtsi128_si32( xerror ); - - if( error > besterror ) - { - return error; - } + verror = _mm256_add_epi32( verror, xsum ); } else { @@ -551,7 +544,8 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride, } GCC_WARNING_RESET - + + xerror = _mm_add_epi32( _mm256_castsi256_si128( verror ), _mm256_extracti128_si256( verror , 1 ) ); xerror = _mm_hadd_epi32( xerror, xerror ); xerror = _mm_hadd_epi32( xerror, xerror ); error = _mm_cvtsi128_si32( xerror ); diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index b12e20193..8af2c8a3a 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -1580,11 +1580,17 @@ void EncGOP::xUpdateRateCap() auto pic = *it; if( pic->isReconstructed ) { + const unsigned uibits = pic->sliceDataStreams[0].getNumberOfWrittenBits(); + if( !pic->gopEntry->m_isStartOfIntra && pic->gopEntry->m_scType == SCT_NONE ) { - const unsigned uibits = pic->sliceDataStreams[0].getNumberOfWrittenBits(); xUpdateRateCapBits( pic, uibits ); } + else if( pic->gopEntry->m_isStartOfIntra && pic->gopEntry->m_gopNum == 0 && pic->poc < m_pcEncCfg->m_GOPSize && m_rcap.accumTargetBits * (uint32_t) m_pcEncCfg->m_GOPSize < uibits ) + { + m_rcap.accumActualBits += uibits - m_rcap.accumTargetBits * (uint32_t) m_pcEncCfg->m_GOPSize; // capped CQF: compensate for overspending in first I-frame + } + it = m_rcUpdateList.erase( it ); } else @@ -1708,6 +1714,10 @@ void EncGOP::xInitGopQpCascade( Picture& keyPic, const PicList& picList ) const unsigned bFrmBC_final = bFrmBC * nonKeyPicsFactor; const unsigned targetBits = (unsigned)( (bFrmBC + (intraP >> 1)) / (intraP - 1) ); m_rcap.accumTargetBits += targetBits; + if (keyPic.gopEntry->m_isStartOfIntra && keyPic.gopEntry->m_gopNum == 0 && keyPic.poc < m_pcEncCfg->m_GOPSize && m_rcap.accumTargetBits * (int64_t) intraP < iFrmBC) + { + m_rcap.accumTargetBits = (iFrmBC + (intraP >> 1)) / intraP; + } m_rcap.nonRateCapEstim = 1.0; // changed in case of capping m_rcap.gopAdaptedQPAdj = 0; // changed in first GOP of scene diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index e1e376824..eaea36aaa 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -121,6 +121,11 @@ const std::vector> PresetToEnumMap = { "medium", vvencPresetMode::VVENC_MEDIUM }, { "slow", vvencPresetMode::VVENC_SLOW }, { "slower", vvencPresetMode::VVENC_SLOWER }, + { "0", vvencPresetMode::VVENC_FASTER }, + { "1", vvencPresetMode::VVENC_FAST }, + { "2", vvencPresetMode::VVENC_MEDIUM }, + { "3", vvencPresetMode::VVENC_SLOW }, + { "4", vvencPresetMode::VVENC_SLOWER }, { "medium_lowDecEnergy", vvencPresetMode::VVENC_MEDIUM_LOWDECNRG }, { "medium_lowdecenergy", vvencPresetMode::VVENC_MEDIUM_LOWDECNRG }, { "firstpass", vvencPresetMode::VVENC_FIRSTPASS },