From 8594a0f64a8c552fb640a2b33da7e592641f7c1e Mon Sep 17 00:00:00 2001
From: Adam Wieckowski <70575289+adamjw24@users.noreply.github.com>
Date: Thu, 6 Jun 2024 13:34:02 +0200
Subject: [PATCH] Prepare v1.12.0 (#385)

* Added changelog for v1.12.0-rc1, minor changes

* Update version to v1.12.0-rc1
---
 CMakeLists.txt                     |  4 ++--
 changelog.txt                      | 38 ++++++++++++++++++++++++++++++
 source/Lib/CommonLib/x86/MCTFX86.h | 14 ++++-------
 source/Lib/EncoderLib/EncGOP.cpp   | 12 +++++++++-
 source/Lib/apputils/VVEncAppCfg.h  |  5 ++++
 5 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb2e5cf95..c032ef1c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,10 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.13.0 )
 endif()
 
 # project name
-project( vvenc VERSION 1.11.1 )
+project( vvenc VERSION 1.12.0 )
 
 # set alternative version numbering for release candidates
-#set( PROJECT_VERSION_RC rc1 )
+set( PROJECT_VERSION_RC rc1 )
 if( PROJECT_VERSION_RC )
     set( PROJECT_VERSION "${PROJECT_VERSION}-${PROJECT_VERSION_RC}" )
 endif()
diff --git a/changelog.txt b/changelog.txt
index 72f45f948..ebfcf51cd 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,41 @@
+/////////////////////////////////////////
+tag 1.12.0-rc1
+
+* libvvenc:
+  - added parameters:
+    - vvenc_config::m_mtProfile: defines how aggressively to boost multi-threading (0...3, where 0 is
+      the current multi-threading behavior and 3 uses all of WPP, IFP and tiles; use -1 for auto
+      behavior depending on resolution and the number of threads used)
+  - changed parameters (vvenc_config::):
+    - m_numTileCols and m_numTileRows from uint32_t to int32_t (-1 for auto behavior, see m_mtProfile)
+    - m_ifp and m_entropyCodingSynchEnabled from bool to int8_t (-1 for auto behavior, see m_mtProfile)
+  - enabled capped constant quality factor (CQF) mode
+    - constant quality mode (vvenc_config::m_QP set and ::m_usePerceptQPA enabled) combined with rate
+      capping at an absolute limit (::m_RCMaxBitrate)
+  - minor (~5%) speedups to faster preset
+  - improvements to low-delay encoding configuration
+  - other minor optimizations, cleanups and bugfixes
+
+* vvencFFapp:
+  - added parameters:
+    - MTProfile: set the multi-threading profile, from 0 (current default) to 3 (use all MT optimizers);
+      use -1/auto to set it automatically depending on resolution and number of threads used
+  - changed parameters:
+    - WaveFrontSynchro, IFP, Tiles: default value set to -1, with the behavior defined by MTProfile
+      parameter, resolution and number of used threads
+
+* vvencapp:
+  - added parameters:
+    - mtprofile: set the multi-threading profile, from 0 (current default) to 3 (use all MT optimizers);
+                 use -1/auto to set it automatically depending on resolution and number of threads used
+    - internal-bitdepth: sets internal encoding bitdepth, i.e. the output bitstream bitdepth
+    - refreshtype: specifies the refresh-type used at random access points
+    - decodedpicturehash: enable/disable decoded picture hash (DPH) SEI generation
+  - changed parameters:
+    - ifp, tiles: default value set to -1, with the behavior defined by mtprofile parameter, resolution
+                  and number of used threads
+    - format: now accepts yuv400 and yuv400_10 as parameter for monochrome input
+
 /////////////////////////////////////////
 tag 1.11.1
 
diff --git a/source/Lib/CommonLib/x86/MCTFX86.h b/source/Lib/CommonLib/x86/MCTFX86.h
index 6a36acd91..7f8c701e7 100644
--- a/source/Lib/CommonLib/x86/MCTFX86.h
+++ b/source/Lib/CommonLib/x86/MCTFX86.h
@@ -461,6 +461,7 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride,
     const int yOffset    = -1;
     const Pel* sourceCol = buf + base + yOffset * buffStride;
     const Pel* origCol   = org;
+    __m256i verror = _mm256_setzero_si256();
 
     for( int x1 = 0; x1 < w; x1 += 16, sourceCol += 16, origCol += 16 )
     {
@@ -531,15 +532,7 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride,
           xsum = _mm256_sub_epi16( xsum, xorg );
           xsum = _mm256_madd_epi16( xsum, xsum );
 
-          __m128i
-          ysum = _mm_add_epi32( _mm256_castsi256_si128( xsum ), _mm256_extracti128_si256( xsum, 1 ) );
-          xerror = _mm_hadd_epi32( xerror, ysum );
-          error = _mm_cvtsi128_si32( xerror );
-
-          if( error > besterror )
-          {
-            return error;
-          }
+          verror = _mm256_add_epi32( verror, xsum );
         }
         else
         {
@@ -551,7 +544,8 @@ int motionErrorLumaFrac_loRes_SIMD( const Pel* org, const ptrdiff_t origStride,
     }
 
     GCC_WARNING_RESET
-    
+
+    xerror = _mm_add_epi32( _mm256_castsi256_si128( verror ), _mm256_extracti128_si256( verror , 1 ) );
     xerror = _mm_hadd_epi32( xerror, xerror );
     xerror = _mm_hadd_epi32( xerror, xerror );
     error  = _mm_cvtsi128_si32( xerror );
diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp
index b12e20193..8af2c8a3a 100644
--- a/source/Lib/EncoderLib/EncGOP.cpp
+++ b/source/Lib/EncoderLib/EncGOP.cpp
@@ -1580,11 +1580,17 @@ void EncGOP::xUpdateRateCap()
     auto pic = *it;
     if( pic->isReconstructed )
     {
+      const unsigned uibits = pic->sliceDataStreams[0].getNumberOfWrittenBits();
+
       if( !pic->gopEntry->m_isStartOfIntra && pic->gopEntry->m_scType == SCT_NONE )
       {
-        const unsigned uibits = pic->sliceDataStreams[0].getNumberOfWrittenBits();
         xUpdateRateCapBits( pic, uibits );
       }
+      else if( pic->gopEntry->m_isStartOfIntra && pic->gopEntry->m_gopNum == 0 && pic->poc < m_pcEncCfg->m_GOPSize && m_rcap.accumTargetBits * (uint32_t) m_pcEncCfg->m_GOPSize < uibits )
+      {
+        m_rcap.accumActualBits += uibits - m_rcap.accumTargetBits * (uint32_t) m_pcEncCfg->m_GOPSize; // capped CQF: compensate for overspending in first I-frame
+      }
+
       it = m_rcUpdateList.erase( it );
     }
     else
@@ -1708,6 +1714,10 @@ void EncGOP::xInitGopQpCascade( Picture& keyPic, const PicList& picList )
   const unsigned bFrmBC_final   = bFrmBC * nonKeyPicsFactor;
   const unsigned targetBits     = (unsigned)( (bFrmBC + (intraP >> 1)) / (intraP - 1) );
   m_rcap.accumTargetBits += targetBits;
+  if (keyPic.gopEntry->m_isStartOfIntra && keyPic.gopEntry->m_gopNum == 0 && keyPic.poc < m_pcEncCfg->m_GOPSize && m_rcap.accumTargetBits * (int64_t) intraP < iFrmBC)
+  {
+    m_rcap.accumTargetBits = (iFrmBC + (intraP >> 1)) / intraP;
+  }
   m_rcap.nonRateCapEstim = 1.0;     // changed in case of capping
   m_rcap.gopAdaptedQPAdj = 0;       // changed in first GOP of scene
 
diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h
index e1e376824..eaea36aaa 100644
--- a/source/Lib/apputils/VVEncAppCfg.h
+++ b/source/Lib/apputils/VVEncAppCfg.h
@@ -121,6 +121,11 @@ const std::vector<SVPair<vvencPresetMode>> PresetToEnumMap =
   { "medium",    vvencPresetMode::VVENC_MEDIUM },
   { "slow",      vvencPresetMode::VVENC_SLOW },
   { "slower",    vvencPresetMode::VVENC_SLOWER },
+  { "0",         vvencPresetMode::VVENC_FASTER },
+  { "1",         vvencPresetMode::VVENC_FAST },
+  { "2",         vvencPresetMode::VVENC_MEDIUM },
+  { "3",         vvencPresetMode::VVENC_SLOW },
+  { "4",         vvencPresetMode::VVENC_SLOWER },
   { "medium_lowDecEnergy", vvencPresetMode::VVENC_MEDIUM_LOWDECNRG },
   { "medium_lowdecenergy", vvencPresetMode::VVENC_MEDIUM_LOWDECNRG },
   { "firstpass", vvencPresetMode::VVENC_FIRSTPASS },