From 38672426d53d190cc756e3792efb31da684f5611 Mon Sep 17 00:00:00 2001
From: Jens Brandenburg <jens.brandenburg@hhi.fraunhofer.de>
Date: Tue, 22 Dec 2020 11:21:20 +0100
Subject: [PATCH] merged develop v0.2.1.0

---
 CMakeLists.txt                           |  2 +-
 README.md                                |  7 ++
 changelog.txt                            | 13 +++-
 source/Lib/CommonLib/CodingStructure.cpp | 84 +++++++++++++-----------
 source/Lib/CommonLib/CodingStructure.h   |  2 +
 source/Lib/CommonLib/Picture.cpp         |  8 ++-
 source/Lib/CommonLib/TrQuant.cpp         | 21 +++---
 source/Lib/CommonLib/x86/QuantX86.h      |  7 +-
 source/Lib/DecoderLib/DecLib.cpp         | 12 +++-
 source/Lib/EncoderLib/BitAllocation.cpp  | 11 ++--
 source/Lib/EncoderLib/BitAllocation.h    |  2 +-
 source/Lib/EncoderLib/EncLib.cpp         |  8 ---
 source/Lib/EncoderLib/EncPicture.cpp     | 10 ++-
 source/Lib/EncoderLib/EncSlice.cpp       |  2 +
 source/Lib/EncoderLib/RateCtrl.cpp       | 15 ++---
 source/Lib/EncoderLib/RateCtrl.h         |  2 +-
 source/Lib/vvenc/EncCfg.cpp              |  2 +-
 17 files changed, 124 insertions(+), 84 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 072a5744a..9f95c25fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.12.0 )
 endif()
 
 # project name
-project( vvenc VERSION 0.2.0.0 )
+project( vvenc VERSION 0.2.1.0 )
 
 if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR )
   # enable sse4.1 build for all source files for gcc and clang
diff --git a/README.md b/README.md
index 31e042bd0..67465c7ed 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,13 @@ The expert mode encoder (**vvencFFapp**) is based on the [VVC test model (VTM)](
 
     vvencFFapp -c randomaccess_medium.cfg -c sequence.cfg
 
+# Contributing
+
+Feel free to contribute. To do so:
+
+* Fork the most recent state of the master branch
+* Apply the desired changes
+* Create a pull request against the upstream repository
 
 # License
 
diff --git a/changelog.txt b/changelog.txt
index 41b6a841c..32b62e84c 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,14 @@
+/////////////////////////////////////////
+tag 0.2.1.0
+
+* bugfixes:
+ - ISP fix: erroneously placed sanity check
+
+* libvvenc:
+ - decrease memory consumption
+ - harmonize 2-pass rate control and perceptual QPA
+ - improve rate control
+
 /////////////////////////////////////////
 tag 0.2.0.0
 
@@ -22,7 +33,7 @@ tag 0.2.0.0
  - various memory reductions (Rom.cpp, scaling list memory)
  - verious optimizations (SIMD for MCTF, forward transformation, single column IF; memory accesses for DMVR)
  - changed MCTF algorithm to do intermediate rounding between hor/ver filtering
- 
+
 * vvencapp:
  - new parameter      --refreshsec,-rs to define the intra refresh rate in seconds depending on the given frame rate.
                        Internally, the refresh rate in seconds is translated into the frames where the refresh is set.
diff --git a/source/Lib/CommonLib/CodingStructure.cpp b/source/Lib/CommonLib/CodingStructure.cpp
index f03924dc8..5556b8ac8 100644
--- a/source/Lib/CommonLib/CodingStructure.cpp
+++ b/source/Lib/CommonLib/CodingStructure.cpp
@@ -122,24 +122,10 @@ void CodingStructure::destroy()
 
   destroyCoeffs();
 
-  for( uint32_t i = 0; i < MAX_NUM_CH; i++ )
-  {
-    delete[] m_cuPtr[ i ];
-    m_cuPtr[ i ] = nullptr;
-
-    delete[] m_tuPtr[ i ];
-    m_tuPtr[ i ] = nullptr;
-  }
-
-  for( int i = 0; i < NUM_EDGE_DIR; i++ )
-  {
-    xFree( m_lfParam[ i ] );
-    m_lfParam[ i ] = nullptr;
-  }
-
   delete[] m_motionBuf;
   m_motionBuf = nullptr;
 
+  destroyTempBuffers();
 
   if ( m_unitCacheMutex ) m_unitCacheMutex->lock();
 
@@ -650,17 +636,34 @@ void CodingStructure::createInternals( const UnitArea& _unit, const bool isTopLa
   parent  = nullptr;
   refCS   = nullptr;
 
-  unsigned numCh = getNumberValidChannels(area.chromaFormat);
+  unsigned _lumaAreaScaled = g_miScaling.scale( area.lumaSize() ).area();
+  m_motionBuf = new MotionInfo[_lumaAreaScaled];
 
-  for (unsigned i = 0; i < numCh; i++)
+  if( isTopLayer )
   {
-    Size allocArea = area.blocks[i].size();
-    m_mapSize[i] = unitScale[i].scale(allocArea);
+    motionLutBuf.resize( pcv->heightInCtus );
+  }
+  else
+  {
+    createCoeffs();
+    createTempBuffers( false );
+    initStructData();
+  }
+}
 
-    unsigned _area = unitScale[i].scale( area.blocks[i].size() ).area();
+void CodingStructure::createTempBuffers( const bool isTopLayer )
+{
+  unsigned numCh = getNumberValidChannels( area.chromaFormat );
+
+  for( unsigned i = 0; i < numCh; i++ )
+  {
+    Size allocArea  = area.blocks[i].size();
+    m_mapSize[i]    = unitScale[i].scale(allocArea);
 
-    m_cuPtr[i]    = _area > 0 ? new CodingUnit*    [_area] : nullptr;
-    m_tuPtr[i]    = _area > 0 ? new TransformUnit* [_area] : nullptr;
+    unsigned _area  = unitScale[i].scale( area.blocks[i].size() ).area();
+
+    m_cuPtr[i]      = _area > 0 ? new CodingUnit*    [_area] : nullptr;
+    m_tuPtr[i]      = _area > 0 ? new TransformUnit* [_area] : nullptr;
   }
 
   for( unsigned i = 0; i < NUM_EDGE_DIR; i++ )
@@ -668,29 +671,29 @@ void CodingStructure::createInternals( const UnitArea& _unit, const bool isTopLa
     m_lfParam[i] = ( isTopLayer && m_mapSize[0].area() > 0 ) ? ( LoopFilterParam* ) xMalloc( LoopFilterParam, m_mapSize[0].area() ) : nullptr;
   }
 
-  numCh = getNumberValidComponents(area.chromaFormat);
+  unsigned _maxNumDmvrMvs = ( area.lwidth() >> 3 ) * ( area.lheight() >> 3 );
+  m_dmvrMvCache.resize( _maxNumDmvrMvs );
+}
 
-  for (unsigned i = 0; i < numCh; i++)
+void CodingStructure::destroyTempBuffers()
+{
+  for( uint32_t i = 0; i < MAX_NUM_CH; i++ )
   {
-    m_offsets[i] = 0;
+    delete[] m_cuPtr[i];
+    m_cuPtr[i] = nullptr;
+
+    delete[] m_tuPtr[i];
+    m_tuPtr[i] = nullptr;
   }
 
-  if( isTopLayer )
+  for( int i = 0; i < NUM_EDGE_DIR; i++ )
   {
-    motionLutBuf.resize( pcv->heightInCtus );
+    xFree( m_lfParam[i] );
+    m_lfParam[i] = nullptr;
   }
-  else
-  {
-    createCoeffs();
-  }
-
-  unsigned _lumaAreaScaled = g_miScaling.scale( area.lumaSize() ).area();
-  m_motionBuf       = new MotionInfo[_lumaAreaScaled];
-
-  unsigned _maxNumDmvrMvs = ( area.lwidth() >> 3 ) * ( area.lheight() >> 3 );
-  m_dmvrMvCache.resize( _maxNumDmvrMvs );
 
-  initStructData();
+  // swap with an empty temporary vector so that the cache's memory is released
+  std::vector<Mv>().swap( m_dmvrMvCache );
 }
 
 void CodingStructure::addMiToLut(static_vector<HPMVInfo, MAX_NUM_HMVP_CANDS> &lut, const HPMVInfo &mi)
@@ -739,6 +742,11 @@ void CodingStructure::createCoeffs()
 
     m_coeffs[i] = _area > 0 ? ( TCoeff* ) xMalloc( TCoeff, _area ) : nullptr;
   }
+
+  for( unsigned i = 0; i < numComp; i++ )
+  {
+    m_offsets[i] = 0;
+  }
 }
 
 void CodingStructure::destroyCoeffs()
diff --git a/source/Lib/CommonLib/CodingStructure.h b/source/Lib/CommonLib/CodingStructure.h
index a85176efc..84e63e7d6 100644
--- a/source/Lib/CommonLib/CodingStructure.h
+++ b/source/Lib/CommonLib/CodingStructure.h
@@ -176,6 +176,8 @@ class CodingStructure
   void clearCUs();
   const int signalModeCons( const PartSplit split, Partitioner &partitioner, const ModeType modeTypeParent ) const;
 
+  void createTempBuffers( const bool isTopLayer );
+  void destroyTempBuffers();
 private:
   void createInternals(const UnitArea& _unit, const bool isTopLayer);
 
diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp
index e15ff7faf..980b9d3af 100644
--- a/source/Lib/CommonLib/Picture.cpp
+++ b/source/Lib/CommonLib/Picture.cpp
@@ -187,7 +187,6 @@ void Picture::create( ChromaFormat _chromaFormat, const Size& size, unsigned _ma
   margin            =  _margin;
   const Area a      = Area( Position(), size );
   m_bufs[ PIC_RECONSTRUCTION ].create( _chromaFormat, a, _maxCUSize, _margin, MEMORY_ALIGN_DEF_SIZE );
-  m_bufs[ PIC_SAO_TEMP ].create( _chromaFormat, a, _maxCUSize, 0, MEMORY_ALIGN_DEF_SIZE );
 
   if( _decoder )
   {
@@ -230,11 +229,17 @@ void Picture::destroy()
 
 void Picture::createTempBuffers( unsigned _maxCUSize )
 {
+  CHECK( !cs, "Coding structure is required a this point!" );
+
+  m_bufs[PIC_SAO_TEMP].create( chromaFormat, Y(), cs->pcv->maxCUSize, 0, MEMORY_ALIGN_DEF_SIZE );
+
   if( cs ) cs->rebindPicBufs();
 }
 
 void Picture::destroyTempBuffers()
 {
+  m_bufs[PIC_SAO_TEMP].destroy();
+
   if( cs ) cs->rebindPicBufs();
 }
 
@@ -262,7 +267,6 @@ void Picture::finalInit( const VPS& _vps, const SPS& sps, const PPS& pps, PicHea
 
   if( cs )
   {
-    cs->initStructData();
     CHECK( cs->sps != &sps, "picture initialization error: sps changed" );
     CHECK( cs->vps != &_vps, "picture initialization error: vps changed" );
   }
diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
index e33f3d7a6..5f21057b4 100644
--- a/source/Lib/CommonLib/TrQuant.cpp
+++ b/source/Lib/CommonLib/TrQuant.cpp
@@ -501,19 +501,20 @@ void TrQuant::xT( const TransformUnit& tu, const ComponentID compID, const CPelB
   }
 #endif //ENABLE_SIMD_TRAFO
 
-  const int      shift_1st              = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
-  const int      shift_2nd              =  (Log2(height))            + TRANSFORM_MATRIX_SHIFT                          + COM16_C806_TRANS_PREC;
-  CHECK( shift_1st < 0, "Negative shift" );
-  CHECK( shift_2nd < 0, "Negative shift" );
-
   if (width > 1 && height > 1)
   {
+    const int shift_1st = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
+    const int shift_2nd =  (Log2(height))            + TRANSFORM_MATRIX_SHIFT                          + COM16_C806_TRANS_PREC;
+    CHECK( shift_1st < 0, "Negative shift" );
+    CHECK( shift_2nd < 0, "Negative shift" );
     fastFwdTrans[trTypeHor][transformWidthIndex](block, tmp, shift_1st, height, 0, skipWidth);
     fastFwdTrans[trTypeVer][transformHeightIndex](tmp, dstCoeff.buf, shift_2nd, width, skipWidth, skipHeight);
   }
   else if (height == 1)   // 1-D horizontal transform
   {
-    fastFwdTrans[trTypeHor][transformWidthIndex](block, dstCoeff.buf, shift_1st, 1, 0, skipWidth);
+    const int shift = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
+    CHECK( shift < 0, "Negative shift" );
+    fastFwdTrans[trTypeHor][transformWidthIndex](block, dstCoeff.buf, shift, 1, 0, skipWidth);
   }
   else   // if (iWidth == 1) //1-D vertical transform
   {
@@ -562,14 +563,14 @@ void TrQuant::xIT( const TransformUnit& tu, const ComponentID compID, const CCoe
     }
   }
 
-  const int      shift_1st              =   TRANSFORM_MATRIX_SHIFT + 1 + COM16_C806_TRANS_PREC; // 1 has been added to shift_1st at the expense of shift_2nd
-  const int      shift_2nd              = ( TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1 ) - bitDepth + COM16_C806_TRANS_PREC;
-  CHECK( shift_1st < 0, "Negative shift" );
-  CHECK( shift_2nd < 0, "Negative shift" );
   TCoeff *block = m_blk;
   TCoeff *tmp   = m_tmp;
   if (width > 1 && height > 1)   // 2-D transform
   {
+    const int shift_1st =   TRANSFORM_MATRIX_SHIFT + 1 + COM16_C806_TRANS_PREC; // 1 has been added to shift_1st at the expense of shift_2nd
+    const int shift_2nd = ( TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1 ) - bitDepth + COM16_C806_TRANS_PREC;
+    CHECK( shift_1st < 0, "Negative shift" );
+    CHECK( shift_2nd < 0, "Negative shift" );
     fastInvTrans[trTypeVer][transformHeightIndex](pCoeff.buf, tmp, shift_1st, width, skipWidth, skipHeight, clipMinimum, clipMaximum);
     fastInvTrans[trTypeHor][transformWidthIndex](tmp, block, shift_2nd, height, 0, skipWidth, clipMinimum, clipMaximum);
   }
diff --git a/source/Lib/CommonLib/x86/QuantX86.h b/source/Lib/CommonLib/x86/QuantX86.h
index 793b4b7d8..5e6e0ab6d 100644
--- a/source/Lib/CommonLib/x86/QuantX86.h
+++ b/source/Lib/CommonLib/x86/QuantX86.h
@@ -204,7 +204,7 @@ static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const
     {
       for( int y = 0; y <= maxY; y++)
       {
-        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
+        __m128i v_level = maxX == 1 ? _mm_loadl_epi64( (__m128i const*) & piQCoef[y * piQCfStride] ) : _mm_loadu_si128( (__m128i const*) & piQCoef[y * piQCfStride] );
         v_level = _mm_packs_epi32 (v_level,v_level);
         v_level = _mm_and_si128(v_level,vlevmask);
         v_level = _mm_max_epi16 (v_level, v_min);
@@ -218,7 +218,10 @@ static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const
 
         v_level = _mm_max_epi32 (v_level, v_Tmin);
         v_level = _mm_min_epi32 (v_level, v_Tmax);
-        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
+        if( maxX == 1 )
+          _mm_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
+        else
+          _mm_storeu_si128( (__m128i*)(piCoef + y * width), v_level );
       }
     }
     else
diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp
index de5d2b55e..8f1e16bf5 100644
--- a/source/Lib/DecoderLib/DecLib.cpp
+++ b/source/Lib/DecoderLib/DecLib.cpp
@@ -156,6 +156,11 @@ bool tryDecodePicture( Picture* pcEncPic, const int expectedPoc, const std::stri
               {
                 if( pic->poc == poc && (!bDecodeUntilPocFound || expectedPoc == poc ) )
                 {
+                  pcEncPic->createTempBuffers( pic->cs->pcv->maxCUSize );
+                  pcEncPic->cs->createCoeffs();
+                  pcEncPic->cs->createTempBuffers( true );
+                  pcEncPic->cs->initStructData();
+
                   CHECK( pcEncPic->slices.size() == 0, "at least one slice should be available" );
 
                   CHECK( expectedPoc != poc, "mismatch in POC - check encoder configuration" );
@@ -669,9 +674,10 @@ void DecLib::finishPicture(int& poc, PicList*& rpcListPic, MsgLevel msgl )
   m_maxDecSubPicIdx = 0;
   m_maxDecSliceAddrInSubPic = -1;
 
-  m_pic->destroyTempBuffers();
-  m_pic->cs->destroyCoeffs();
   m_pic->cs->releaseIntermediateData();
+  m_pic->cs->destroyTempBuffers();
+  m_pic->cs->destroyCoeffs();
+  m_pic->destroyTempBuffers();
   m_pic->cs->picHeader->initPicHeader();
 }
 
@@ -1007,6 +1013,8 @@ void DecLib::xActivateParameterSets( const int layerId)
 
     m_pic->createTempBuffers( m_pic->cs->pps->pcv->maxCUSize );
     m_pic->cs->createCoeffs();
+    m_pic->cs->createTempBuffers( true );
+    m_pic->cs->initStructData();
 
     m_pic->allocateNewSlice();
     // make the slice-pilot a real slice, and set up the slice-pilot for the next slice
diff --git a/source/Lib/EncoderLib/BitAllocation.cpp b/source/Lib/EncoderLib/BitAllocation.cpp
index 13628ebc5..161562521 100644
--- a/source/Lib/EncoderLib/BitAllocation.cpp
+++ b/source/Lib/EncoderLib/BitAllocation.cpp
@@ -339,7 +339,7 @@ int BitAllocation::applyQPAdaptationChroma (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr || optChromaQPOffset == nullptr || encCfg->m_usePerceptQPA > 4) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const int          bitDepth = slice->sps->bitDepths[CH_L];
 
@@ -394,6 +394,7 @@ int BitAllocation::applyQPAdaptationChroma (const Slice* slice, const EncCfg* en
 }
 
 int BitAllocation::applyQPAdaptationLuma (const Slice* slice, const EncCfg* encCfg, const int savedQP, const double lambda, std::vector<int>& ctuPumpRedQP,
+                                          const bool forceFrameWiseQPA,
                                           const uint32_t ctuStartAddr, const uint32_t ctuBoundingAddr, const bool isHDR /*= false*/)
 {
   Picture* const pic          = (slice != nullptr ? slice->pic : nullptr);
@@ -403,9 +404,9 @@ int BitAllocation::applyQPAdaptationLuma (const Slice* slice, const EncCfg* encC
 
   if (pic == nullptr || pic->cs == nullptr || encCfg == nullptr || ctuStartAddr >= ctuBoundingAddr) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
-  const bool useFrameWiseQPA  = (encCfg->m_QP > MAX_QP_PERCEPT_QPA);
+  const bool useFrameWiseQPA  = (encCfg->m_QP > MAX_QP_PERCEPT_QPA) || forceFrameWiseQPA;
   const int          bitDepth = slice->sps->bitDepths[CH_L];
   const int           sliceQP = (savedQP < 0 ? slice->sliceQp : savedQP);
   const PreCalcValues&    pcv = *pic->cs->pcv;
@@ -588,7 +589,7 @@ int BitAllocation::applyQPAdaptationSubCtu (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const int         bitDepth  = slice->sps->bitDepths[CH_L];
   const PosType     guardSize = (isHighResolution ? 2 : 1);
@@ -664,7 +665,7 @@ double BitAllocation::getPicVisualActivity (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr) return 0.0;
 
-  const bool isXPSNRQPA = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRQPA = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighRes  = ( encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280 ) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const CPelBuf picOrig = (origBuf != nullptr ? *origBuf : pic->getOrigBuf (COMP_Y));
   const CPelBuf picPrv1 = (isXPSNRQPA ? pic->getOrigBufPrev (COMP_Y, false) : picOrig);
diff --git a/source/Lib/EncoderLib/BitAllocation.h b/source/Lib/EncoderLib/BitAllocation.h
index e382230b3..43226644c 100644
--- a/source/Lib/EncoderLib/BitAllocation.h
+++ b/source/Lib/EncoderLib/BitAllocation.h
@@ -65,7 +65,7 @@ namespace vvenc {
                                  std::vector<int>& ctuPumpRedQP,
                                  int optChromaQPOffset[2], const bool isHDR = false);
     int applyQPAdaptationLuma   (const Slice* slice, const EncCfg* encCfg, const int savedQP, const double lambda,
-                                 std::vector<int>& ctuPumpRedQP,
+                                 std::vector<int>& ctuPumpRedQP, const bool forceFrameWiseQPA,
                                  const uint32_t ctuStartAddr, const uint32_t ctuBoundingAddr, const bool isHDR = false);
     int applyQPAdaptationSubCtu (const Slice* slice, const EncCfg* encCfg, const Area& lumaArea, const bool isHDR = false);
     int getCtuPumpingReducingQP (const Slice* slice, const CPelBuf& origY, const Distortion uiSadBestForQPA,
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index 6a7d2152f..97f0c239b 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -362,11 +362,6 @@ void EncLib::xSetRCEncCfg( int pass )
     // restore MCTF
     m_cBckCfg.m_MCTF              = mctf;
 
-    // configure QPA in the first pass
-    m_cBckCfg.m_usePerceptQPA                  = 0; // disable QPA in the first pass
-    m_cBckCfg.m_sliceChromaQpOffsetPeriodicity = 0;
-    m_cBckCfg.m_usePerceptQPATempFiltISlice    = 0;
-
     std::swap( const_cast<EncCfg&>(m_cEncCfg), m_cBckCfg );
   }
 }
@@ -603,9 +598,6 @@ void EncLib::xInitPicture( Picture& pic, int picNum, const PPS& pps, const SPS&
   pic.vps = &vps;
   pic.dci = &dci;
 
-  pic.createTempBuffers( pic.cs->pps->pcv->maxCUSize );
-  pic.cs->createCoeffs();
-
   // filter data initialization
   const uint32_t numberOfCtusInFrame = pic.cs->pcv->sizeInCtus;
 
diff --git a/source/Lib/EncoderLib/EncPicture.cpp b/source/Lib/EncoderLib/EncPicture.cpp
index 9ff166ace..6f7b22749 100644
--- a/source/Lib/EncoderLib/EncPicture.cpp
+++ b/source/Lib/EncoderLib/EncPicture.cpp
@@ -90,6 +90,11 @@ void EncPicture::encodePicture( Picture& pic, ParameterSetMap<APS>& shrdApsMap,
   // compress picture
   if ( pic.encPic )
   {
+    pic.createTempBuffers( pic.cs->pcv->maxCUSize );
+    pic.cs->createCoeffs();
+    pic.cs->createTempBuffers( true );
+    pic.cs->initStructData();
+
     xInitPicEncoder ( pic );
     gopEncoder.picInitRateControl( pic.gopId, pic, pic.slices[ 0 ] );
     xCompressPicture( pic );
@@ -113,9 +118,10 @@ void EncPicture::encodePicture( Picture& pic, ParameterSetMap<APS>& shrdApsMap,
     pic.picBlkStat.storeBlkSize( pic );
   }
   // cleanup
-  pic.destroyTempBuffers();
-  pic.cs->destroyCoeffs();
   pic.cs->releaseIntermediateData();
+  pic.cs->destroyTempBuffers();
+  pic.cs->destroyCoeffs();
+  pic.destroyTempBuffers();
 
   pic.encTime.stopTimer();
 
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index ac2ff03af..654e925b8 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -268,6 +268,7 @@ void EncSlice::xInitSliceLambdaQP( Slice* slice, int gopId )
   }
   if (m_pcEncCfg->m_usePerceptQPA)
   {
+    const bool rcIsFirstPassOf2 = (m_pcEncCfg->m_RCRateControlMode == 2 ? m_pcEncCfg->m_RCNumPasses == 2 && !m_pcRateCtrl->rcIsFinalPass : false);
     uint32_t  startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
     uint32_t  boundingCtuTsAddr = slice->pic->cs->pcv->sizeInCtus;
 
@@ -275,6 +276,7 @@ void EncSlice::xInitSliceLambdaQP( Slice* slice, int gopId )
     slice->pic->picInitialQP = iQP;
 
     if ((iQP = BitAllocation::applyQPAdaptationLuma (slice, m_pcEncCfg, adaptedLumaQP, dLambda, *m_CtuTaskRsrc[ 0 ]->m_encCu.getQpPtr(),
+                                                     rcIsFirstPassOf2,
                                                      startCtuTsAddr, boundingCtuTsAddr, m_pcEncCfg->m_usePerceptQPA > 2)) >= 0) // sets pic->ctuAdaptedQP[] & ctuQpaLambda[]
     {
       dLambda *= pow (2.0, ((double) iQP - dQP) / 3.0); // adjust lambda based on change of slice QP
diff --git a/source/Lib/EncoderLib/RateCtrl.cpp b/source/Lib/EncoderLib/RateCtrl.cpp
index b3369438f..43f060b08 100644
--- a/source/Lib/EncoderLib/RateCtrl.cpp
+++ b/source/Lib/EncoderLib/RateCtrl.cpp
@@ -304,7 +304,7 @@ void EncRCSeq::updateAfterPic ( int bits )
   framesLeft--;
 }
 
-void EncRCSeq::getTargetBitsFromFirstPass( int numPicCoded, int &targetBits, double &gopVsBitrateRatio, bool &isNewScene, double alpha[] )
+void EncRCSeq::getTargetBitsFromFirstPass( int numPicCoded, int &targetBits, double &gopVsBitrateRatio, double &frameVsGopRatio, bool &isNewScene, double alpha[] )
 {
   int picCounter = 0;
   int numOfLevels = int( log( gopSize ) / log( 2 ) + 0.5 ) + 2;
@@ -316,6 +316,7 @@ void EncRCSeq::getTargetBitsFromFirstPass( int numPicCoded, int &targetBits, dou
     {
       targetBits = it->targetBits;
       gopVsBitrateRatio = it->gopBitsVsBitrate;
+      frameVsGopRatio = it->frameInGopRatio;
       isNewScene = it->isNewScene;
       for ( int i = 0; i < numOfLevels; i++ )
       {
@@ -720,17 +721,11 @@ int EncRCPic::xEstPicTargetBits( EncRCSeq* encRcSeq, EncRCGOP* encRcGOP )
   if ( encRcSeq->twoPass )
   {
     double gopVsBitrateRatio = 1.0;
+    double frameVsGopRatio = 1.0;
     int tmpTargetBits = 0;
     double alpha[ 7 ] = { 0.0 };
-    encRcSeq->getTargetBitsFromFirstPass( encRcSeq->framesCoded, tmpTargetBits, gopVsBitrateRatio, isNewScene, alpha );
-    if ( currPicPosition == 0 || encRCSeq->framesLeft < encRcSeq->gopSize )
-    {
-      targetBits = int( ( encRcSeq->estimatedBitUsage - encRcSeq->bitsUsed ) * gopVsBitrateRatio + tmpTargetBits ); // calculate the difference of under/overspent bits and adjust the current target bits based on the gop ratio only for the first frame in the gop
-    }
-    else
-    {
-      targetBits = tmpTargetBits;
-    }
+    encRcSeq->getTargetBitsFromFirstPass( encRcSeq->framesCoded, tmpTargetBits, gopVsBitrateRatio, frameVsGopRatio, isNewScene, alpha );
+    targetBits = int( ( encRcSeq->estimatedBitUsage - encRcSeq->bitsUsed ) * gopVsBitrateRatio * frameVsGopRatio + tmpTargetBits ); // calculate the difference of under/overspent bits and adjust the current target bits based on the gop and frame ratio for every frame
 
     if ( encRcSeq->bitsUsed > 0 )
     {
diff --git a/source/Lib/EncoderLib/RateCtrl.h b/source/Lib/EncoderLib/RateCtrl.h
index 824049253..a30ccefe3 100644
--- a/source/Lib/EncoderLib/RateCtrl.h
+++ b/source/Lib/EncoderLib/RateCtrl.h
@@ -122,7 +122,7 @@ namespace vvenc {
     void setQpInGOP( int gopId, int gopQp, int &qp );
     bool isQpResetRequired( int gopId );
     int  getLeftAverageBits() { CHECK( !( framesLeft > 0 ), "No frames left" ); return (int)( bitsLeft / framesLeft ); }
-    void getTargetBitsFromFirstPass( int poc, int &targetBits, double &gopVsBitrateRatio, bool &isNewScene, double alpha[] );
+    void getTargetBitsFromFirstPass( int poc, int &targetBits, double &gopVsBitrateRatio, double &frameVsGopRatio, bool &isNewScene, double alpha[] );
 
   public:
     int             rcMode;
diff --git a/source/Lib/vvenc/EncCfg.cpp b/source/Lib/vvenc/EncCfg.cpp
index 9fc919532..b3629beac 100644
--- a/source/Lib/vvenc/EncCfg.cpp
+++ b/source/Lib/vvenc/EncCfg.cpp
@@ -1417,7 +1417,7 @@ bool EncCfg::initCfgParameter()
   }
 
   /// Experimental settings
-  checkExperimental( m_RCRateControlMode != 0 && m_RCNumPasses == 2 && m_usePerceptQPA != 0, "2-pass rate control with perceptually optimized QP-adaptation is experimental!" );
+  // checkExperimental( experimental combination of parameters, "Description!" );
 
   return( m_confirmFailed );
 }