Merge pull request #14 from fraunhoferhhi/develop_v0.2.1.0

develop v0.2.1.0
fraunhoferhhi · Dec 22, 2020 · 77af893 · 77af893
2 parents 58282d7 + 3867242
commit 77af893
Show file tree

Hide file tree

Showing 17 changed files with 124 additions and 84 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,7 +10,7 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.12.0 )
 endif()
 
 # project name
-project( vvenc VERSION 0.2.0.0 )
+project( vvenc VERSION 0.2.1.0 )
 
 if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR )
   # enable sse4.1 build for all source files for gcc and clang

diff --git a/README.md b/README.md
@@ -72,6 +72,13 @@ The expert mode encoder (**vvencFFapp**) is based on the [VVC test model (VTM)](
 
     vvencFFapp -c randomaccess_medium.cfg -c sequence.cfg
 
+# Contributing
+
+Feel free to contribute. To do so:
+
+* Fork the most recent state of the master branch
+* Apply the desired changes
+* Create a pull request against the upstream repository
 
 # License
 

diff --git a/changelog.txt b/changelog.txt
@@ -1,3 +1,14 @@
+/////////////////////////////////////////
+tag 0.2.1.0
+
+* bugfixes:
+ - ISP fix: erroneously placed sanity check
+
+* libvvenc:
+ - decrease memory consumption
+ - harmonize 2-pass rate control and perceptual QPA
+ - improve rate control
+
 /////////////////////////////////////////
 tag 0.2.0.0
 
@@ -22,7 +33,7 @@ tag 0.2.0.0
  - various memory reductions (Rom.cpp, scaling list memory)
  - various optimizations (SIMD for MCTF, forward transformation, single column IF; memory accesses for DMVR)
  - changed MCTF algorithm to do intermediate rounding between hor/ver filtering
- 
+
 * vvencapp:
  - new parameter      --refreshsec,-rs to define the intra refresh rate in seconds depending on the given frame rate.
                        Internally, the refresh rate in seconds is translated into the frames where the refresh is set.

diff --git a/source/Lib/CommonLib/CodingStructure.cpp b/source/Lib/CommonLib/CodingStructure.cpp
@@ -122,24 +122,10 @@ void CodingStructure::destroy()
 
   destroyCoeffs();
 
-  for( uint32_t i = 0; i < MAX_NUM_CH; i++ )
-  {
-    delete[] m_cuPtr[ i ];
-    m_cuPtr[ i ] = nullptr;
-
-    delete[] m_tuPtr[ i ];
-    m_tuPtr[ i ] = nullptr;
-  }
-
-  for( int i = 0; i < NUM_EDGE_DIR; i++ )
-  {
-    xFree( m_lfParam[ i ] );
-    m_lfParam[ i ] = nullptr;
-  }
-
   delete[] m_motionBuf;
   m_motionBuf = nullptr;
 
+  destroyTempBuffers();
 
   if ( m_unitCacheMutex ) m_unitCacheMutex->lock();
 
@@ -650,47 +636,64 @@ void CodingStructure::createInternals( const UnitArea& _unit, const bool isTopLa
   parent  = nullptr;
   refCS   = nullptr;
 
-  unsigned numCh = getNumberValidChannels(area.chromaFormat);
+  unsigned _lumaAreaScaled = g_miScaling.scale( area.lumaSize() ).area();
+  m_motionBuf = new MotionInfo[_lumaAreaScaled];
 
-  for (unsigned i = 0; i < numCh; i++)
+  if( isTopLayer )
   {
-    Size allocArea = area.blocks[i].size();
-    m_mapSize[i] = unitScale[i].scale(allocArea);
+    motionLutBuf.resize( pcv->heightInCtus );
+  }
+  else
+  {
+    createCoeffs();
+    createTempBuffers( false );
+    initStructData();
+  }
+}
 
-    unsigned _area = unitScale[i].scale( area.blocks[i].size() ).area();
+void CodingStructure::createTempBuffers( const bool isTopLayer )
+{
+  unsigned numCh = getNumberValidChannels( area.chromaFormat );
+
+  for( unsigned i = 0; i < numCh; i++ )
+  {
+    Size allocArea  = area.blocks[i].size();
+    m_mapSize[i]    = unitScale[i].scale(allocArea);
 
-    m_cuPtr[i]    = _area > 0 ? new CodingUnit*    [_area] : nullptr;
-    m_tuPtr[i]    = _area > 0 ? new TransformUnit* [_area] : nullptr;
+    unsigned _area  = unitScale[i].scale( area.blocks[i].size() ).area();
+
+    m_cuPtr[i]      = _area > 0 ? new CodingUnit*    [_area] : nullptr;
+    m_tuPtr[i]      = _area > 0 ? new TransformUnit* [_area] : nullptr;
   }
 
   for( unsigned i = 0; i < NUM_EDGE_DIR; i++ )
   {
     m_lfParam[i] = ( isTopLayer && m_mapSize[0].area() > 0 ) ? ( LoopFilterParam* ) xMalloc( LoopFilterParam, m_mapSize[0].area() ) : nullptr;
   }
 
-  numCh = getNumberValidComponents(area.chromaFormat);
+  unsigned _maxNumDmvrMvs = ( area.lwidth() >> 3 ) * ( area.lheight() >> 3 );
+  m_dmvrMvCache.resize( _maxNumDmvrMvs );
+}
 
-  for (unsigned i = 0; i < numCh; i++)
+void CodingStructure::destroyTempBuffers()
+{
+  for( uint32_t i = 0; i < MAX_NUM_CH; i++ )
   {
-    m_offsets[i] = 0;
+    delete[] m_cuPtr[i];
+    m_cuPtr[i] = nullptr;
+
+    delete[] m_tuPtr[i];
+    m_tuPtr[i] = nullptr;
   }
 
-  if( isTopLayer )
+  for( int i = 0; i < NUM_EDGE_DIR; i++ )
   {
-    motionLutBuf.resize( pcv->heightInCtus );
+    xFree( m_lfParam[i] );
+    m_lfParam[i] = nullptr;
   }
-  else
-  {
-    createCoeffs();
-  }
-
-  unsigned _lumaAreaScaled = g_miScaling.scale( area.lumaSize() ).area();
-  m_motionBuf       = new MotionInfo[_lumaAreaScaled];
-
-  unsigned _maxNumDmvrMvs = ( area.lwidth() >> 3 ) * ( area.lheight() >> 3 );
-  m_dmvrMvCache.resize( _maxNumDmvrMvs );
 
-  initStructData();
+  // swap the contents of the vector with an empty one so that its memory is released
+  std::vector<Mv>().swap( m_dmvrMvCache );
 }
 
 void CodingStructure::addMiToLut(static_vector<HPMVInfo, MAX_NUM_HMVP_CANDS> &lut, const HPMVInfo &mi)
@@ -739,6 +742,11 @@ void CodingStructure::createCoeffs()
 
     m_coeffs[i] = _area > 0 ? ( TCoeff* ) xMalloc( TCoeff, _area ) : nullptr;
   }
+
+  for( unsigned i = 0; i < numComp; i++ )
+  {
+    m_offsets[i] = 0;
+  }
 }
 
 void CodingStructure::destroyCoeffs()

diff --git a/source/Lib/CommonLib/CodingStructure.h b/source/Lib/CommonLib/CodingStructure.h
@@ -176,6 +176,8 @@ class CodingStructure
   void clearCUs();
   const int signalModeCons( const PartSplit split, Partitioner &partitioner, const ModeType modeTypeParent ) const;
 
+  void createTempBuffers( const bool isTopLayer );
+  void destroyTempBuffers();
 private:
   void createInternals(const UnitArea& _unit, const bool isTopLayer);
 

diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp
@@ -187,7 +187,6 @@ void Picture::create( ChromaFormat _chromaFormat, const Size& size, unsigned _ma
   margin            =  _margin;
   const Area a      = Area( Position(), size );
   m_bufs[ PIC_RECONSTRUCTION ].create( _chromaFormat, a, _maxCUSize, _margin, MEMORY_ALIGN_DEF_SIZE );
-  m_bufs[ PIC_SAO_TEMP ].create( _chromaFormat, a, _maxCUSize, 0, MEMORY_ALIGN_DEF_SIZE );
 
   if( _decoder )
   {
@@ -230,11 +229,17 @@ void Picture::destroy()
 
 void Picture::createTempBuffers( unsigned _maxCUSize )
 {
+  CHECK( !cs, "Coding structure is required a this point!" );
+
+  m_bufs[PIC_SAO_TEMP].create( chromaFormat, Y(), cs->pcv->maxCUSize, 0, MEMORY_ALIGN_DEF_SIZE );
+
   if( cs ) cs->rebindPicBufs();
 }
 
 void Picture::destroyTempBuffers()
 {
+  m_bufs[PIC_SAO_TEMP].destroy();
+
   if( cs ) cs->rebindPicBufs();
 }
 
@@ -262,7 +267,6 @@ void Picture::finalInit( const VPS& _vps, const SPS& sps, const PPS& pps, PicHea
 
   if( cs )
   {
-    cs->initStructData();
     CHECK( cs->sps != &sps, "picture initialization error: sps changed" );
     CHECK( cs->vps != &_vps, "picture initialization error: vps changed" );
   }

diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
@@ -501,19 +501,20 @@ void TrQuant::xT( const TransformUnit& tu, const ComponentID compID, const CPelB
   }
 #endif //ENABLE_SIMD_TRAFO
 
-  const int      shift_1st              = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
-  const int      shift_2nd              =  (Log2(height))            + TRANSFORM_MATRIX_SHIFT                          + COM16_C806_TRANS_PREC;
-  CHECK( shift_1st < 0, "Negative shift" );
-  CHECK( shift_2nd < 0, "Negative shift" );
-
   if (width > 1 && height > 1)
   {
+    const int shift_1st = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
+    const int shift_2nd =  (Log2(height))            + TRANSFORM_MATRIX_SHIFT                          + COM16_C806_TRANS_PREC;
+    CHECK( shift_1st < 0, "Negative shift" );
+    CHECK( shift_2nd < 0, "Negative shift" );
     fastFwdTrans[trTypeHor][transformWidthIndex](block, tmp, shift_1st, height, 0, skipWidth);
     fastFwdTrans[trTypeVer][transformHeightIndex](tmp, dstCoeff.buf, shift_2nd, width, skipWidth, skipHeight);
   }
   else if (height == 1)   // 1-D horizontal transform
   {
-    fastFwdTrans[trTypeHor][transformWidthIndex](block, dstCoeff.buf, shift_1st, 1, 0, skipWidth);
+    const int shift = ((Log2(width )) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange + COM16_C806_TRANS_PREC;
+    CHECK( shift < 0, "Negative shift" );
+    fastFwdTrans[trTypeHor][transformWidthIndex](block, dstCoeff.buf, shift, 1, 0, skipWidth);
   }
   else   // if (iWidth == 1) //1-D vertical transform
   {
@@ -562,14 +563,14 @@ void TrQuant::xIT( const TransformUnit& tu, const ComponentID compID, const CCoe
     }
   }
 
-  const int      shift_1st              =   TRANSFORM_MATRIX_SHIFT + 1 + COM16_C806_TRANS_PREC; // 1 has been added to shift_1st at the expense of shift_2nd
-  const int      shift_2nd              = ( TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1 ) - bitDepth + COM16_C806_TRANS_PREC;
-  CHECK( shift_1st < 0, "Negative shift" );
-  CHECK( shift_2nd < 0, "Negative shift" );
   TCoeff *block = m_blk;
   TCoeff *tmp   = m_tmp;
   if (width > 1 && height > 1)   // 2-D transform
   {
+    const int shift_1st =   TRANSFORM_MATRIX_SHIFT + 1 + COM16_C806_TRANS_PREC; // 1 has been added to shift_1st at the expense of shift_2nd
+    const int shift_2nd = ( TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1 ) - bitDepth + COM16_C806_TRANS_PREC;
+    CHECK( shift_1st < 0, "Negative shift" );
+    CHECK( shift_2nd < 0, "Negative shift" );
     fastInvTrans[trTypeVer][transformHeightIndex](pCoeff.buf, tmp, shift_1st, width, skipWidth, skipHeight, clipMinimum, clipMaximum);
     fastInvTrans[trTypeHor][transformWidthIndex](tmp, block, shift_2nd, height, 0, skipWidth, clipMinimum, clipMaximum);
   }

diff --git a/source/Lib/CommonLib/x86/QuantX86.h b/source/Lib/CommonLib/x86/QuantX86.h
@@ -204,7 +204,7 @@ static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const
     {
       for( int y = 0; y <= maxY; y++)
       {
-        __m128i v_level = _mm_loadu_si128( ( __m128i const * )&piQCoef[y * piQCfStride]  );
+        __m128i v_level = maxX == 1 ? _mm_loadl_epi64( (__m128i const*) & piQCoef[y * piQCfStride] ) : _mm_loadu_si128( (__m128i const*) & piQCoef[y * piQCfStride] );
         v_level = _mm_packs_epi32 (v_level,v_level);
         v_level = _mm_and_si128(v_level,vlevmask);
         v_level = _mm_max_epi16 (v_level, v_min);
@@ -218,7 +218,10 @@ static void DeQuantCoreSIMD(const int maxX,const int maxY,const int scale,const
 
         v_level = _mm_max_epi32 (v_level, v_Tmin);
         v_level = _mm_min_epi32 (v_level, v_Tmax);
-        _mm_storeu_si128(( __m128i * )(piCoef+y*width ), v_level );
+        if( maxX == 1 )
+          _mm_storel_epi64( (__m128i*)(piCoef + y * width), v_level );
+        else
+          _mm_storeu_si128( (__m128i*)(piCoef + y * width), v_level );
       }
     }
     else

diff --git a/source/Lib/DecoderLib/DecLib.cpp b/source/Lib/DecoderLib/DecLib.cpp
@@ -156,6 +156,11 @@ bool tryDecodePicture( Picture* pcEncPic, const int expectedPoc, const std::stri
               {
                 if( pic->poc == poc && (!bDecodeUntilPocFound || expectedPoc == poc ) )
                 {
+                  pcEncPic->createTempBuffers( pic->cs->pcv->maxCUSize );
+                  pcEncPic->cs->createCoeffs();
+                  pcEncPic->cs->createTempBuffers( true );
+                  pcEncPic->cs->initStructData();
+
                   CHECK( pcEncPic->slices.size() == 0, "at least one slice should be available" );
 
                   CHECK( expectedPoc != poc, "mismatch in POC - check encoder configuration" );
@@ -669,9 +674,10 @@ void DecLib::finishPicture(int& poc, PicList*& rpcListPic, MsgLevel msgl )
   m_maxDecSubPicIdx = 0;
   m_maxDecSliceAddrInSubPic = -1;
 
-  m_pic->destroyTempBuffers();
-  m_pic->cs->destroyCoeffs();
   m_pic->cs->releaseIntermediateData();
+  m_pic->cs->destroyTempBuffers();
+  m_pic->cs->destroyCoeffs();
+  m_pic->destroyTempBuffers();
   m_pic->cs->picHeader->initPicHeader();
 }
 
@@ -1007,6 +1013,8 @@ void DecLib::xActivateParameterSets( const int layerId)
 
     m_pic->createTempBuffers( m_pic->cs->pps->pcv->maxCUSize );
     m_pic->cs->createCoeffs();
+    m_pic->cs->createTempBuffers( true );
+    m_pic->cs->initStructData();
 
     m_pic->allocateNewSlice();
     // make the slice-pilot a real slice, and set up the slice-pilot for the next slice

diff --git a/source/Lib/EncoderLib/BitAllocation.cpp b/source/Lib/EncoderLib/BitAllocation.cpp
@@ -339,7 +339,7 @@ int BitAllocation::applyQPAdaptationChroma (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr || optChromaQPOffset == nullptr || encCfg->m_usePerceptQPA > 4) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const int          bitDepth = slice->sps->bitDepths[CH_L];
 
@@ -394,6 +394,7 @@ int BitAllocation::applyQPAdaptationChroma (const Slice* slice, const EncCfg* en
 }
 
 int BitAllocation::applyQPAdaptationLuma (const Slice* slice, const EncCfg* encCfg, const int savedQP, const double lambda, std::vector<int>& ctuPumpRedQP,
+                                          const bool forceFrameWiseQPA,
                                           const uint32_t ctuStartAddr, const uint32_t ctuBoundingAddr, const bool isHDR /*= false*/)
 {
   Picture* const pic          = (slice != nullptr ? slice->pic : nullptr);
@@ -403,9 +404,9 @@ int BitAllocation::applyQPAdaptationLuma (const Slice* slice, const EncCfg* encC
 
   if (pic == nullptr || pic->cs == nullptr || encCfg == nullptr || ctuStartAddr >= ctuBoundingAddr) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
-  const bool useFrameWiseQPA  = (encCfg->m_QP > MAX_QP_PERCEPT_QPA);
+  const bool useFrameWiseQPA  = (encCfg->m_QP > MAX_QP_PERCEPT_QPA) || forceFrameWiseQPA;
   const int          bitDepth = slice->sps->bitDepths[CH_L];
   const int           sliceQP = (savedQP < 0 ? slice->sliceQp : savedQP);
   const PreCalcValues&    pcv = *pic->cs->pcv;
@@ -588,7 +589,7 @@ int BitAllocation::applyQPAdaptationSubCtu (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr) return -1;
 
-  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRBasedQPA  = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighResolution = (encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const int         bitDepth  = slice->sps->bitDepths[CH_L];
   const PosType     guardSize = (isHighResolution ? 2 : 1);
@@ -664,7 +665,7 @@ double BitAllocation::getPicVisualActivity (const Slice* slice, const EncCfg* en
 
   if (pic == nullptr || encCfg == nullptr) return 0.0;
 
-  const bool isXPSNRQPA = (encCfg->m_usePerceptQPA & 1) == 0 && encCfg->m_RCNumPasses != 2;
+  const bool isXPSNRQPA = (encCfg->m_usePerceptQPA & 1) == 0 && (encCfg->m_RCRateControlMode == 0 || encCfg->m_RCNumPasses != 2);
   const bool isHighRes  = ( encCfg->m_SourceWidth > 2048 || encCfg->m_SourceHeight > 1280 ) && ( encCfg->m_usePerceptQPA & 1 ) == 0;
   const CPelBuf picOrig = (origBuf != nullptr ? *origBuf : pic->getOrigBuf (COMP_Y));
   const CPelBuf picPrv1 = (isXPSNRQPA ? pic->getOrigBufPrev (COMP_Y, false) : picOrig);

diff --git a/source/Lib/EncoderLib/BitAllocation.h b/source/Lib/EncoderLib/BitAllocation.h
@@ -65,7 +65,7 @@ namespace vvenc {
                                  std::vector<int>& ctuPumpRedQP,
                                  int optChromaQPOffset[2], const bool isHDR = false);
     int applyQPAdaptationLuma   (const Slice* slice, const EncCfg* encCfg, const int savedQP, const double lambda,
-                                 std::vector<int>& ctuPumpRedQP,
+                                 std::vector<int>& ctuPumpRedQP, const bool forceFrameWiseQPA,
                                  const uint32_t ctuStartAddr, const uint32_t ctuBoundingAddr, const bool isHDR = false);
     int applyQPAdaptationSubCtu (const Slice* slice, const EncCfg* encCfg, const Area& lumaArea, const bool isHDR = false);
     int getCtuPumpingReducingQP (const Slice* slice, const CPelBuf& origY, const Distortion uiSadBestForQPA,

diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
@@ -362,11 +362,6 @@ void EncLib::xSetRCEncCfg( int pass )
     // restore MCTF
     m_cBckCfg.m_MCTF              = mctf;
 
-    // configure QPA in the first pass
-    m_cBckCfg.m_usePerceptQPA                  = 0; // disable QPA in the first pass
-    m_cBckCfg.m_sliceChromaQpOffsetPeriodicity = 0;
-    m_cBckCfg.m_usePerceptQPATempFiltISlice    = 0;
-
     std::swap( const_cast<EncCfg&>(m_cEncCfg), m_cBckCfg );
   }
 }
@@ -603,9 +598,6 @@ void EncLib::xInitPicture( Picture& pic, int picNum, const PPS& pps, const SPS&
   pic.vps = &vps;
   pic.dci = &dci;
 
-  pic.createTempBuffers( pic.cs->pps->pcv->maxCUSize );
-  pic.cs->createCoeffs();
-
   // filter data initialization
   const uint32_t numberOfCtusInFrame = pic.cs->pcv->sizeInCtus;