From ac10055ff13dbc0f4035a4bf0b6b94cccdd9bc3d Mon Sep 17 00:00:00 2001
From: Adam Wieckowski <70575289+adamjw24@users.noreply.github.com>
Date: Thu, 8 Feb 2024 14:04:02 +0100
Subject: [PATCH] IFP and RC harmonization, fixes for ARM, early GCC 11 (#350)

---
 .github/workflows/Build.yml                   |   2 +-
 .gitlab-ci-internal.yml                       |   2 +
 CMakeLists.txt                                |  29 +++-
 Makefile                                      |   4 +
 cmake/modules/vvencCompilerSupport.cmake      |  18 ++
 include/vvenc/vvencCfg.h                      |  14 +-
 source/Lib/CommonLib/CommonDef.h              |   2 -
 source/Lib/CommonLib/DepQuant.cpp             |   6 +-
 source/Lib/CommonLib/DepQuant.h               |   4 +-
 source/Lib/CommonLib/InterPrediction.cpp      |  16 +-
 source/Lib/CommonLib/InterPrediction.h        |   6 +-
 source/Lib/CommonLib/Picture.cpp              |   5 +
 source/Lib/CommonLib/Picture.h                |   2 +
 source/Lib/CommonLib/Slice.cpp                |  19 ++-
 source/Lib/CommonLib/Slice.h                  |   3 +-
 source/Lib/CommonLib/TimeProfiler.h           |   1 +
 source/Lib/CommonLib/TrQuant.cpp              |   2 +-
 source/Lib/CommonLib/TypeDef.h                |   2 +
 source/Lib/CommonLib/UnitTools.cpp            |  10 +-
 source/Lib/CommonLib/UnitTools.h              |   5 +-
 source/Lib/CommonLib/arm/BufferARM.h          |  18 +-
 source/Lib/CommonLib/arm/RdCostARM.h          | 103 +-----------
 source/Lib/CommonLib/x86/CommonDefX86.cpp     |   4 +-
 source/Lib/CommonLib/x86/DepQuantX86.h        |  10 --
 source/Lib/CommonLib/x86/FixMissingIntrin.h   |   2 +
 .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp  |  14 +-
 source/Lib/EncoderLib/EncCu.cpp               |  28 ++--
 source/Lib/EncoderLib/EncGOP.cpp              | 158 ++++++++++++++----
 source/Lib/EncoderLib/EncGOP.h                |   2 +
 source/Lib/EncoderLib/EncLib.cpp              |   8 +
 source/Lib/EncoderLib/EncSlice.cpp            |  26 +--
 source/Lib/EncoderLib/GOPCfg.cpp              |  18 +-
 source/Lib/EncoderLib/GOPCfg.h                |  10 +-
 source/Lib/EncoderLib/InterSearch.cpp         |  70 ++++----
 source/Lib/EncoderLib/InterSearch.h           |   6 +-
 source/Lib/EncoderLib/PreProcess.cpp          |  12 +-
 source/Lib/apputils/VVEncAppCfg.h             |  24 ++-
 source/Lib/vvenc/vvencCfg.cpp                 |  71 +++++++-
 source/Lib/vvenc/vvencimpl.cpp                |   6 +
 39 files changed, 462 insertions(+), 280 deletions(-)

diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml
index 708003b69..05d41e438 100644
--- a/.github/workflows/Build.yml
+++ b/.github/workflows/Build.yml
@@ -73,6 +73,6 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake ..  -DCMAKE_BUILD_TYPE=Release -A "${{ matrix.config.msvc_arch }}"
+        cmake ..  -DCMAKE_BUILD_TYPE=Release -DVVENC_OVERRIDE_COMPILER_CHECK=ON -A "${{ matrix.config.msvc_arch }}"
         cmake --build . --config Release
       shell: cmd
diff --git a/.gitlab-ci-internal.yml b/.gitlab-ci-internal.yml
index 4e918ebf1..9f94dac8d 100644
--- a/.gitlab-ci-internal.yml
+++ b/.gitlab-ci-internal.yml
@@ -214,6 +214,7 @@ test_vc193x_Win32:
   extends: .build_test_msvc_template
   variables:
      MSVC_ARCH: Win32
+     CONFIG_OPTIONS: "-DVVENC_OVERRIDE_COMPILER_CHECK=1"
   tags:
     - vc193x
 
@@ -221,6 +222,7 @@ test_vc193x:
   extends: .build_test_msvc_template
   variables:
      MSVC_ARCH: x64
+     CONFIG_OPTIONS: "-DVVENC_OVERRIDE_COMPILER_CHECK=1"
   tags:
     - vc193x
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5565dcb4c..85e6a783e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,10 +31,31 @@ if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm"
 endif()
 
 # we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86
-set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" )
+set( VVENC_ENABLE_X86_SIMD TRUE                      CACHE BOOL "enable x86 intrinsics" )
 set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" )
 
 include( vvencCompilerSupport )
+check_problematic_compiler( VVENC_PROBLEMATIC_COMPILER "MSVC" 19.38 "" )  # flag MSVC >= 19.38; the empty string means no fixed version is given
+
+if( VVENC_PROBLEMATIC_COMPILER )
+  set( VVENC_OVERRIDE_COMPILER_CHECK      OFF        CACHE BOOL "Build with known problematic compiler version" )
+
+  if( VVENC_OVERRIDE_COMPILER_CHECK )  # user explicitly accepted the compiler: only warn
+    set( VVENC_PROBLEMATIC_COMPILER_MSG_TYPE     WARNING )
+    set( VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE
+         "The performance will not be optimal due to workarounds." )
+  else()  # default: abort the configuration and name the override switch
+    set( VVENC_PROBLEMATIC_COMPILER_MSG_TYPE FATAL_ERROR )
+    set( VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE
+         "Set -DVVENC_OVERRIDE_COMPILER_CHECK=ON to build with this compiler anyways, which enables workarounds impacting performance.")
+  endif()
+
+  message( ${VVENC_PROBLEMATIC_COMPILER_MSG_TYPE}  # WARNING or FATAL_ERROR, as selected above
+          "Binaries compiled with ${CMAKE_CXX_COMPILER} version ${CMAKE_CXX_COMPILER_VERSION} are known not to behave as intended. "
+          "The problematic version range is ${VVENC_PROBLEMATIC_COMPILER_VERSION_RANGE}. Please consider using a different compiler.\n"
+          ${VVENC_PROBLEMATIC_COMPILER_MSG_OVERRIDE} )  # trailing hint differs between the two modes
+
+endif()
 
 # enable sse4.1 build for all source files for gcc and clang
 if( VVENC_ENABLE_X86_SIMD )
@@ -81,14 +102,14 @@ endif()
 # enable install target
 set( VVENC_ENABLE_INSTALL                   ON  CACHE BOOL   "Enable or disable install target" )
 
-# enable postfix                                             
+# enable postfix
 set( VVENC_ENABLE_BUILD_TYPE_POSTFIX        OFF CACHE BOOL   "Enable or disable build type postfix for apps and libs" )
 
 set( VVENC_ENABLE_LINK_TIME_OPT             ON  CACHE BOOL   "Enable link time optimization for release and profile builds" )
 
 set( VVENC_ENABLE_THIRDPARTY_JSON           ON  CACHE BOOL   "Enable use of thirdparty json library" )
 
-set( VVENC_INSTALL_FULLFEATURE_APP		    OFF CACHE BOOL   "Install the full-feature app: vvencFFapp" )
+set( VVENC_INSTALL_FULLFEATURE_APP          OFF CACHE BOOL   "Install the full-feature app: vvencFFapp" )
 
 if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR
     CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
@@ -300,7 +321,7 @@ if( VVENC_ENABLE_INSTALL )
     
     set( CMAKE_INSTALL_RPATH ${RPATH_BASE} ${RPATH_BASE}/${RPATH_REL_DIR} )
     message( STATUS "CMAKE_INSTALL_RPATH=${CMAKE_INSTALL_RPATH}" )
-  endif()  
+  endif()
 endif()
 
 
diff --git a/Makefile b/Makefile
index d0da05c8e..3cb0353d0 100644
--- a/Makefile
+++ b/Makefile
@@ -74,6 +74,10 @@ ifneq ($(install-ffapp),)
 CONFIG_OPTIONS += -DVVENC_INSTALL_FULLFEATURE_APP=$(install-ffapp)
 endif
 
+ifneq ($(override-compiler-check),) # forward the value to CMake only when the make variable is non-empty
+CONFIG_OPTIONS += -DVVENC_OVERRIDE_COMPILER_CHECK=$(override-compiler-check)
+endif
+
 ifeq ($(j),)
 # Query cmake for the number of cores
 NUM_JOBS := $(shell cmake -P cmake/modules/vvencNumCores.cmake)
diff --git a/cmake/modules/vvencCompilerSupport.cmake b/cmake/modules/vvencCompilerSupport.cmake
index 4d54f90ea..c6e4d170f 100644
--- a/cmake/modules/vvencCompilerSupport.cmake
+++ b/cmake/modules/vvencCompilerSupport.cmake
@@ -83,3 +83,30 @@ function( _emscripten_enable_wasm_simd128 )
     set( CMAKE_REQUIRED_FLAGS -msimd128 PARENT_SCOPE )
   endif()
 endfunction()
+
+# check_problematic_compiler( <output_var> <compiler_id> <first_bad_version> <first_fixed_version> )
+#
+# Sets <output_var> to TRUE in the caller's scope if the C++ compiler is <compiler_id> and its
+# version lies in the half-open range [<first_bad_version>, <first_fixed_version>). Pass an empty
+# string as <first_fixed_version> if no fixed version is known (open-ended range).
+# On a match, <output_var>_VERSION_RANGE is set to a printable description of that range.
+# If the compiler does not match, neither variable is touched.
+#
+# Example: check_problematic_compiler( VVENC_PROBLEMATIC_COMPILER "MSVC" 19.38 "" )
+function( check_problematic_compiler output_var compiler_id first_bad_version first_fixed_version )
+  if( CMAKE_CXX_COMPILER_ID STREQUAL "${compiler_id}"
+      AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "${first_bad_version}"
+      AND (
+        "${first_fixed_version}" STREQUAL ""
+        OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "${first_fixed_version}" ) )
+
+    set( ${output_var} TRUE PARENT_SCOPE )
+
+    if( "${first_fixed_version}" STREQUAL "" )
+      set( ${output_var}_VERSION_RANGE "[${first_bad_version}...)"                       PARENT_SCOPE )
+    else()
+      set( ${output_var}_VERSION_RANGE "[${first_bad_version}...${first_fixed_version})" PARENT_SCOPE )
+    endif()
+
+  endif()
+endfunction()
diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h
index c1f46ced9..c54e6bc85 100644
--- a/include/vvenc/vvencCfg.h
+++ b/include/vvenc/vvencCfg.h
@@ -430,7 +430,7 @@ typedef struct vvenc_config
   int                 m_framesToBeEncoded;                                               // number of encoded frames (default: 0, all)
   int                 m_inputBitDepth[ 2 ];                                              // bit-depth of input pictures (2d array for luma,chroma)
 
-  int                 m_numThreads;                                                      // number of worker threads ( if <0: <720p 4threads, else 8threads (limited to available cores))
+  int                 m_numThreads;                                                      // number of worker threads ( if <0: <720p 4threads, <5K 2880p 8threads, else 12threads (limited to available cores))
 
   int                 m_QP;                                                              // QP value of key-picture (0-63, default: 32)
   int                 m_RCTargetBitrate;                                                 // target bitrate in bps (default: 0 (RC disabled))
@@ -761,7 +761,7 @@ typedef struct vvenc_config
   bool                m_picReordering;
   bool                m_reservedFlag;
   bool                m_poc0idr;
-  int8_t              m_fppLinesSynchro;
+  int8_t              m_ifpLines;
   bool                m_blockImportanceMapping;
   bool                m_saoScc;
   bool                m_addGOP32refPics;
@@ -774,8 +774,14 @@ typedef struct vvenc_config
                                                                                          // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate).
                                                                                          // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate.
                                                                                          // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier
-  int                 m_forceScc;
-  double              m_reservedDouble[9];
+  int8_t              m_forceScc;
+  bool                m_ifp;
+
+  int8_t              m_reservedInt8[2];
+
+  int                 m_minIntraDist;
+  int                 m_reservedInt;
+  double              m_reservedDouble[8];
 
   // internal state variables
   bool                m_configDone;                                                      // state variable, Private context used for internal data ( do not change )
diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index b804a2e49..e5f5ea64d 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -501,8 +501,6 @@ static constexpr uint8_t MAX_TMP_BUFS = 6;
 
 static constexpr int QPA_MAX_NOISE_LEVELS = 8;
 
-static constexpr int FPPLS_ALF_DERIVE_LINES   = 1; ///< number of CTU lines for ALF filter derivation
-static constexpr int FPPLS_CCALF_DERIVE_LINES = 1; ///< number of CTU lines for CCALF filter derivation
 
 
 // ====================================================================================================================
diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp
index a77964072..76b7e99de 100644
--- a/source/Lib/CommonLib/DepQuant.cpp
+++ b/source/Lib/CommonLib/DepQuant.cpp
@@ -1436,7 +1436,7 @@ void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffB
     const uint32_t    log2TrHeight    = Log2(height);
     const bool isLfnstApplied         = tu.cu->lfnstIdx > 0 && (CU::isSepTree(*tu.cu) ? true : isLuma(compID));
     const bool enableScalingLists     = getUseScalingList(width, height, (tu.mtsIdx[compID] == MTS_SKIP), isLfnstApplied);
-    static_cast<DQIntern::DepQuant*>(p)->quant( tu, pSrc, compID, cQP, Quant::m_dLambda, ctx, uiAbsSum, enableScalingLists, Quant::getQuantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) );
+    p->quant( tu, pSrc, compID, cQP, Quant::m_dLambda, ctx, uiAbsSum, enableScalingLists, Quant::getQuantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) );
   }
   else
   {
@@ -1460,7 +1460,7 @@ void DepQuant::dequant( const TransformUnit& tu, CoeffBuf& dstCoeff, const Compo
     const uint32_t    log2TrHeight   = Log2(height);
     const bool isLfnstApplied        = tu.cu->lfnstIdx > 0 && (CU::isSepTree(*tu.cu) ? true : isLuma(compID));
     const bool enableScalingLists    = getUseScalingList(width, height, (tu.mtsIdx[compID] == MTS_SKIP), isLfnstApplied);
-    static_cast<DQIntern::DepQuant*>(p)->dequant( tu, dstCoeff, compID, cQP, enableScalingLists, Quant::getDequantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) );
+    p->dequant( tu, dstCoeff, compID, cQP, enableScalingLists, Quant::getDequantCoeff(scalingListType, qpRem, log2TrWidth, log2TrHeight) );
   }
   else
   {
@@ -1472,7 +1472,7 @@ void DepQuant::init( int rdoq, bool useRDOQTS, int thrVal )
 {
   QuantRDOQ2::init( rdoq, useRDOQTS, thrVal );
 
-  static_cast<DQIntern::DepQuant*>(p)->init( thrVal );
+  p->init( thrVal );
 }
 
 } // namespace vvenc
diff --git a/source/Lib/CommonLib/DepQuant.h b/source/Lib/CommonLib/DepQuant.h
index ba8de5339..3cb56318a 100644
--- a/source/Lib/CommonLib/DepQuant.h
+++ b/source/Lib/CommonLib/DepQuant.h
@@ -241,8 +241,8 @@ class DepQuantImpl
 public:
   virtual ~DepQuantImpl() {}
   virtual void quant   ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ) = 0;
-  virtual void dequant ( const TransformUnit& tu,  CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP,                                                      bool enableScalingLists, int* quantCoeff );
-  virtual void init    ( int dqTrVal );
+  void         dequant ( const TransformUnit& tu,  CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP,                                                      bool enableScalingLists, int* quantCoeff );
+  void         init    ( int dqTrVal );
 
 protected:
   DQIntern::Quantizer  m_quant;
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 8d9e1c6ee..0f5a55216 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -246,7 +246,7 @@ void InterPrediction::destroy()
   m_IBCBuffer.destroy();
 }
 
-void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int ctuSize, const int fppLinesSynchro )
+void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int ctuSize, const int ifpLines )
 {
   // if it has been initialised before, but the chroma format has changed, release the memory and start again.
   if( m_yuvPred[L0].getOrigin( COMP_Y ) != nullptr && m_currChromaFormat != chFormat )
@@ -279,7 +279,7 @@ void InterPrediction::init( RdCost* pcRdCost, ChromaFormat chFormat, const int c
     m_IBCBufferWidth = g_IBCBufferSize / ctuSize;
     m_IBCBuffer.create(UnitArea(chFormat, Area(0, 0, m_IBCBufferWidth, ctuSize)));
   }
-  InterPredInterpolation::m_fppLinesSynchro = fppLinesSynchro;
+  InterPredInterpolation::m_ifpLines = ifpLines;
 }
 
 // ====================================================================================================================
@@ -615,7 +615,7 @@ InterPredInterpolation::InterPredInterpolation()
   , m_skipPROF(false)
   , m_encOnly(false)
   , m_isBi(false)
-  , m_fppLinesSynchro(0)
+  , m_ifpLines(0)
 {
 
 }
@@ -727,7 +727,7 @@ void InterPredInterpolation::xPredInterBlk ( const ComponentID compID, const Cod
 
   bool  wrapRef = false;
   Mv    mv(_mv);
-  CHECKD( m_fppLinesSynchro && !srcPadBuf && !CU::isMvInRangeFPP( cu[compID].y, cu[compID].height, mv.ver, m_fppLinesSynchro, *cu.cs->pcv, getComponentScaleY(compID, chFmt) ), "xPredInterBlk: CTU line-wise FPP MV restriction failed!\n" );
+  CHECKD( m_ifpLines && !srcPadBuf && cu.cs->picture != refPic && !CU::isMvInRangeFPP( cu[compID].y, cu[compID].height, mv.ver, m_ifpLines, *cu.cs->pcv, getComponentScaleY(compID, chFmt) ), "xPredInterBlk: CTU line-wise FPP MV restriction failed!\n" );
   if( !isIBC && cu.cs->pcv->wrapArround )
   {
     wrapRef = wrapClipMv( mv, cu.blocks[0].pos(), cu.blocks[0].size(), *cu.cs);
@@ -1796,7 +1796,7 @@ void InterPredInterpolation::xPredAffineBlk(const ComponentID compID, const Codi
         iMvScaleTmpVer = curMv.ver;
       }
 
-      CHECKD( m_fppLinesSynchro && !CU::isMvInRangeFPP( puY + h, blockHeight, iMvScaleTmpVer, m_fppLinesSynchro, *pps.pcv, iScaleY ), "xPredAffineBlk: FPP MV restriction failed!\n" );
+      CHECKD( m_ifpLines && !CU::isMvInRangeFPP( puY + h, blockHeight, iMvScaleTmpVer, m_ifpLines, *pps.pcv, iScaleY ), "xPredAffineBlk: FPP MV restriction failed!\n" );
       // get the MV in high precision
       int xFrac, yFrac, xInt, yInt;
 
@@ -1896,10 +1896,10 @@ void InterPredInterpolation::xPredAffineBlk(const ComponentID compID, const Codi
 
 }
 
-bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const Mv* _mv, const int fppLinesSynchro, const int mvPrecShift )
+bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const Mv* _mv, const int ifpLines, const int mvPrecShift )
 {
   const PreCalcValues& pcv = *cu.cs->pcv;
-  if( cu.ly() >= ( ( pcv.heightInCtus - 1 - fppLinesSynchro ) << pcv.maxCUSizeLog2 ) )
+  if( cu.ly() >= ( ( pcv.heightInCtus - 1 - ifpLines ) << pcv.maxCUSizeLog2 ) )
     return true;
 
   const ChromaFormat chFmt = cu.chromaFormat;
@@ -1941,7 +1941,7 @@ bool InterPredInterpolation::xIsAffineMvInRangeFPP( const CodingUnit &cu, const
   }
   const bool subblkMVSpreadOverLimit = InterPrediction::isSubblockVectorSpreadOverLimit(iDMvHorX, iDMvHorY, iDMvVerX, iDMvVerY, cu.interDir);
 
-  const int yRefMax     = ( ( ( cu.ly() >> pcv.maxCUSizeLog2 ) + fppLinesSynchro + 1 ) << pcv.maxCUSizeLog2 ) - 1;
+  const int yRefMax     = ( ( ( cu.ly() >> pcv.maxCUSizeLog2 ) + ifpLines + 1 ) << pcv.maxCUSizeLog2 ) - 1;
   const int dctifMarginVerBot = 4;
 
   auto roundMvVal = [&](int mvVal, int shift)
diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
index 0c836f355..b50d96603 100644
--- a/source/Lib/CommonLib/InterPrediction.h
+++ b/source/Lib/CommonLib/InterPrediction.h
@@ -82,7 +82,7 @@ class InterPredInterpolation
   InterpolationFilter  m_if;
   Pel*                 m_filteredBlock        [LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][MAX_NUM_COMP];
   Pel*                 m_filteredBlockTmp     [LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS_SIGNAL][MAX_NUM_COMP];
-  int                  m_fppLinesSynchro;
+  int                  m_ifpLines;
 
   int  xRightShiftMSB         ( int numer, int denom );
   void xApplyBDOF             ( PelBuf& yuvDst, const ClpRng& clpRng );
@@ -122,7 +122,7 @@ class InterPredInterpolation
                             PelUnitBuf &predDst, PelUnitBuf &predSrc0, PelUnitBuf &predSrc1);
 
   static bool isSubblockVectorSpreadOverLimit(int a, int b, int c, int d, int predType);
-  bool xIsAffineMvInRangeFPP (const CodingUnit& cu, const Mv* _mv, const int fppLinesSynchro, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL);
+  bool xIsAffineMvInRangeFPP (const CodingUnit& cu, const Mv* _mv, const int ifpLines, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL);
 };
 
 class DMVR : public InterPredInterpolation
@@ -171,7 +171,7 @@ class InterPrediction : public DMVR
   InterPrediction();
   virtual ~InterPrediction();
 
-  void    init                  ( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, const int fppLinesSynchro = 0 );
+  void    init                  ( RdCost* pcRdCost, ChromaFormat chromaFormatIDC, const int ctuSize, const int ifpLines = 0 );
   void    destroy               ();
 
   // inter
diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp
index 730863722..c011dacd1 100644
--- a/source/Lib/CommonLib/Picture.cpp
+++ b/source/Lib/CommonLib/Picture.cpp
@@ -161,6 +161,7 @@ Picture::Picture()
     , isFinished        ( false )
     , isLongTerm        ( false )
     , isFlush           ( false )
+    , isInProcessList   ( false )
     , precedingDRAP     ( false )
     , gopEntry          ( nullptr )
     , refCounter        ( 0 )
@@ -226,6 +227,7 @@ void Picture::reset()
   isFinished          = false;
   isLongTerm          = false;
   isFlush             = false;
+  isInProcessList     = false;
   isMeanQPLimited     = false;
   precedingDRAP       = false;
 
@@ -236,6 +238,9 @@ void Picture::reset()
   gopAdaptedQP        = 0;
   actualHeadBits      = 0;
   actualTotalBits     = 0;
+  encRCPic            = nullptr;
+  picApsGlobal        = nullptr;
+  refApsGlobal        = nullptr;
 
   std::fill_n( m_sharedBufs, (int)NUM_PIC_TYPES, nullptr );
   std::fill_n( m_bufsOrigPrev, NUM_QPA_PREV_FRAMES, nullptr );
diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h
index a6c052bdb..46a2a54c2 100644
--- a/source/Lib/CommonLib/Picture.h
+++ b/source/Lib/CommonLib/Picture.h
@@ -114,6 +114,7 @@ struct PicApsGlobal{
   int      poc;
   unsigned tid;
   bool     initalized = false;
+  int      refCnt = 0;
   ParameterSetMap<APS> apsMap;
   PicApsGlobal( int _p ) : poc(_p), tid(MAX_UINT), apsMap( MAX_NUM_APS * MAX_NUM_APS_TYPE ) {}
   PicApsGlobal( int _p, unsigned _t ) : poc(_p), tid(_t), apsMap( MAX_NUM_APS * MAX_NUM_APS_TYPE ) {}
@@ -216,6 +217,7 @@ struct Picture : public UnitArea
   bool                          isFinished;
   bool                          isLongTerm;
   bool                          isFlush;
+  bool                          isInProcessList;
   bool                          precedingDRAP; // preceding a DRAP picture in decoding order
 
   const GOPEntry*               gopEntry;
diff --git a/source/Lib/CommonLib/Slice.cpp b/source/Lib/CommonLib/Slice.cpp
index e3643fdbf..5c15f6039 100644
--- a/source/Lib/CommonLib/Slice.cpp
+++ b/source/Lib/CommonLib/Slice.cpp
@@ -443,7 +443,7 @@ void Slice::updateRefPicCounter( int step )
   }
 }
 
-bool Slice::checkRefPicsReconstructed() const
+bool Slice::checkAllRefPicsReconstructed() const
 {
   for ( int refList = 0; refList < NUM_REF_PIC_LIST_01; refList++ )
   {
@@ -460,6 +460,23 @@ bool Slice::checkRefPicsReconstructed() const
   return true;
 }
 
+bool Slice::checkAllRefPicsAccessible() const // false as soon as one active ref pic lacks isInProcessList
+{
+  for ( int listIdx = 0; listIdx < NUM_REF_PIC_LIST_01; listIdx++ )
+  {
+    const int activeRefs = numRefIdx[ listIdx ];
+    for ( int refIdx = 0; refIdx < activeRefs; refIdx++ )
+    {
+      const bool accessible = refPicList[ listIdx ][ refIdx ]->isInProcessList;
+      if ( ! accessible )
+      {
+        return false;
+      }
+    }
+  }
+  // every active entry of every list carries the flag
+  return true;
+}
+
 void Slice::checkColRefIdx(uint32_t curSliceSegmentIdx, const Picture* pic) const
 {
   Slice* curSlice   = pic->slices[ curSliceSegmentIdx ];
diff --git a/source/Lib/CommonLib/Slice.h b/source/Lib/CommonLib/Slice.h
index 9097a0ade..8e0001979 100644
--- a/source/Lib/CommonLib/Slice.h
+++ b/source/Lib/CommonLib/Slice.h
@@ -1265,7 +1265,8 @@ class Slice
   void                        resetSlicePart();
   void                        constructRefPicList(const PicList& rcListPic, bool extBorder, const bool usingLongTerm = true);
   void                        updateRefPicCounter( int step );
-  bool                        checkRefPicsReconstructed() const;
+  bool                        checkAllRefPicsAccessible() const;
+  bool                        checkAllRefPicsReconstructed() const;
   void                        setRefPOCList();
   void                        setSMVDParam();
   void                        checkColRefIdx(uint32_t curSliceSegmentIdx, const Picture* pic) const;
diff --git a/source/Lib/CommonLib/TimeProfiler.h b/source/Lib/CommonLib/TimeProfiler.h
index c0a50e39d..19955488d 100644
--- a/source/Lib/CommonLib/TimeProfiler.h
+++ b/source/Lib/CommonLib/TimeProfiler.h
@@ -91,6 +91,7 @@ namespace vvenc {
   E_( P_INTRA_CHROMA            ) \
   E_( P_INTRA                   ) \
   E_( P_QUANT                   ) \
+  E_( P_DEQUANT                 ) \
   E_( P_TRAFO                   ) \
   E_( P_RESHAPER                ) \
   E_( P_DEBLOCK_FILTER          ) \
diff --git a/source/Lib/CommonLib/TrQuant.cpp b/source/Lib/CommonLib/TrQuant.cpp
index e75e85063..18f7afd0f 100644
--- a/source/Lib/CommonLib/TrQuant.cpp
+++ b/source/Lib/CommonLib/TrQuant.cpp
@@ -292,7 +292,7 @@ void TrQuant::xDeQuant(const TransformUnit& tu,
                        const ComponentID   &compID,
                        const QpParam       &cQP)
 {
-  PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_QUANT );
+  PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_DEQUANT );
   m_quant->dequant( tu, dstCoeff, compID, cQP );
 }
 
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 5ee3fd5f9..e08aaf3e8 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -71,6 +71,8 @@ namespace vvenc {
 
 #define FIX_FOR_TEMPORARY_COMPILER_ISSUES_ENABLED         1 // Some compilers fail on particular code fragments, remove this when the compiler is fixed (or new version is used)
 
+#define IFP_RC_DETERMINISTIC                              0 // Enables Rate Control deterministic behavior (same results) when using IFP
+
 // ====================================================================================================================
 // General settings
 // ====================================================================================================================
diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp
index 3a046355e..f5175df08 100644
--- a/source/Lib/CommonLib/UnitTools.cpp
+++ b/source/Lib/CommonLib/UnitTools.cpp
@@ -3548,18 +3548,18 @@ bool CU::isMTSAllowed(const CodingUnit &cu, const ComponentID compID)
   return mtsAllowed;
 }
 
-bool CU::isMvInRangeFPP( const int yB, const int nH, const int yMv, const int fppLinesSynchro, const PreCalcValues& pcv, const int chromaShift, const int mvPrecShift )
+bool CU::isMvInRangeFPP( const int yB, const int nH, const int yMv, const int ifpLines, const PreCalcValues& pcv, const int chromaShift, const int mvPrecShift )
 {
   //const int dctifMarginVerBot = 4 >> yCompScale;
   const int ctuLogScale = pcv.maxCUSizeLog2 - chromaShift;
-  const int yBMax       = ( pcv.heightInCtus - 1 - fppLinesSynchro ) << ctuLogScale;
-  const int yRefMax     = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1;
+  const int yBMax       = ( pcv.heightInCtus - 1 - ifpLines ) << ctuLogScale;
+  const int yRefMax     = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1;
   if( yB < yBMax && ( yB + nH + ( 4 >> chromaShift ) + (yMv >> (mvPrecShift + chromaShift) ) - 1 > yRefMax ) )
     return false;
   return true;
 }
 
-bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int fppLinesSynchro )
+bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int ifpLines )
 {
   const CMotionBuf mb = cu.getMotionBuf();
   const ComponentID compID = COMP_Y;
@@ -3584,7 +3584,7 @@ bool CU::isMotionBufInRangeFPP( const CodingUnit &cu, const int fppLinesSynchro
           const Mv& mv = mi.mv[i];
           const int refMaxPosY = cuBottom + dctifMarginVerBot + (mv.ver >> mvPrecShift);
           const int refCtuRow = std::min( (int)((refMaxPosY > 0) ? refMaxPosY >> maxCUSizeShift: -1), (int)(cu.cs->pcv->heightInCtus - 1));
-          if( refCtuRow > ( curCtuRow + fppLinesSynchro ) )
+          if( refCtuRow > ( curCtuRow + ifpLines ) )
             return false;
         }
       }
diff --git a/source/Lib/CommonLib/UnitTools.h b/source/Lib/CommonLib/UnitTools.h
index 9a13fb36f..26cdc7e09 100644
--- a/source/Lib/CommonLib/UnitTools.h
+++ b/source/Lib/CommonLib/UnitTools.h
@@ -181,9 +181,8 @@ namespace CU
   void     getIBCMergeCandidates        (const CodingUnit& cu, MergeCtx& mrgCtx, const int& mrgCandIdx = -1);
   void     fillIBCMvpCand               (CodingUnit& cu, AMVPInfo& amvpInfo);
   void     getIbcMVPsEncOnly            (CodingUnit& cu, Mv* mvPred, int& nbPred);
-  //bool     isMvInRangeFPP               (const CodingUnit &cu, const Mv& mv, const int fppLinesSynchro, const ComponentID compID = COMP_Y, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL );
-  bool     isMvInRangeFPP               (const int yB, const int nH, const int yMv, const int fppLinesSynchro, const PreCalcValues& pcv, const int yCompScale = 0, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL);
-  bool     isMotionBufInRangeFPP        (const CodingUnit& cu, const int fppLinesSynchro);
+  bool     isMvInRangeFPP               (const int yB, const int nH, const int yMv, const int ifpLines, const PreCalcValues& pcv, const int yCompScale = 0, const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL);
+  bool     isMotionBufInRangeFPP        (const CodingUnit& cu, const int ifpLines);
 }
 
 // TU tools
diff --git a/source/Lib/CommonLib/arm/BufferARM.h b/source/Lib/CommonLib/arm/BufferARM.h
index 1d84911d7..d87e60ae8 100644
--- a/source/Lib/CommonLib/arm/BufferARM.h
+++ b/source/Lib/CommonLib/arm/BufferARM.h
@@ -65,7 +65,8 @@ namespace vvenc
 template<ARM_VEXT vext>
 void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, const Pel* lut )
 {
-  if( ( width & 31 ) == 0 )
+
+  if( ( width & 31 ) == 0 && ( height & 3 ) == 0 )
   {
     int16x8x4_t xtmp1;
     int16x8x4_t xtmp2;
@@ -219,7 +220,7 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p
       dst += ( dstStride << 2 );
     }
   }
-  else if( ( width & 15 ) == 0 )
+  else if( ( width & 15 ) == 0 && ( height & 3 ) == 0 )
   {
     int16x8x2_t xtmp1;
     int16x8x2_t xtmp2;
@@ -309,7 +310,7 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p
       dst += ( dstStride << 2 );
     }
   }
-  else if( ( width & 7 ) == 0 )
+  else if( ( width & 7 ) == 0 && ( height & 3 ) == 0 )
   {
     int16x8_t xtmp1;
     int16x8_t xtmp2;
@@ -366,7 +367,16 @@ void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const p
       dst += ( dstStride << 2 );
     }
   }
-
+  else
+  {
+#define RSP_SGNL_OP( ADDR ) dst[ADDR] = lut[src[ADDR]]
+#define RSP_SGNL_INC        src += srcStride; dst += dstStride;
+    
+    SIZE_AWARE_PER_EL_OP( RSP_SGNL_OP, RSP_SGNL_INC )
+    
+#undef RSP_SGNL_OP
+#undef RSP_SGNL_INC
+  }
   return;
 }
 
diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h
index f98b94d74..c70605491 100644
--- a/source/Lib/CommonLib/arm/RdCostARM.h
+++ b/source/Lib/CommonLib/arm/RdCostARM.h
@@ -57,100 +57,6 @@ namespace vvenc
 #ifdef TARGET_SIMD_ARM
 #if __ARM_ARCH >= 8
 
-template<ARM_VEXT vext, bool isWdt16>
-Distortion xGetSAD_MxN_SIMD( const DistParam& rcDtParam )
-{
-  if( rcDtParam.bitDepth > 10 )
-    return isWdt16 ? RdCost::xGetSAD16( rcDtParam ) : RdCost::xGetSAD8( rcDtParam );
-
-  //  assert( rcDtParam.iCols == iWidth);
-  const short*    pSrc1       = (const short*) rcDtParam.org.buf;
-  const short*    pSrc2       = (const short*) rcDtParam.cur.buf;
-  const int       iRows       = rcDtParam.org.height;
-  const int       iSubShift   = rcDtParam.subShift;
-  const ptrdiff_t iStrideSrc1 = rcDtParam.org.stride << iSubShift;
-  const ptrdiff_t iStrideSrc2 = rcDtParam.cur.stride << iSubShift;
-
-  uint32_t uiSum = 0;
-
-  int16x8_t vsum16 = vdupq_n_s16( 0 );
-
-  for( int i = 0; i < ( iRows >> 3 ); i++ )
-  {
-    // 0
-    int16x8_t vsrc1 = vld1q_s16( pSrc1 );
-    int16x8_t vsrc2 = vld1q_s16( pSrc2 );
-
-    vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-
-    if( isWdt16 )
-    {
-      vsrc1 = vld1q_s16( pSrc1 + 8 );
-      vsrc2 = vld1q_s16( pSrc2 + 8 );
-
-      vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-    }
-
-    pSrc1 += iStrideSrc1;
-    pSrc2 += iStrideSrc2;
-
-    // 1
-    vsrc1 = vld1q_s16( pSrc1 );
-    vsrc2 = vld1q_s16( pSrc2 );
-
-    vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-
-    if( isWdt16 )
-    {
-      vsrc1 = vld1q_s16( pSrc1 + 8 );
-      vsrc2 = vld1q_s16( pSrc2 + 8 );
-
-      vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-    }
-
-    pSrc1 += iStrideSrc1;
-    pSrc2 += iStrideSrc2;
-
-    // 2
-    vsrc1 = vld1q_s16( pSrc1 );
-    vsrc2 = vld1q_s16( pSrc2 );
-
-    vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-
-    if( isWdt16 )
-    {
-      vsrc1 = vld1q_s16( pSrc1 + 8 );
-      vsrc2 = vld1q_s16( pSrc2 + 8 );
-
-      vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-    }
-
-    pSrc1 += iStrideSrc1;
-    pSrc2 += iStrideSrc2;
-
-    // 3
-    vsrc1 = vld1q_s16( pSrc1 );
-    vsrc2 = vld1q_s16( pSrc2 );
-
-    vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-
-    if( isWdt16 )
-    {
-      vsrc1 = vld1q_s16( pSrc1 + 8 );
-      vsrc2 = vld1q_s16( pSrc2 + 8 );
-
-      vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 );
-    }
-
-    pSrc1 += iStrideSrc1;
-    pSrc2 += iStrideSrc2;
-  }
-
-  uiSum = vaddlvq_s16( vsum16 );
-  uiSum <<= iSubShift;
-  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
-}
-
 template<ARM_VEXT vext, bool isCalCentrePos>
 void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost )
 {
@@ -221,10 +127,9 @@ void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost )
   if( isCalCentrePos )
     sumTwo = vshrq_n_s32( sumTwo, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) );
 
-  vst1q_lane_u64( (uint64_t*) &cost[ 0 ], (uint64x2_t) sum, 0 );
-  if( isCalCentrePos )
-    cost[ 2 ] = vgetq_lane_s32( sumTwo, 0 );
-  vst1q_lane_u64( (uint64_t*) &cost[ 3 ], (uint64x2_t) sum, 1 );
+  vst1q_s32( (int32_t*) &cost[0], vzip1q_s32( sum, vdupq_n_s32(0) ) );
+  if (isCalCentrePos) cost[2] = (vgetq_lane_s32(sumTwo,0));
+  vst1q_s32( (int32_t*) &cost[3], vzip2q_s32( sum, vdupq_n_s32(0) ) );
 }
 
 template <ARM_VEXT vext>
@@ -245,8 +150,6 @@ void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, b
 template<ARM_VEXT vext>
 void RdCost::_initRdCostARM()
 {
-  m_afpDistortFunc[0][DF_SAD8   ] = xGetSAD_MxN_SIMD<vext, false>;
-  m_afpDistortFunc[0][DF_SAD16  ] = xGetSAD_MxN_SIMD<vext, true>;
 	m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD<vext>;
 }
 
diff --git a/source/Lib/CommonLib/x86/CommonDefX86.cpp b/source/Lib/CommonLib/x86/CommonDefX86.cpp
index 66e441976..6b666d486 100644
--- a/source/Lib/CommonLib/x86/CommonDefX86.cpp
+++ b/source/Lib/CommonLib/x86/CommonDefX86.cpp
@@ -266,7 +266,7 @@ X86_VEXT read_x86_extension_flags( X86_VEXT request )
   static const X86_VEXT max_supported = _get_x86_extensions();
   static X86_VEXT       ext_flags     = max_supported;
 #else
-  static const X86_VEXT max_supported = AVX;                               // disable AVX2 for non-x86 because the SIMD-Everywhere implementation is buggy
+  static const X86_VEXT max_supported = AVX2;                              // non-x86 (SIMD-Everywhere) builds may request SIMD levels up to AVX2
   static X86_VEXT       ext_flags     = SIMD_EVERYWHERE_EXTENSION_LEVEL;   // default to SSE42 for WASM and SIMD-everywhere
 #endif
 
@@ -276,8 +276,6 @@ X86_VEXT read_x86_extension_flags( X86_VEXT request )
     {
 #ifdef REAL_TARGET_X86
       THROW( "requested SIMD level (" << request << ") not supported by current CPU (max " << max_supported << ")." );
-#else
-      THROW( "requested SIMD level (" << request << ") not supported because the SIMD-Everywhere implementation for AVX2 is buggy." );
 #endif
     }
 
diff --git a/source/Lib/CommonLib/x86/DepQuantX86.h b/source/Lib/CommonLib/x86/DepQuantX86.h
index 459170370..77429699a 100644
--- a/source/Lib/CommonLib/x86/DepQuantX86.h
+++ b/source/Lib/CommonLib/x86/DepQuantX86.h
@@ -115,8 +115,6 @@ namespace DQIntern
 
     int      cffBitsCtxOffset;
     bool     anyRemRegBinsLt4;
-    unsigned effWidth;
-    unsigned effHeight;
     int      initRemRegBins;
   };
 
@@ -1157,11 +1155,6 @@ namespace DQIntern
     {
     }
 
-    void init( int dqTrVal )
-    {
-      m_quant.init( dqTrVal );
-    }
-
     void quant( TransformUnit &tu, const CCoeffBuf &srcCoeff, const ComponentID compID, const QpParam &cQP, const double lambda, const Ctx &ctx, TCoeff &absSum, bool enableScalingLists, int *quantCoeff )
     {
       //===== reset / pre-init =====
@@ -1308,8 +1301,6 @@ namespace DQIntern
 
       int effectWidth  = std::min( 32, effWidth );
       int effectHeight = std::min( 32, effHeight );
-      m_state_curr.effWidth         = effectWidth;
-      m_state_curr.effHeight        = effectHeight;
       m_state_curr.initRemRegBins   = ( effectWidth * effectHeight * MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT ) / 16;
       m_state_curr.anyRemRegBinsLt4 = true; // for the first coeff use scalar impl., because it check against the init state, which
                                             // prohibits some paths
@@ -1504,7 +1495,6 @@ namespace DQIntern
 
   private:
     CommonCtx<vext> m_commonCtx;
-    Quantizer       m_quant;
     Decisions       m_trellis[MAX_TB_SIZEY * MAX_TB_SIZEY][2];
     Rom             m_scansRom;
 
diff --git a/source/Lib/CommonLib/x86/FixMissingIntrin.h b/source/Lib/CommonLib/x86/FixMissingIntrin.h
index 7a07c7edb..6ef103748 100644
--- a/source/Lib/CommonLib/x86/FixMissingIntrin.h
+++ b/source/Lib/CommonLib/x86/FixMissingIntrin.h
@@ -83,6 +83,8 @@ static inline __m128i _mm_loadu_si32( const void* p )
 {
   return _mm_cvtsi32_si128( *(int32_t*)p );
 }
+#elif defined( REAL_TARGET_X86 ) && defined( __GNUC__ ) && !defined( __llvm__ ) && !defined( __INTEL_COMPILER ) && ( __GNUC__ < 11 || ( __GNUC__ == 11 && __GNUC_MINOR__ <= 2 ) )
+#define _mm_loadu_si32( p ) _mm_cvtsi32_si128( *(int32_t*)( p ) ) // overrides the compiler-provided _mm_loadu_si32 on real GCC up to and including 11.2
 #endif
 
 #ifdef MISSING_INTRIN_mm_loadu_si64
diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
index d9d0c19eb..b3a0cba2e 100644
--- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
+++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
@@ -1001,7 +1001,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW
   m_CtxCache = &ctxCache;
 
 
-  int alfUnitSize = encCfg.m_fppLinesSynchro ? m_encCfg->m_CTUSize: m_encCfg->m_alfUnitSize;
+  int alfUnitSize = encCfg.m_ifpLines ? m_encCfg->m_CTUSize: m_encCfg->m_alfUnitSize;
   initASU( alfUnitSize );
 
   const int numBins = m_encCfg->m_useNonLinearAlfLuma || m_encCfg->m_useNonLinearAlfChroma ? MaxAlfNumClippingValues : 1;
@@ -1654,7 +1654,7 @@ void EncAdaptiveLoopFilter::deriveFilter( Picture& pic, CodingStructure& cs, con
   {
     return;
   }
-  const int numAsus = m_encCfg->m_fppLinesSynchro && numCtus != m_numAsusInPic ? numCtus: m_numAsusInPic;
+  const int numAsus = m_encCfg->m_ifpLines && numCtus != m_numAsusInPic ? numCtus: m_numAsusInPic;
 
   initCABACEstimator( cs.slice );
 
@@ -1740,7 +1740,7 @@ void EncAdaptiveLoopFilter::deriveFilter( Picture& pic, CodingStructure& cs, con
   m_CABACEstimator->getCtx() = AlfCtx( ctxStart );
   alfEncoderCtb( cs, alfParam, lambdaChromaWeight, numAsus, numCtus );
 
-  if( m_encCfg->m_fppLinesSynchro )
+  if( m_encCfg->m_ifpLines )
   {
     reconstructCoeffFixedAPSs( cs, !cs.slice->lumaApsId.empty() && cs.slice->alfEnabled[COMP_Y],
       cs.slice->chromaApsId >= 0 && (cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr]), true );
@@ -1986,11 +1986,11 @@ void EncAdaptiveLoopFilter::initEncProcess( Slice& slice )
   }
 
   // NOTE: ALF is here enabled per default. However it can be disabled during filter derivation part.
-  //       In line synchronized FPP mode, it cannot be disabled.
+  //       In line-synchronized IFP mode, it cannot be disabled.
   slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = slice.sps->alfEnabled;
   slice.ccAlfCbEnabled = slice.ccAlfCrEnabled = slice.sps->ccalfEnabled;
 
-  if( m_encCfg->m_fppLinesSynchro )
+  if( m_encCfg->m_ifpLines )
   {
     // CCALF
    m_ccAlfFilterParam.ccAlfFilterEnabled[0] = slice.ccAlfCbEnabled;
@@ -4936,8 +4936,8 @@ void  EncAdaptiveLoopFilter::alfEncoderCtb( CodingStructure& cs, AlfParam& alfPa
   cs.slice->ccAlfCbApsId = newApsId;
   cs.slice->ccAlfCrApsId = newApsId;
 
-  // in case of FPP line synchro, we always trying to use ALF (with final decision at CTU level)
-  if (costOff <= costMin && !m_encCfg->m_fppLinesSynchro)
+  // in case of IFP lines synchro, we always try to use ALF (with the final decision made at CTU level)
+  if (costOff <= costMin && !m_encCfg->m_ifpLines)
   {
     memset( cs.slice->alfEnabled, 0, sizeof( cs.slice->alfEnabled ) );
     cs.slice->numAps = (0);
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index 8e1e10425..3b962c03f 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -1715,9 +1715,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC
           continue;
         }
         mergeCtx.setMergeInfo( cu, uiMergeCand );
-        if( m_pcEncCfg->m_fppLinesSynchro && 
-         (  ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) ||
-            ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+        if( m_pcEncCfg->m_ifpLines && 
+         (  ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ||
+            ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
           ) )
         {
           // skip candidate
@@ -1907,9 +1907,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC
             continue;
           }
           mergeCtx.setMmvdMergeCandiInfo(cu, mmvdMergeCand);
-          if( m_pcEncCfg->m_fppLinesSynchro &&
-            ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) ||
-              ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+          if( m_pcEncCfg->m_ifpLines &&
+            ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ||
+              ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
             ) )
           {
             // skip candidate
@@ -2124,9 +2124,9 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC
         tempCS->initStructData(encTestMode.qp);
         continue;
       }
-      if( m_pcEncCfg->m_fppLinesSynchro && !m_pcEncCfg->m_useFastMrg &&
-        ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) ||
-          ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+      if( m_pcEncCfg->m_ifpLines && !m_pcEncCfg->m_useFastMrg &&
+        ( ( cu.refIdx[L0] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L0][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) ||
+          ( cu.refIdx[L1] >= 0 && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cu.mv[L1][0].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
         ) )
       {
         // skip candidate
@@ -2392,10 +2392,10 @@ void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bes
         continue;
       }
 
-      if( m_pcEncCfg->m_fppLinesSynchro ) 
+      if( m_pcEncCfg->m_ifpLines ) 
       {
-        skipCandFpp[L0][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 0].mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv );
-        skipCandFpp[L1][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 1].mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv );
+        skipCandFpp[L0][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 0].mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv );
+        skipCandFpp[L1][mergeCand] = !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mergeCtx.mvFieldNeighbours[(mergeCand << 1) + 1].mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv );
         if( skipCandFpp[L0][mergeCand] || skipCandFpp[L1][mergeCand] )
           continue;
       }
@@ -2638,7 +2638,7 @@ void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bes
       cu.mmvdMergeIdx     = MAX_UINT;
 
       CU::spanGeoMotionInfo(cu, mergeCtx, cu.geoSplitDir, cu.geoMergeIdx0, cu.geoMergeIdx1);
-      if( m_pcEncCfg->m_fppLinesSynchro && 
+      if( m_pcEncCfg->m_ifpLines && 
         ( skipCandFpp[L0][cu.geoMergeIdx0] || skipCandFpp[L1][cu.geoMergeIdx0] || skipCandFpp[L0][cu.geoMergeIdx1] || skipCandFpp[L1][cu.geoMergeIdx1] ) ) 
       {
         tempCS->initStructData(encTestMode.qp);
@@ -4050,7 +4050,7 @@ bool EncCu::xCheckSATDCostAffineMerge(CodingStructure*& tempCS, CodingUnit& cu,
       CU::spanMotionInfo( cu );
     }
 
-    if( m_pcEncCfg->m_fppLinesSynchro && ( !( CU::isMotionBufInRangeFPP( cu, m_pcEncCfg->m_fppLinesSynchro ) ) ) )
+    if( m_pcEncCfg->m_ifpLines && ( !( CU::isMotionBufInRangeFPP( cu, m_pcEncCfg->m_ifpLines ) ) ) )
     {
       // Do not use this mode
       continue;
diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp
index 78b498021..b1f784412 100644
--- a/source/Lib/EncoderLib/EncGOP.cpp
+++ b/source/Lib/EncoderLib/EncGOP.cpp
@@ -315,7 +315,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList )
   const bool lockStepMode = (m_pcEncCfg->m_RCTargetBitrate > 0 || (m_pcEncCfg->m_LookAhead > 0 && !m_isPreAnalysis)) && (m_pcEncCfg->m_maxParallelFrames > 0);
 
   // get list of pictures to be encoded and used for RC update
-  if( m_procList.empty() && ! m_gopEncListInput.empty() )
+  if( m_procList.empty() && (!m_gopEncListInput.empty() || !m_rcInputReorderList.empty()) )
   {
     xGetProcessingLists( m_procList, m_rcUpdateList, lockStepMode );
   }
@@ -344,7 +344,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList )
         const VVEncCfg* encCfg = m_pcEncCfg;
         auto picItr             = find_if( m_procList.begin(), m_procList.end(), [encCfg]( auto pic ) {
           // if ALF enabled and ALFTempPred is used, ensure that refAps is initialized
-          return ( encCfg->m_fppLinesSynchro || pic->slices[ 0 ]->checkRefPicsReconstructed() )
+          return ( encCfg->m_ifpLines || pic->slices[ 0 ]->checkAllRefPicsReconstructed() )
             && ( !encCfg->m_alf || ( !pic->refApsGlobal || pic->refApsGlobal->initalized ) ); } );
 
         const bool nextPicReady = picItr != m_procList.end();
@@ -398,6 +398,11 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList )
     }
   }
 
+  if( lockStepMode && m_pcEncCfg->m_ifpLines && !m_rcUpdateList.empty() )
+  {
+    xUpdateRcIfp();
+  }
+
   // picture/AU output
   // 
   // in lock-step mode:
@@ -405,7 +410,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList )
   // if the next picture to output belongs to the current chunk, do output (evaluation) when all pictures of the chunk are finished
 
   if( m_gopEncListOutput.empty() || !m_gopEncListOutput.front()->isReconstructed ||
-    ( lockStepMode && !m_rcUpdateList.empty() && m_gopEncListOutput.front() == m_rcUpdateList.front() && !xLockStepPicsFinished() ) )
+    ( lockStepMode && !m_pcEncCfg->m_ifpLines && !m_rcUpdateList.empty() && m_gopEncListOutput.front() == m_rcUpdateList.front() && !xLockStepPicsFinished() ) )
   {
     return;
   }
@@ -420,7 +425,7 @@ void EncGOP::xProcessPictures( AccessUnitList& auList, PicList& doneList )
   // update pending RC
   // first pic has been written to bitstream
   // therefore we have at least for this picture a valid total bit and head bit count
-  if( !m_rcUpdateList.empty() && m_rcUpdateList.front() == outPic )
+  if( !m_rcUpdateList.empty() && m_rcUpdateList.front() == outPic && (!lockStepMode || !m_pcEncCfg->m_ifpLines)  )
   {
     if( m_pcEncCfg->m_RCTargetBitrate > 0 )
     {
@@ -475,6 +480,8 @@ void EncGOP::xSyncAlfAps( Picture& pic )
   if( !refAps )
     return;
   CHECK( !refAps->initalized, "Attempt referencing from an uninitialized APS" );
+  pic.refApsGlobal->refCnt--;
+  CHECK( pic.refApsGlobal->refCnt < 0, "Not expected APS ref. counter\n" );
 
   // copy ref APSs to current picture
   const ParameterSetMap<APS>& src = refAps->apsMap;
@@ -1315,8 +1322,11 @@ void EncGOP::xSetupPicAps( Picture* pic )
   // additional +2 offset, due two max possible processing delay of two GOPs (Threads=1 mode)
   if( m_globalApsList.size() > ( std::max( (int)MAX_NUM_APS, m_pcEncCfg->m_GOPSize ) * ( m_pcEncCfg->m_maxParallelFrames + 2 ) ) )
   {
-    delete m_globalApsList.front();
-    m_globalApsList.pop_front();
+    if( m_globalApsList.front()->refCnt == 0 )
+    {
+      delete m_globalApsList.front();
+      m_globalApsList.pop_front();
+    }
   }
 
   pic->picApsGlobal = m_globalApsList.back();
@@ -1354,6 +1364,8 @@ void EncGOP::xSetupPicAps( Picture* pic )
       curApsItr--;
       refAps = *curApsItr;
     }
+    if( refAps )
+      refAps->refCnt++;
   }
 
   //CHECK( !refAps, "Faied to get reference APS" );
@@ -1409,41 +1421,128 @@ void EncGOP::xInitPicsInCodingOrder( const PicList& picList )
   CHECK( picList.size() && m_pcEncCfg->m_maxParallelFrames <= 0 && m_gopEncListOutput.size() != 1, "no new picture for encoding found" );
 }
 
+void EncGOP::xUpdateRcIfp()
+{
+  // Rate-control update for IFP lock-step mode: hands the written bit count of reconstructed pictures in m_rcUpdateList to the RC.
+  // deterministic behavior: RC update on the next finished frame in sliding-window coding order; only the one finished frame
+  //                         at the front of the list is evaluated, making room for the next frame, whose parameters can then
+  //                         be set using the finished frame's bit info
+  // non-deterministic behavior: RC update on any finished frame
+
+#if IFP_RC_DETERMINISTIC
+  if( m_rcUpdateList.front()->isReconstructed && m_rcUpdateList.back()->encRCPic && ( m_rcUpdateList.front()->isFlush || m_rcUpdateList.size() == m_pcEncCfg->m_maxParallelFrames ) ) // front finished, last entry has encRCPic set, and list holds m_maxParallelFrames entries (or front has isFlush set)
+  {   
+#endif
+    for( auto it = m_rcUpdateList.begin(); it != m_rcUpdateList.end(); )
+    {
+      auto pic = *it;
+      if( pic->isReconstructed )
+      {
+        pic->actualTotalBits = pic->sliceDataStreams[0].getNumberOfWrittenBits(); // bit count taken from the first slice data stream
+        pic->refCounter--; // counterpart of the refCounter++ done in xGetProcessingLists when the picture was put on the update list
+        m_pcRateCtrl->updateAfterPicEncRC( pic );
+        it = m_rcUpdateList.erase( it ); // erase() yields the iterator of the following entry
+      }
+      else
+      {
+        ++it;
+      }
+#if IFP_RC_DETERMINISTIC
+      // in deterministic case, only one frame (the front entry checked above) is allowed to update the RC
+      break;
+#endif
+    }
+#if IFP_RC_DETERMINISTIC
+  }
+#endif
+}
+
+inline void getReorderedProcList( std::list<Picture*>& inputList, std::list<Picture*>& procList, const int maxSize, bool isIFP )
+{
+  // moves frames from inputList to procList that share the TID (temporal layer) and the GOP number of inputList's front frame and pass the reference-picture check
+  const int procTL = inputList.size() ? inputList.front()->TLayer             : -1; // -1 only when inputList is empty; the loop below then does nothing
+  const int gopNum = inputList.size() ? inputList.front()->gopEntry->m_gopNum : -1;
+  for( auto it = inputList.begin(); it != inputList.end(); )
+  {
+    auto pic = *it;
+    if( pic->gopEntry->m_gopNum == gopNum
+        && pic->TLayer == procTL
+        && ( isIFP ? pic->slices[ 0 ]->checkAllRefPicsAccessible(): pic->slices[ 0 ]->checkAllRefPicsReconstructed() ) ) // isIFP selects which reference-picture check is applied
+    {
+      pic->isInProcessList = true; // flag is set before the picture is appended to procList
+      procList.push_back  ( pic );
+      it = inputList.erase( it ); // erase() yields the iterator of the following entry
+    }
+    else
+    {
+      ++it;
+    }
+    if( (int)procList.size() >= maxSize ) // evaluated after every visited picture, i.e. only after a possible insertion
+      break;
+  }
+}
+
 void EncGOP::xGetProcessingLists( std::list<Picture*>& procList, std::list<Picture*>& rcUpdateList, const bool lockStepMode )
 {
-  // in lockstep mode, process only pics of same temporal layer
+  // in lockstep mode, frames are reordered in a specific processing order
   if( lockStepMode )
   {
-    // start new parallel chunk only, if next output picture is not reconstructed
-    if( rcUpdateList.empty() )
+    if( m_pcEncCfg->m_ifpLines )
     {
-      const int procTL         = m_gopEncListInput.size() ? m_gopEncListInput.front()->TLayer             : -1;
-      const int gopNum         = m_gopEncListInput.size() ? m_gopEncListInput.front()->gopEntry->m_gopNum : -1;
-      const int minSerialDepth = m_pcEncCfg->m_maxParallelFrames > 2 ? 1 : 2;  // up to this temporal layer encode pictures only in serial mode
-      const int maxSize        = procTL <= minSerialDepth ? 1 : m_pcEncCfg->m_maxParallelFrames;
-      for( auto it = m_gopEncListInput.begin(); it != m_gopEncListInput.end(); )
+      // in IFP lockstep mode:
+      // we need an additional reordering list to ensure causality of the coding order (ref.pics) on irregular GOP structures
+      // in the first step, the reordered list is filled
+      // in the second, the frames from reordered list are moved to proc. list up to required update-list size
+      const int maxUpdateListSize = m_pcEncCfg->m_maxParallelFrames;
+      if( rcUpdateList.size() < maxUpdateListSize && ( !m_gopEncListInput.empty() || !m_rcInputReorderList.empty()))
       {
-        auto pic = *it;
-        if( pic->gopEntry->m_gopNum == gopNum
-            && pic->TLayer == procTL
-            && pic->slices[ 0 ]->checkRefPicsReconstructed() )
-        {
-          procList.push_back    ( pic );
-          rcUpdateList.push_back( pic );
-          it = m_gopEncListInput.erase( it );
-        }
-        else
+        while( rcUpdateList.size() < maxUpdateListSize && ( !m_gopEncListInput.empty() || !m_rcInputReorderList.empty()) )
         {
-          ++it;
+          if( !m_rcInputReorderList.empty() )
+          {
+            auto pic = m_rcInputReorderList.front();
+            m_rcInputReorderList.pop_front();
+            pic->refCounter++;
+            procList.push_back( pic );
+            rcUpdateList.push_back( pic );
+          }
+          else
+          {
+            while( m_rcInputReorderList.size() < maxUpdateListSize && !m_gopEncListInput.empty() )
+            {
+              getReorderedProcList( m_gopEncListInput, m_rcInputReorderList, maxUpdateListSize, true );
+            }
+          }
         }
-        if( (int)procList.size() >= maxSize )
-          break;
       }
     }
+    else if( rcUpdateList.empty() )
+    {
+      // retrieve next lockstep chunk
+      const int procTL         = m_gopEncListInput.size() ? m_gopEncListInput.front()->TLayer : -1;
+      const int minSerialDepth = m_pcEncCfg->m_maxParallelFrames > 2 ? 1 : 2;  // up to this temporal layer encode pictures only in serial mode
+      const int maxSize        = procTL <= minSerialDepth ? 1 : m_pcEncCfg->m_maxParallelFrames;
+      getReorderedProcList( m_gopEncListInput, procList, maxSize, false );
+      std::copy( procList.begin(), procList.end(), std::back_inserter(rcUpdateList) );
+    }
   }
   else
   {
-    procList.splice( procList.end(), m_gopEncListInput );
+    if( m_pcEncCfg->m_ifpLines )
+    {
+      // in case of IFP, using the reordered list brings an additional speedup
+      while( !m_gopEncListInput.empty() )
+      {
+        size_t inputListSize = m_gopEncListInput.size();
+        getReorderedProcList( m_gopEncListInput, procList, (int)procList.size() + m_pcEncCfg->m_maxParallelFrames, true );
+        CHECK( m_gopEncListInput.size() == inputListSize, "IFP processing list derivation: attempting to run in a deadlock" );
+      }
+    }
+    else
+    {
+      // just pass the input list to processing list
+      procList.splice( procList.end(), m_gopEncListInput );
+    }
     m_gopEncListInput.clear();
     if( ! m_gopEncListOutput.empty() )
       rcUpdateList.push_back( m_gopEncListOutput.front() );
@@ -1797,8 +1896,6 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod
       alfAPS->ccAlfParam.reset();
     }
   }
-  pic.picApsGlobal = nullptr;
-  pic.refApsGlobal = nullptr;
   CHECK( slice->enableDRAPSEI && m_pcEncCfg->m_maxParallelFrames, "Dependent Random Access Point is not supported by Frame Parallel Processing" );
 
   pic.isInitDone = true;
@@ -2502,7 +2599,6 @@ void EncGOP::xAddPSNRStats( const Picture* pic, CPelUnitBuf cPicD, AccessUnitLis
       }
     }
   }
-
   const uint32_t uibits = numRBSPBytes * 8;
 
   if (m_isPreAnalysis || !m_pcRateCtrl->rcIsFinalPass)
diff --git a/source/Lib/EncoderLib/EncGOP.h b/source/Lib/EncoderLib/EncGOP.h
index 633182318..27c3dccea 100644
--- a/source/Lib/EncoderLib/EncGOP.h
+++ b/source/Lib/EncoderLib/EncGOP.h
@@ -147,6 +147,7 @@ class EncGOP : public EncStage
   std::list<Picture*>       m_gopEncListOutput;
   std::list<Picture*>       m_procList;
   std::list<Picture*>       m_rcUpdateList;
+  std::list<Picture*>       m_rcInputReorderList;  // used in RC in IFP lines synchro mode
   std::deque<PicApsGlobal*> m_globalApsList;
 
   std::vector<int>          m_globalCtuQpVector;
@@ -200,6 +201,7 @@ class EncGOP : public EncStage
   void xSelectReferencePictureList    ( Slice* slice ) const;
   void xSyncAlfAps                    ( Picture& pic );
 
+  void xUpdateRcIfp                   ();
   void xWritePicture                  ( Picture& pic, AccessUnitList& au, bool isEncodeLtRef );
   int  xWriteParameterSets            ( Picture& pic, AccessUnitList& accessUnit, HLSWriter& hlsWriter );
   int  xWritePictureSlices            ( Picture& pic, AccessUnitList& accessUnit, HLSWriter& hlsWriter );
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index c8280ee4c..bf58e6210 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -56,6 +56,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "EncStage.h"
 #include "PreProcess.h"
 #include "EncGOP.h"
+#include "CommonLib/x86/CommonDefX86.h"
 
 //! \ingroup EncoderLib
 //! \{
@@ -111,6 +112,13 @@ void EncLib::initEncoderLib( const vvenc_config& encCfg )
   // copy config parameter
   const_cast<VVEncCfg&>(m_encCfg) = encCfg;
 
+#if defined( REAL_TARGET_X86 ) && defined( _MSC_VER ) && _MSC_VER >= 1938 
+  if( read_x86_extension_flags() >= x86_simd::AVX2 )
+  {
+    msg.log( VVENC_WARNING, "WARNING: MSVC version >= 17.8 produces invalid AVX2 code, partially disabling AVX2!\n" );
+  }
+
+#endif
   // setup modified configs for rate control
   if( m_encCfg.m_RCNumPasses > 1 || m_encCfg.m_LookAhead )
   {
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index 722c3d19d..269596226 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -892,7 +892,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
   const UnitArea& ctuArea        = ctuEncParam->ctuArea;
   const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
   const TaskType currState       = processStates[ ctuRsAddr ];
-  const int syncLines            = encSlice->m_pcEncCfg->m_fppLinesSynchro;
+  const int syncLines            = encSlice->m_pcEncCfg->m_ifpLines;
 
   DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
   DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
@@ -910,7 +910,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
     // encode
     case CTU_ENCODE:
       {
-        // CTU line-wise frame parallel processing synchronization
+        // CTU line-wise inter-frame parallel processing synchronization
         if( syncLines )
         {
           const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
@@ -1161,18 +1161,18 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
         ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
 
         // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
       }
       break;
 
     case ALF_DERIVE_FILTER:
       {
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         if( ctuRsAddr == deriveFilterCtu )
         {
           // ensure statistics from all previous ctu's have been collected
-          int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, FPPLS_ALF_DERIVE_LINES): pcv.heightInCtus;
+          int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus;
           for( int y = 0; y < numCheckLines; y++ )
           {
             for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
@@ -1200,7 +1200,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
           if( ctuRsAddr == deriveFilterCtu )
           {
             encSlice->m_pALF->initDerivation( slice );
-            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES: pcv.sizeInCtus );
+            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus );
             encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
           }
           else if( syncLines )
@@ -1225,7 +1225,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
     case ALF_RECONSTRUCT:
       {
         // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_ALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
           return false;
 
@@ -1277,7 +1277,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
         ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
 
         // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
       }
       break;
@@ -1285,11 +1285,11 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
     case CCALF_DERIVE_FILTER:
       {
         // synchronization dependencies
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         if( ctuRsAddr == deriveFilterCtu )
         {
           // ensure statistics from all previous ctu's have been collected
-          int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, FPPLS_CCALF_DERIVE_LINES): pcv.heightInCtus;
+          int numCheckLines = syncLines ? std::min((int)pcv.heightInCtus, (syncLines + 1)): pcv.heightInCtus;
           for( int y = 0; y < numCheckLines; y++ )
           {
             for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
@@ -1316,7 +1316,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
         {
           if( ctuRsAddr == deriveFilterCtu )
           {
-            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES: pcv.sizeInCtus );
+            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, syncLines ? pcv.widthInCtus * (syncLines + 1): pcv.sizeInCtus );
           }
           else if( syncLines )
           {
@@ -1337,7 +1337,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
     case CCALF_RECONSTRUCT:
       {
         // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
-        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * FPPLS_CCALF_DERIVE_LINES - 1: pcv.sizeInCtus - 1;
+        const unsigned deriveFilterCtu = syncLines ? pcv.widthInCtus * (syncLines + 1) - 1: pcv.sizeInCtus - 1;
         if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
           return false;
 
@@ -1379,7 +1379,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
           if(ctuPosY + 1 == pcv.heightInCtus)
             recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
 
-          // for FPP lines synchro, do an additional increment signaling that CTU row is ready
+          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
           if( syncLines )
             ++(pic->m_tileColsDone->at( ctuPosY ));
         }
diff --git a/source/Lib/EncoderLib/GOPCfg.cpp b/source/Lib/EncoderLib/GOPCfg.cpp
index 0923a5e67..087a50556 100644
--- a/source/Lib/EncoderLib/GOPCfg.cpp
+++ b/source/Lib/EncoderLib/GOPCfg.cpp
@@ -55,7 +55,7 @@ namespace vvenc {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode )
+void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode, int minIntraDist )
 {
   CHECK( gopSize < 1, "gop size has to be greater than 0" );
 
@@ -119,6 +119,8 @@ void GOPCfg::initGopList( int refreshType, bool poc0idr, int intraPeriod, int go
   m_cnOffset     = 0;
   m_numTillGop   = poc0idr ? 0 : (int)m_gopList->size() - 1;
   m_numTillIntra = poc0idr ? 0 : (int)m_gopList->size() - 1;
+  m_minIntraDist = minIntraDist;
+  m_lastIntraPOC = -1;
 }
 
 void GOPCfg::getNextGopEntry( GOPEntry& gopEntry )
@@ -158,6 +160,7 @@ void GOPCfg::getNextGopEntry( GOPEntry& gopEntry )
     gopEntry.m_temporalId     = isTl0 ? 0 : 1;
     gopEntry.m_isStartOfIntra = isStartOfIntra;
     gopEntry.m_isValid        = true;
+    if( isStartOfIntra ) m_lastIntraPOC = m_nextPoc;
 
     // continue with next frame
     m_nextPoc += 1;
@@ -180,6 +183,7 @@ void GOPCfg::getNextGopEntry( GOPEntry& gopEntry )
     gopEntry.m_sliceType      = 'I';
     gopEntry.m_isStartOfIntra = true;
     gopEntry.m_temporalId     = 0;
+    m_lastIntraPOC            = m_nextPoc;
   }
 
   // check for end of current gop
@@ -234,6 +238,7 @@ void GOPCfg::startIntraPeriod( GOPEntry& gopEntry )
   gopEntry.m_isStartOfIntra = true;
   gopEntry.m_isStartOfGop   = true;
   gopEntry.m_temporalId     = 0;
+  m_lastIntraPOC            = gopEntry.m_POC;
 
   // start with first gop list
   m_gopList      = &m_defaultGopLists[ 0 ];
@@ -251,7 +256,7 @@ void GOPCfg::startIntraPeriod( GOPEntry& gopEntry )
   }
 }
 
-void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry ) const
+void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry )
 {
   gopEntry.m_isStartOfGop = true;
   if( gopEntry.m_gopNum == 0 && ! gopEntry.m_isStartOfIntra )
@@ -259,6 +264,7 @@ void GOPCfg::fixStartOfLastGop( GOPEntry& gopEntry ) const
     gopEntry.m_isStartOfIntra = true;
     gopEntry.m_sliceType      = 'I';
     gopEntry.m_temporalId     = 0;
+    m_lastIntraPOC            = gopEntry.m_POC;
   }
 }
 
@@ -275,6 +281,14 @@ void GOPCfg::getDefaultRPLLists( RPLList& rpl0, RPLList& rpl1 ) const
   }
 }
 
+bool GOPCfg::isSTAallowed( int poc ) const
+{
+  const int distToLastIntra = poc - m_lastIntraPOC;  // distance back to the most recently recorded intra POC
+  const int distToNextIntra = m_numTillIntra + 1;    // presumably frames until the next scheduled intra -- TODO confirm against m_numTillIntra bookkeeping
+  // allowed only when both distances reach the configured minimum intra distance
+  return distToLastIntra >= m_minIntraDist && distToNextIntra >= m_minIntraDist;
+}
+
 bool GOPCfg::hasNonZeroTemporalId() const
 {
   return m_maxTid > 0;
diff --git a/source/Lib/EncoderLib/GOPCfg.h b/source/Lib/EncoderLib/GOPCfg.h
index 39504273c..d9e9a93e6 100644
--- a/source/Lib/EncoderLib/GOPCfg.h
+++ b/source/Lib/EncoderLib/GOPCfg.h
@@ -93,6 +93,8 @@ class GOPCfg
     int  m_maxTid;
     int  m_firstPassMode;
     int  m_defaultNumActive[ 2 ];
+    int  m_minIntraDist;
+    int  m_lastIntraPOC;
 
   public:
     GOPCfg( MsgLog& _m )
@@ -115,6 +117,8 @@ class GOPCfg
       , m_maxTid          ( 0 )
       , m_firstPassMode   ( 0 )
       , m_defaultNumActive{ 0, 0 }
+      , m_minIntraDist    ( -1 )
+      , m_lastIntraPOC    ( -1 )
     {
     };
 
@@ -122,17 +126,19 @@ class GOPCfg
     {
     };
 
-    void initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode );
+    void initGopList( int refreshType, bool poc0idr, int intraPeriod, int gopSize, int leadFrames, bool bPicReordering, const vvencGOPEntry cfgGopList[ VVENC_MAX_GOP ], const vvencMCTF& mctfCfg, int firstPassMode, int minIntraDist ); // minIntraDist is stored in m_minIntraDist and evaluated by isSTAallowed()
     void getNextGopEntry( GOPEntry& gopEntry );
     void startIntraPeriod( GOPEntry& gopEntry );
-    void fixStartOfLastGop( GOPEntry& gopEntry ) const;
+    void fixStartOfLastGop( GOPEntry& gopEntry );
     void getDefaultRPLLists( RPLList& rpl0, RPLList& rpl1 ) const;
+    void setLastIntraSTA( int poc ) { m_lastIntraPOC = poc; } // overwrites the last-intra POC; PreProcess::xDetectSTA calls this when it switches a picture to slice type 'I'
 
     int  getMaxTLayer() const                             { return m_maxTid; }
     const std::vector<int>& getMaxDecPicBuffering() const { return m_maxDecPicBuffering; }
     const std::vector<int>& getNumReorderPics() const     { return m_numReorderPics; }
     int  getDefaultNumActive( int l ) const               { return m_defaultNumActive[ l ]; }
 
+    bool isSTAallowed( int poc ) const;
     bool hasNonZeroTemporalId() const;
     bool hasLeadingPictures() const;
     bool isChromaDeltaQPEnabled() const;
diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp
index a8feb7ca3..9233faef0 100644
--- a/source/Lib/EncoderLib/InterSearch.cpp
+++ b/source/Lib/EncoderLib/InterSearch.cpp
@@ -217,7 +217,7 @@ InterSearch::~InterSearch()
 
 void InterSearch::init( const VVEncCfg& encCfg, TrQuant* pTrQuant, RdCost* pRdCost, EncModeCtrl* pModeCtrl, CodingStructure **pSaveCS )
 {
-  InterPrediction::init( pRdCost, encCfg.m_internChromaFormat, encCfg.m_CTUSize, encCfg.m_fppLinesSynchro );
+  InterPrediction::init( pRdCost, encCfg.m_internChromaFormat, encCfg.m_CTUSize, encCfg.m_ifpLines );
   m_numBVs                       = 0;
   m_pcEncCfg                     = &encCfg;
   m_pcTrQuant                    = pTrQuant;
@@ -1198,7 +1198,7 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub
         xCopyAMVPInfo(&aacAMVPInfo[1][bestBiPRefIdxL1], &amvp[REF_PIC_LIST_1]);
         aaiMvpIdxBi[1][bestBiPRefIdxL1] = bestBiPMvpL1;
         cMvPredBi  [1][bestBiPRefIdxL1] = amvp[REF_PIC_LIST_1].mvCand[bestBiPMvpL1];
-        if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cMvPredBi[1][bestBiPRefIdxL1].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+        if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cMvPredBi[1][bestBiPRefIdxL1].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
         {
           // this mvp cannot be used for mv, skip Bi-pred
           uiCostBi = std::numeric_limits<Distortion>::max();
@@ -1390,10 +1390,10 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub
             cCurMvField.setMvField( aacAMVPInfo[curRefList][refIdxCur].mvCand[i], refIdxCur );
             cTarMvField.setMvField( aacAMVPInfo[tarRefList][refIdxTar].mvCand[j], refIdxTar );
             GCC_WARNING_RESET
-            if( m_pcEncCfg->m_fppLinesSynchro )
+            if( m_pcEncCfg->m_ifpLines )
             {
-              xCheckAndClipMvToFppLine( cCurMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv );
-              xCheckAndClipMvToFppLine( cTarMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv );
+              xCheckAndClipMvToFppLine( cCurMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv );
+              xCheckAndClipMvToFppLine( cTarMvField.mv, cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv );
             }
             Distortion cost = xGetSymCost( cu, origBuf, eCurRefList, cCurMvField, cTarMvField, BcwIdx );
             if ( cost < costStart )
@@ -1507,9 +1507,9 @@ bool InterSearch::predInterSearch(CodingUnit& cu, Partitioner& partitioner, doub
 
         // save results
         if ( symCost < uiCostBi  
-          && ( !m_pcEncCfg->m_fppLinesSynchro || 
-          ( CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cCurMvField.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) &&
-            CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTarMvField.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) ) )          
+          && ( !m_pcEncCfg->m_ifpLines || 
+          ( CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cCurMvField.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) &&
+            CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTarMvField.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) ) )          
           )
         {
           uiCostBi = symCost;
@@ -1826,7 +1826,7 @@ void InterSearch::xEstimateMvPredAMVP( CodingUnit& cu, CPelUnitBuf& origBuf, Ref
   for( i = 0 ; i < pcAMVPInfo->numCand; i++)
   {
     Mv mvCand = pcAMVPInfo->mvCand[i];
-    if( m_pcEncCfg->m_fppLinesSynchro )
+    if( m_pcEncCfg->m_ifpLines )
       xClipMvSearch( mvCand, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, true );
 
     Distortion uiTmpCost = xGetTemplateCost( cu, origBuf, predBuf, mvCand, i, AMVP_MAX_NUM_CANDS, refPicList, iRefIdx );
@@ -2055,7 +2055,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic
 
     Mv bestInitMv = (bBi ? rcMv : rcMvPred);
     Mv cTmpMv     = bestInitMv;
-    xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro );
+    xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines );
     cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT);
     m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor;
     Distortion uiBestSad = m_cDistParam.distFunc(m_cDistParam);
@@ -2080,7 +2080,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic
       if( j < i )
         continue;
 
-      xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro);
+      xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines);
       cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT);
       m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor;
 
@@ -2136,7 +2136,7 @@ void InterSearch::xMotionEstimation(CodingUnit& cu, CPelUnitBuf& origBuf, RefPic
   DTRACE(g_trace_ctx, D_ME, "   MECost<L%d,%d>: %6d (%d)  MV:%d,%d\n", (int)refPicList, (int)bBi, ruiCost, ruiBits, rcMv.hor << 2, rcMv.ver << 2);
 }
 
-void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int fppLinesSynchro )
+void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int ifpLines )
 {
   if( pcv.wrapArround )
   {
@@ -2147,9 +2147,9 @@ void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Siz
   int iHorMax = ( pcv.lumaWidth + iOffset - ( int ) pos.x - 1 ) << iMvShift;
   int iHorMin = ( -( int ) pcv.maxCUSize   - iOffset - ( int ) pos.x + 1 ) * (1 << iMvShift);
 
-  int maxLumaHeight = fppLinesSynchro && ((pos.y >> pcv.maxCUSizeLog2) + fppLinesSynchro + 1 < pcv.heightInCtus) ? 
+  int maxLumaHeight = ifpLines && ((pos.y >> pcv.maxCUSizeLog2) + ifpLines + 1 < pcv.heightInCtus) ? 
     
-    (((pos.y >> pcv.maxCUSizeLog2) + fppLinesSynchro + 1) << pcv.maxCUSizeLog2 ) - size.height - 4  // 4 samples from DCTIF vertical bottom part
+    (((pos.y >> pcv.maxCUSizeLog2) + ifpLines + 1) << pcv.maxCUSizeLog2 ) - size.height - 4  // 4 samples from DCTIF vertical bottom part
 
     : pcv.lumaHeight + iOffset;
 
@@ -2160,26 +2160,26 @@ void InterSearch::xClipMvSearch( Mv& rcMv, const Position& pos, const struct Siz
   rcMv.ver = ( std::min( iVerMax, std::max( iVerMin, rcMv.ver ) ) );
 }
 
-void InterSearch::xClipMvToFppLine( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv )
+void InterSearch::xClipMvToFppLine( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv )
 {
   const int yCompScale = 0;
   const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL;
   const int ctuLogScale = pcv.maxCUSizeLog2 - yCompScale;
-  const int yRefMax     = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1;
+  const int yRefMax     = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1;
   const int yRefMv      = yB + nH + ( 4 >> yCompScale ) + (mv.ver >> mvPrecShift) - 1;
   CHECKD( yRefMv <= yRefMax, "Not expected" );
   mv.ver -= ( yRefMv - yRefMax ) << mvPrecShift;
 }
 
-void InterSearch::xCheckAndClipMvToFppLine( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv )
+void InterSearch::xCheckAndClipMvToFppLine( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv )
 {
   const int yCompScale  = 0;
   const int mvPrecShift = MV_FRACTIONAL_BITS_INTERNAL;
   const int ctuLogScale = pcv.maxCUSizeLog2 - yCompScale;
-  const int yBMax       = ( pcv.heightInCtus - 1 - fppLinesSynchro ) << ctuLogScale;
+  const int yBMax       = ( pcv.heightInCtus - 1 - ifpLines ) << ctuLogScale;
   if( yB < yBMax )
   {
-    const int yRefMax = ( ( ( yB >> ctuLogScale ) + fppLinesSynchro + 1 ) << ctuLogScale ) - 1;
+    const int yRefMax = ( ( ( yB >> ctuLogScale ) + ifpLines + 1 ) << ctuLogScale ) - 1;
     const int yRefMv  = yB + nH + ( 4 >> yCompScale ) + (mv.ver >> mvPrecShift) - 1;
     if( yRefMv > yRefMax )
     {
@@ -2210,7 +2210,7 @@ void InterSearch::xSetSearchRange ( const CodingUnit& cu,
   else
   {
     clipMv( mvTL, cu.lumaPos(), cu.lumaSize(), pcv);
-    xClipMvSearch( mvBR, cu.lumaPos(), cu.lumaSize(), pcv, m_pcEncCfg->m_fppLinesSynchro );
+    xClipMvSearch( mvBR, cu.lumaPos(), cu.lumaSize(), pcv, m_pcEncCfg->m_ifpLines );
   }
 
   mvTL.divideByPowerOf2( iMvShift );
@@ -2343,7 +2343,7 @@ void InterSearch::xTZSearch( const CodingUnit& cu,
   const bool bNewZeroNeighbourhoodTest               = bExtendedSettings;
 
   int iSearchRange = m_iSearchRange;
-  xClipMvSearch( rcMv, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro );
+  xClipMvSearch( rcMv, cu.lumaPos(), cu.lumaSize(),*cu.cs->pcv, m_pcEncCfg->m_ifpLines );
   rcMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_QUARTER);
   rcMv.divideByPowerOf2(2);
 
@@ -2371,7 +2371,7 @@ void InterSearch::xTZSearch( const CodingUnit& cu,
     const BlkUniMvInfo* curMvInfo = m_BlkUniMvInfoBuffer->getBlkUniMvInfo(i);
     Mv cTmpMv = curMvInfo->uniMvs[refPicList][iRefIdxPred];
 
-    xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_fppLinesSynchro);
+    xClipMvSearch(cTmpMv, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_pcEncCfg->m_ifpLines);
     cTmpMv.changePrecision(MV_PRECISION_INTERNAL, MV_PRECISION_INT);
     m_cDistParam.cur.buf = cStruct.piRefY + (cTmpMv.ver * cStruct.iRefStride) + cTmpMv.hor;
 
@@ -2633,9 +2633,9 @@ void InterSearch::xPatternSearchIntRefine(CodingUnit& cu, TZSearchStruct&  cStru
       cTestMv[iMVPIdx] += cBaseMvd[iMVPIdx];
       cTestMv[iMVPIdx] += amvpInfo.mvCand[iMVPIdx];
 
-      if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTestMv[iMVPIdx].ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+      if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), cTestMv[iMVPIdx].ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
       {
-        xClipMvToFppLine( cTestMv[iMVPIdx], cu.ly(), cu.lheight(), m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv );
+        xClipMvToFppLine( cTestMv[iMVPIdx], cu.ly(), cu.lheight(), m_pcEncCfg->m_ifpLines, *cu.cs->pcv );
         cTestMv[iMVPIdx].roundTransPrecInternal2AmvrVertical(cu.imv);
       }
 
@@ -2846,7 +2846,7 @@ Distortion InterSearch::xSymRefineMvSearch( CodingUnit& cu, CPelUnitBuf& origBuf
       mvOffset <<= nSearchStepShift;
       MvField mvCand = mvCurCenter, mvPair;
       mvCand.mv += mvOffset;
-      if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvCand.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+      if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvCand.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
       {
         continue; // Skip this pos
       }
@@ -2865,7 +2865,7 @@ Distortion InterSearch::xSymRefineMvSearch( CodingUnit& cu, CPelUnitBuf& origBuf
       mvPair.refIdx = rTarMvField.refIdx;
       mvPair.mv.set( rcMvTarPred.hor - (mvCand.mv.hor - rcMvCurPred.hor), rcMvTarPred.ver - (mvCand.mv.ver - rcMvCurPred.ver) );
 
-      if( m_pcEncCfg->m_fppLinesSynchro && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvPair.mv.ver, m_pcEncCfg->m_fppLinesSynchro, *cu.cs->pcv ) )
+      if( m_pcEncCfg->m_ifpLines && !CU::isMvInRangeFPP( cu.ly(), cu.lheight(), mvPair.mv.ver, m_pcEncCfg->m_ifpLines, *cu.cs->pcv ) )
       {
         continue; // Skip this pos
       }
@@ -4475,7 +4475,7 @@ void InterSearch::xSymMvdCheckBestMvp(
   PelUnitBuf predBufA = m_tmpPredStorage[curRefList].getCompactBuf( cu );
   const Picture* picRefA = cu.slice->getRefPic(curRefList, cCurMvField.refIdx);
   Mv mvA = cCurMvField.mv;
-  xClipMvSearch( mvA, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_fppLinesSynchro );
+  xClipMvSearch( mvA, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_ifpLines );
   xPredInterBlk( COMP_Y, cu, picRefA, mvA, predBufA, false, cu.slice->clpRngs[ COMP_Y ], false, false );
 
   bufTmp = m_tmpStorageLCU.getBuf( UnitAreaRelative( cu, cu ) );
@@ -4501,7 +4501,7 @@ void InterSearch::xSymMvdCheckBestMvp(
       PelUnitBuf predBufB = m_tmpPredStorage[tarRefList].getCompactBuf( cu );
       const Picture* picRefB = cu.slice->getRefPic(tarRefList, cTarMvField.refIdx);
       Mv mvB = cTarMvField.mv;
-      xClipMvSearch( mvB, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_fppLinesSynchro );
+      xClipMvSearch( mvB, cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv, m_ifpLines );
       xPredInterBlk( COMP_Y, cu, picRefB, mvB, predBufB, false, cu.slice->clpRngs[ COMP_Y ], false, false );
 
       // calc distortion
@@ -4948,7 +4948,7 @@ void InterSearch::xPredAffineInterSearch( CodingUnit& cu,
       ::memcpy(tmp.affMVs[1][bestBiPRefIdxL1], pcMvTemp, sizeof(Mv) * 3);
       iRefIdxBi[1] = bestBiPRefIdxL1;
 
-      if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, pcMvTemp, m_pcEncCfg->m_fppLinesSynchro ) )
+      if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, pcMvTemp, m_pcEncCfg->m_ifpLines ) )
       {
         // this mvp cannot be used for mv, skip Bi-pred
         uiCostBi = MAX_DISTORTION;
@@ -5020,7 +5020,7 @@ void InterSearch::xPredAffineInterSearch( CodingUnit& cu,
         // First iterate, get prediction block of opposite direction
         if (iIter == 0 && !slice.picHeader->mvdL1Zero)
         {
-          if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, aacMv[1 - iRefList], m_pcEncCfg->m_fppLinesSynchro ) )
+          if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, aacMv[1 - iRefList], m_pcEncCfg->m_ifpLines ) )
           {
             continue;
           }
@@ -5226,7 +5226,7 @@ Distortion InterSearch::xGetAffineTemplateCost(CodingUnit& cu, CPelUnitBuf& orig
   Mv mv[3];
   memcpy(mv, acMvCand, sizeof(mv));
 
-  if( m_pcEncCfg->m_fppLinesSynchro && !xIsAffineMvInRangeFPP( cu, mv, m_pcEncCfg->m_fppLinesSynchro ) )
+  if( m_pcEncCfg->m_ifpLines && !xIsAffineMvInRangeFPP( cu, mv, m_pcEncCfg->m_ifpLines ) )
   {
     return MAX_DISTORTION>>1;  
   }
@@ -5451,7 +5451,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
   {
     acMvTemp[2].roundAffinePrecInternal2Amvr(cu.imv);
   }
-  if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) )
+  if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) )
   {
     xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.cs->slice->clpRngs[COMP_Y], refPicList);
 
@@ -5589,7 +5589,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
       clipMv(acMvTemp[i], cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv);
     }
 
-    if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) )
+    if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) )
     {
       xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList);
 
@@ -5620,7 +5620,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
 
   auto checkCPMVRdCost = [&](Mv ctrlPtMv[3])
   {
-    if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, ctrlPtMv, m_pcEncCfg->m_fppLinesSynchro ) )
+    if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, ctrlPtMv, m_pcEncCfg->m_ifpLines ) )
     {
       xPredAffineBlk(COMP_Y, cu, refPic, ctrlPtMv, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList);
       // get error
@@ -5713,7 +5713,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
             acMvTemp[j].set(centerMv[j].hor + (testPos[i][0] * (1 << mvShift)), centerMv[j].ver + (testPos[i][1] * (1 << mvShift)));
             clipMv(acMvTemp[j], cu.lumaPos(), cu.lumaSize(), *cu.cs->pcv);
 
-            if( !m_pcEncCfg->m_fppLinesSynchro || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_fppLinesSynchro ) )
+            if( !m_pcEncCfg->m_ifpLines || xIsAffineMvInRangeFPP( cu, acMvTemp, m_pcEncCfg->m_ifpLines ) )
             {
               xPredAffineBlk(COMP_Y, cu, refPic, acMvTemp, predBuf, false, cu.slice->clpRngs[COMP_Y], refPicList);
 
diff --git a/source/Lib/EncoderLib/InterSearch.h b/source/Lib/EncoderLib/InterSearch.h
index 6e6b4c21f..741fa9282 100644
--- a/source/Lib/EncoderLib/InterSearch.h
+++ b/source/Lib/EncoderLib/InterSearch.h
@@ -513,10 +513,10 @@ class InterSearch : public InterPrediction, AffineGradientSearch
                                     const bool            bFastSettings = false
                                   );
 
-  void xClipMvSearch              ( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int fppLinesSynchro );
+  void xClipMvSearch              ( Mv& rcMv, const Position& pos, const struct Size& size, const PreCalcValues& pcv, const int ifpLines );
 
-  void xClipMvToFppLine           ( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv );
-  void xCheckAndClipMvToFppLine   ( Mv& mv, const int yB, const int nH, const int fppLinesSynchro, const PreCalcValues& pcv );
+  void xClipMvToFppLine           ( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv );
+  void xCheckAndClipMvToFppLine   ( Mv& mv, const int yB, const int nH, const int ifpLines, const PreCalcValues& pcv );
   void xSetSearchRange            ( const CodingUnit& cu,
                                     const Mv&             cMvPred,
                                     const int             iSrchRng,
diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp
index e7b6a69a2..e111f71d1 100644
--- a/source/Lib/EncoderLib/PreProcess.cpp
+++ b/source/Lib/EncoderLib/PreProcess.cpp
@@ -75,7 +75,7 @@ PreProcess::~PreProcess()
 
 void PreProcess::init( const VVEncCfg& encCfg, bool isFinalPass )
 {
-  m_gopCfg.initGopList( encCfg.m_DecodingRefreshType, encCfg.m_poc0idr, encCfg.m_IntraPeriod, encCfg.m_GOPSize, encCfg.m_leadFrames, encCfg.m_picReordering, encCfg.m_GOPList, encCfg.m_vvencMCTF, encCfg.m_FirstPassMode );
+  m_gopCfg.initGopList( encCfg.m_DecodingRefreshType, encCfg.m_poc0idr, encCfg.m_IntraPeriod, encCfg.m_GOPSize, encCfg.m_leadFrames, encCfg.m_picReordering, encCfg.m_GOPList, encCfg.m_vvencMCTF, encCfg.m_FirstPassMode, encCfg.m_minIntraDist );
   CHECK( m_gopCfg.getMaxTLayer() != encCfg.m_maxTLayer, "max temporal layer of gop configuration does not match pre-configured value" );
 
   m_encCfg      = &encCfg;
@@ -389,10 +389,11 @@ void PreProcess::xDetectSTA( Picture* pic, const PicList& picList )
 {
   const Picture* prevTl0 = xGetPrevTl0Pic( pic, picList );
 
-  int picMemorySTA = 0;
-  bool isSta       = false;
-
-  if( prevTl0 && prevTl0->picVisActTL0 > 0 )
+  int picMemorySTA  = 0;
+  bool isSta        = false;
+  bool intraAllowed = m_gopCfg.isSTAallowed( pic->poc );
+  
+  if( prevTl0 && prevTl0->picVisActTL0 > 0 && intraAllowed )
   {
     const int scThreshold = ( ( pic->isSccStrong ? 6 : ( pic->isSccWeak ? 5 : 4 ) ) * ( m_isHighRes ? 19 : 15 ) ) >> 2;
 
@@ -412,6 +413,7 @@ void PreProcess::xDetectSTA( Picture* pic, const PicList& picList )
     picShared->m_picMemorySTA         = picMemorySTA;
     picShared->m_gopEntry.m_sliceType = 'I';
     picShared->m_gopEntry.m_scType    = SCT_TL0_SCENE_CUT;
+    m_gopCfg.setLastIntraSTA( pic->poc );
 
     if( m_encCfg->m_sliceTypeAdapt == 2 )
     {
diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h
index 7f28a2936..050318368 100644
--- a/source/Lib/apputils/VVEncAppCfg.h
+++ b/source/Lib/apputils/VVEncAppCfg.h
@@ -390,6 +390,14 @@ const std::vector<SVPair<int>> BitrateOrScaleAbrevToIntMap =
   { "x",                -16 }   // negative value: multiplier of target bitrate, with a fixed-point accuracy of 4 bit
 };
 
+const std::vector<SVPair<bool>> IfpToValueMap = // strings accepted by the "ifp" option, parsed into the boolean m_ifp
+{
+  { "0",   false },
+  { "off", false },
+  { "1",   true  },
+  { "on",  true  },
+};
+
 //// ====================================================================================================================
 //// string <-> enum
 //// ====================================================================================================================
@@ -547,7 +555,9 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
 
   IStreamToInt8                     toSliceTypeAdapt              ( &c->m_sliceTypeAdapt );
   IStreamToInt8                     toSelectiveRDOQ               ( &c->m_useSelectiveRDOQ );
-  IStreamToInt8                     toFppLinesSynchro             ( &c->m_fppLinesSynchro );
+  IStreamToInt8                     toForceScc                    ( &c->m_forceScc );
+  IStreamToInt8                     toIfpLines                    ( &c->m_ifpLines );
+  IStreamToEnum<bool>               toUseIfp                      ( &c->m_ifp, &IfpToValueMap );
 
   po::Options opts;
   if( m_easyMode )
@@ -635,7 +645,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("rcstatsfile",                                     m_RCStatsFileName,                                   "rate control statistics file name")
     ("qp,q",                                            c->m_QP,                                             "quantization parameter, QP (0, 1, .. 63)")
     ("qpa",                                             toQPA,                                               "enable perceptually motivated QP adaptation based on XPSNR model (0: off, 1: on)", true)
-    ("threads,t",                                       c->m_numThreads,                                     "number of threads (multithreading; -1: resolution < 720p: 4, >= 720p: 8 threads)")
+    ("threads,t",                                       c->m_numThreads,                                     "number of threads (multithreading; -1: resolution < 720p: 4, < 5K 2880p: 8, >= 5K 2880p: 12 threads)")
+    ("ifp",                                             toUseIfp,                                            "inter-frame parallelization(IFP) (0: off, 1: on, with sync. offset of two CTU lines)")
     ("refreshtype,-rt",                                 toDecRefreshType,                                    "intra refresh type (idr, cra, cra_cre: CRA, constrained RASL picture encoding)")
     ("refreshsec,-rs",                                  c->m_IntraPeriodSec,                                 "intra period/refresh in seconds")
     ("intraperiod,-ip",                                 c->m_IntraPeriod,                                    "intra period in frames (0: specify intra period in seconds instead, see -refreshsec)")
@@ -646,7 +657,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
   {
     opts.setSubSection("Threading, performance");
     opts.addOptions()
-    ("Threads,t",                                       c->m_numThreads,                                     "Number of threads")
+    ("Threads,t",                                       c->m_numThreads,                                     "number of threads (multithreading; -1: resolution < 720p: 4, < 5K 2880p: 8, >= 5K 2880p: 12 threads)")
     ("preset",                                          toPreset,                                            "select preset for specific encoding setting (faster, fast, medium, slow, slower, medium_lowDecEnergy)")
     ("Tiles",                                           toNumTiles,                                          "Set number of tile columns and rows")
     ;
@@ -672,6 +683,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("MaxBitrate",                                      toMaxRate,                                           "Rate control: approximate maximum instantaneous bitrate [bits/second] (0: no rate cap; least constraint)" )
     ("PerceptQPA,-qpa",                                 c->m_usePerceptQPA,                                  "Enable perceptually motivated QP adaptation, XPSNR based (0:off, 1:on)", true)
     ("STA",                                             toSliceTypeAdapt,                                    "Enable slice type adaptation at GOPSize>8 (-1: auto, 0: off, 1: adapt slice type, 2: adapt NAL unit type)")
+    ("MinIntraDistance",                                c->m_minIntraDist,                                   "With STA: set a minimum coded frame distance to the previous intra frame (-1: GOPSize)" )
     ;
 
     opts.setSubSection("Quantization parameters");
@@ -858,7 +870,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("AddGOP32refPics",                                 c->m_addGOP32refPics,                                "Use different QP offsets and reference pictures in GOP structure")
     ("NumRefPics",                                      c->m_numRefPics,                                     "Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)" )
     ("NumRefPicsSCC",                                   c->m_numRefPicsSCC,                                  "Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics)" )
-    ("ForceSCC",                                        c->m_forceScc,                                       "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" )
+    ("ForceSCC",                                        toForceScc,                                          "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" )
     ;
 
     opts.setSubSection("Low-level QT-BTT partitioning options");
@@ -1071,7 +1083,9 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("TileColumnWidthArray",                            toTileColumnWidth,                                   "Tile column widths in units of CTUs. Last column width in list will be repeated uniformly to cover any remaining picture width")
     ("TileRowHeightArray",                              toTileRowHeight,                                     "Tile row heights in units of CTUs. Last row height in list will be repeated uniformly to cover any remaining picture height")
     ("TileParallelCtuEnc",                              c->m_tileParallelCtuEnc,                             "Allow parallel CTU block search in different tiles")
-    ("FppLinesSynchro",                                 toFppLinesSynchro,                                   "(experimental) Number of CTU-lines synchronization due to MV restriction for FPP mode")
+    ("FppLinesSynchro",                                 toIfpLines,                                          "(deprecated) Inter-Frame Parallelization(IFP) explicit CTU-lines synchronization offset (-1: default mode with two lines, 0: off)")
+    ("IFPLines",                                        toIfpLines,                                          "Inter-Frame Parallelization(IFP) explicit CTU-lines synchronization offset (-1: default mode with two lines, 0: off)")
+    ("IFP",                                             toUseIfp,                                            "Inter-Frame Parallelization(IFP) (0: off, 1: on, with default setting of IFPLines)")
     ;
 
     opts.setSubSection("Coding tools");
diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp
index a6e814e62..89cebf119 100644
--- a/source/Lib/vvenc/vvencCfg.cpp
+++ b/source/Lib/vvenc/vvencCfg.cpp
@@ -260,6 +260,16 @@ static inline std::string vvenc_getDecodingRefreshTypeStr(  int type, bool poc0i
   return cType;
 }
 
+// Default worker-thread count, chosen from the smaller of the two source dimensions.
+static inline int getNumThreadsDefault( vvenc_config *c )
+{
+  const int shorterSide = std::min( c->m_SourceWidth, c->m_SourceHeight );
+  // check the larger threshold first so each size lands in exactly one tier
+  if( shorterSide >= 2880 ) return 12;
+  if( shorterSide >= 720 )  return 8;
+  return 4;
+}
+
 VVENC_DECL void vvenc_GOPEntry_default(vvencGOPEntry *GOPEntry )
 {
   GOPEntry->m_POC                       = -1;
@@ -381,6 +391,7 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c )
 
   c->m_usePerceptQPA                           = false;         ///< perceptually motivated input-adaptive QP modification, abbrev. perceptual QP adaptation (QPA)
   c->m_sliceTypeAdapt                          = -1;            ///< perceptually and objectively motivated slice type adaptation (STA)
+  c->m_minIntraDist                            = -1;
 
   c->m_RCNumPasses                             = -1;
   c->m_RCPass                                  = -1;
@@ -652,7 +663,8 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c )
   c->m_maxParallelFrames                       = -1;
   c->m_ensureWppBitEqual                       = -1;
   c->m_tileParallelCtuEnc                      = true;
-  c->m_fppLinesSynchro                         = 0;
+  c->m_ifpLines                                = -1;
+  c->m_ifp                                     = false;
 
   c->m_picPartitionFlag                        = false;
   memset( c->m_tileColumnWidth, 0, sizeof(c->m_tileColumnWidth) );
@@ -690,6 +702,7 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c )
   c->m_forceScc                                = 0;
 
   c->m_reservedFlag                            = false;
+  c->m_reservedInt                             = 0;
   memset( c->m_reservedDouble, 0, sizeof(c->m_reservedDouble) );
 
   // init default preset
@@ -756,6 +769,7 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
   vvenc_confirmParameter( c, c->m_leadFrames < 0 || c->m_leadFrames > VVENC_MAX_GOP,                     "Lead frames exceeds supported range (0 to 64)" );
   vvenc_confirmParameter( c, c->m_trailFrames < 0 || c->m_trailFrames > VVENC_MCTF_RANGE,                "Trail frames exceeds supported range (0 to 4)" );
   vvenc_confirmParameter( c, c->m_sliceTypeAdapt < -1 || c->m_sliceTypeAdapt > 2,                        "Slice type adaptation (STA) invalid parameter given, range is (-1 .. 2)" );
+  vvenc_confirmParameter( c, c->m_minIntraDist < -1,                                                     "Minimum intra distance cannot be smaller than -1" );
 
   if( VVENC_RC_OFF == c->m_RCTargetBitrate )
   {
@@ -796,6 +810,12 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
   //
 
   vvenc::MsgLog msg(c->m_msgCtx, c->m_msgFnc);
+#if !IFP_RC_DETERMINISTIC
+  if( c->m_RCTargetBitrate != 0 && c->m_ifp )
+  {
+    msg.log( VVENC_WARNING, "Using RC with IFP. Results are non-deterministic!\n" );
+  }
+#endif
 
   if( c->m_FirstPassMode > 2 && c->m_RCTargetBitrate != 0 )
   {
@@ -906,13 +926,13 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
   if( c->m_numThreads < 0 )
   {
     const int numCores = std::thread::hardware_concurrency();
-    c->m_numThreads = std::min( c->m_SourceWidth, c->m_SourceHeight ) < 720 ? 4 : 8;
+    c->m_numThreads = getNumThreadsDefault( c );
     c->m_numThreads = std::min( c->m_numThreads, numCores );
   }
   if( c->m_ensureWppBitEqual < 0 )       c->m_ensureWppBitEqual     = c->m_numThreads ?      1   : 0   ;
   if( c->m_useAMaxBT < 0 )               c->m_useAMaxBT             = c->m_numThreads ?      0   : 1   ;
   if( c->m_cabacInitPresent < 0 )        c->m_cabacInitPresent      = c->m_numThreads ?      0   : 1   ;
-  if( c->m_alfTempPred < 0 )             c->m_alfTempPred           = c->m_fppLinesSynchro ? 0   : 1   ;
+  if( c->m_alfTempPred < 0 )             c->m_alfTempPred           = c->m_ifp        ?      0   : 1   ;
   if( c->m_saoEncodingRate < 0.0 )       c->m_saoEncodingRate       = c->m_numThreads ?      0.0 : 0.75;
   if( c->m_saoEncodingRateChroma < 0.0 ) c->m_saoEncodingRateChroma = c->m_numThreads ?      0.0 : 0.5 ;
   if( c->m_maxParallelFrames < 0 )
@@ -920,6 +940,12 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
     c->m_maxParallelFrames = std::min( c->m_numThreads, 4 );
   }
 
+  if( c->m_ifpLines > 0 && !c->m_ifp )
+  {
+    msg.log( VVENC_WARNING, "Given IFPLines=%d, but IFP is not enabled, reseting IFPLines to 0.\n", c->m_ifpLines );
+  }
+  c->m_ifpLines = !c->m_ifp ? 0: (c->m_ifpLines == -1 ? 2: c->m_ifpLines);
+
   if( c->m_alfUnitSize < 0 )
     c->m_alfUnitSize = c->m_CTUSize;
 
@@ -1328,6 +1354,20 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
   }
   vvenc_confirmParameter( c, c->m_GOPSize <= 8 && c->m_sliceTypeAdapt > 0, "Slice type adaptation for GOPSize <= 8 not supported" );
 
+  if( c->m_minIntraDist < 0 )
+  {
+    if( c->m_sliceTypeAdapt > 0 )
+    {
+      c->m_minIntraDist = std::min( c->m_GOPSize, c->m_IntraPeriod );
+    }
+    else
+    {
+      c->m_minIntraDist = 0;
+    }
+  }
+  vvenc_confirmParameter( c, c->m_minIntraDist > 0 && c->m_sliceTypeAdapt == 0,               "STA: Setting a minimal intra distance only works with slice type adaptation enabled" );
+  vvenc_confirmParameter( c, c->m_minIntraDist > c->m_IntraPeriod && c->m_sliceTypeAdapt > 0, "STA: Minimal intra distance can not be larger than intra period" );
+
   // set number of lead / trail frames in segment mode
   const int staFrames  = c->m_sliceTypeAdapt                       ? c->m_GOPSize     : 0;
   const int mctfFrames = c->m_vvencMCTF.MCTF || c->m_usePerceptQPA ? VVENC_MCTF_RANGE : 0;
@@ -2033,9 +2073,18 @@ static bool checkCfgParameter( vvenc_config *c )
     vvenc_confirmParameter(c, c->m_traceFile[0] != '\0' && c->m_maxParallelFrames > 1 && c->m_numThreads > 1, "Tracing and frame parallel encoding not supported" );
 #endif
     vvenc_confirmParameter(c, c->m_maxParallelFrames > c->m_GOPSize && c->m_GOPSize != 1, "Max parallel frames should be less then GOP size" );
-    vvenc_confirmParameter(c, c->m_fppLinesSynchro && c->m_alfTempPred != 0, "FPP CTU-lines synchro: ALFTempPred is not supported (must be disabled)" );
-    vvenc_confirmParameter(c, c->m_fppLinesSynchro && c->m_numTileRows > 1,  "FPP CTU-lines synchro: Only single tile row is supported" );
-    vvenc_confirmParameter(c, c->m_fppLinesSynchro < 0, "fppLinesSynchro must be >= 0" );
+    vvenc_confirmParameter(c, c->m_ifpLines && c->m_alfTempPred != 0, "IFP: ALFTempPred is not supported (must be disabled)" );
+    vvenc_confirmParameter(c, c->m_ifpLines && c->m_numTileRows > 1,  "IFP: Only single tile row is supported" );
+    vvenc_confirmParameter(c, c->m_ifpLines < 0, "IFPLines must be >= 0" );
+    vvenc_confirmParameter(c, c->m_ifp && c->m_ifpLines == 0, "IFP requires IFPLines=[-1 or >0]" );
+  }
+  if( c->m_ifpLines )
+  {
+    const int minNumThreadsIfp = getNumThreadsDefault( c ) * 2;
+    if( c->m_numThreads < minNumThreadsIfp )
+    {
+      msg.log( VVENC_WARNING, "Using IFP at low number of threads (<%d) does not provide more speedup, consider disabling IFP.\n", minNumThreadsIfp );
+    }
   }
 
   vvenc_confirmParameter(c, c->m_explicitAPSid < 0 || c->m_explicitAPSid > 7, "ExplicitAPDid out of range [0 .. 7]" );
@@ -2339,7 +2388,7 @@ VVENC_DECL int vvenc_init_default( vvenc_config *c, int width, int height, int f
   c->m_RCMaxBitrate        = 0;                        // maximum instantaneous bitrate in bps
 
   c->m_numThreads          = -1;                       // number of worker threads (-1: auto, 0: off, else set worker threads)
-  
+
   iRet = vvenc_init_preset( c, preset );
   return iRet;
 }
@@ -2980,6 +3029,12 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve
       }
       else
         css << "single-pass";
+#if !IFP_RC_DETERMINISTIC
+      if( c->m_ifpLines && c->m_numThreads > 1 )
+      {
+        css << " (non-deterministic due to IFP)";
+      }
+#endif
     }
     else
     {
@@ -3220,7 +3275,7 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve
     css << "\n" << loglvl << "PARALLEL PROCESSING CFG: ";
     css << "NumThreads:" << c->m_numThreads << " ";
     css << "MaxParallelFrames:" << c->m_maxParallelFrames << " ";
-    css << "FppLinesSynchro:" << ( int ) c->m_fppLinesSynchro << " ";
+    css << "IFP:" << (c->m_ifp ? 1: 0) << " (IFPLines:" << (int)c->m_ifpLines << ")" << " ";
     if( c->m_picPartitionFlag )
     {
       css << "TileParallelCtuEnc:" << c->m_tileParallelCtuEnc << " ";
diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp
index 3eb4f5758..d791f8b2e 100644
--- a/source/Lib/vvenc/vvencimpl.cpp
+++ b/source/Lib/vvenc/vvencimpl.cpp
@@ -70,6 +70,9 @@ POSSIBILITY OF SUCH DAMAGE.
 #  include <malloc.h>
 #endif
 
+#if defined( TARGET_SIMD_ARM )
+#  include "CommonLib/arm/CommonDefARM.h"
+#endif
 
 #if _DEBUG
 #define HANDLE_EXCEPTION 0
@@ -796,6 +799,9 @@ const char* VVEncImpl::setSIMDExtension( const char* simdId )
     try
     {
       read_x86_extension_flags( request_ext );
+#if defined( TARGET_SIMD_ARM )
+      read_arm_extension_flags( request_ext == x86_simd::UNDEFINED ? arm_simd::UNDEFINED : request_ext != x86_simd::SCALAR ? arm_simd::NEON : arm_simd::SCALAR );
+#endif
     }
     catch( Exception& )
     {