diff --git a/Makefile b/Makefile
index 5641208d6..d0da05c8e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-# VVCEnc/Makefile
+# VVenC/Makefile
 #
 # How to build a single target:
 #  make <project>-r  => build variant=release
@@ -163,22 +163,22 @@ DEFAULT_BUILD_TARGETS_SHARED := $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),$(t)
 DEFAULT_BUILD_TARGETS := $(DEFAULT_BUILD_TARGETS_STATIC) $(DEFAULT_BUILD_TARGETS_SHARED)
 
 
-release: $(BUILD_DIR-release)
+release: $(BUILD_DIR-release)/CMakeCache.txt  # prerequisite is the cache file inside the build dir, not the directory itself
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
-debug: $(BUILD_DIR-debug)
+debug: $(BUILD_DIR-debug)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
-relwithdebinfo: $(BUILD_DIR-relwithdebinfo)
+relwithdebinfo: $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
-release-shared: $(BUILD_DIR-release-shared)
+release-shared: $(BUILD_DIR-release-shared)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
-debug-shared: $(BUILD_DIR-debug-shared)
+debug-shared: $(BUILD_DIR-debug-shared)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
-relwithdebinfo-shared: $(BUILD_DIR-relwithdebinfo-shared)
+relwithdebinfo-shared: $(BUILD_DIR-relwithdebinfo-shared)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS)
 
 $(foreach t,$(DEFAULT_BUILD_TARGETS),clean-$(t)):
@@ -204,37 +204,37 @@ install-relwithdebinfo-shared: relwithdebinfo-shared
 
 
 ifeq ($(CMAKE_MCONFIG),)
-$(BUILD_DIR-release) configure-release:
+$(BUILD_DIR-release)/CMakeCache.txt configure-release:  # two targets, one recipe: the cache file (used as prerequisite) and a name for configuring on demand
 	cmake -S . -B $(BUILD_DIR-release) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Release
 
-$(BUILD_DIR-debug) configure-debug:
+$(BUILD_DIR-debug)/CMakeCache.txt configure-debug:
 	cmake -S . -B $(BUILD_DIR-debug) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Debug
 
-$(BUILD_DIR-relwithdebinfo) configure-relwithdebinfo:
+$(BUILD_DIR-relwithdebinfo)/CMakeCache.txt configure-relwithdebinfo:
 	cmake -S . -B $(BUILD_DIR-relwithdebinfo) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=RelWithDebInfo
 
-$(BUILD_DIR-release-shared) configure-release-shared:
+$(BUILD_DIR-release-shared)/CMakeCache.txt configure-release-shared:
 	cmake -S . -B $(BUILD_DIR-release-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=1
 
-$(BUILD_DIR-debug-shared) configure-debug-shared:
+$(BUILD_DIR-debug-shared)/CMakeCache.txt configure-debug-shared:
 	cmake -S . -B $(BUILD_DIR-debug-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=1
 
-$(BUILD_DIR-relwithdebinfo-shared) configure-relwithdebinfo-shared:
+$(BUILD_DIR-relwithdebinfo-shared)/CMakeCache.txt configure-relwithdebinfo-shared:
 	cmake -S . -B $(BUILD_DIR-relwithdebinfo-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_SHARED_LIBS=1
 
 configure-static: $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t))
 configure-shared: $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t))
 
 else
-$(BUILD_DIR_STATIC) configure-static $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t)):
+$(BUILD_DIR_STATIC)/CMakeCache.txt configure-static $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t)):
 	cmake -S . -B $(BUILD_DIR_STATIC) $(CONFIG_OPTIONS)
 
-$(BUILD_DIR_SHARED) configure-shared $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t)):
+$(BUILD_DIR_SHARED)/CMakeCache.txt configure-shared $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t)):
 	cmake -S . -B $(BUILD_DIR_SHARED) $(CONFIG_OPTIONS) -DBUILD_SHARED_LIBS=1
 endif
 
 static: $(DEFAULT_BUILD_TARGETS_STATIC)
-shared: $(DEFAULT_BUILD_TARGETS_SHARED) 
+shared: $(DEFAULT_BUILD_TARGETS_SHARED)
 
 all: static shared
 
@@ -303,26 +303,24 @@ TARGETS_RELEASE_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cr)
 TARGETS_DEBUG_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cd)
 TARGETS_RELWITHDEBINFO_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cp)
 
-$(TARGETS_RELEASE): $(BUILD_DIR-release)
+$(TARGETS_RELEASE): $(BUILD_DIR-release)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-release) $(BUILD_JOBS) --target $(patsubst %-r,%,$@) $(BUILD_TOOL_OPTIONS)
 
-$(TARGETS_RELEASE_CLEAN_FIRST): $(BUILD_DIR-release)
+$(TARGETS_RELEASE_CLEAN_FIRST): $(BUILD_DIR-release)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-release) $(BUILD_JOBS) --clean-first --target $(patsubst %-cr,%,$@) $(BUILD_TOOL_OPTIONS)
 
-$(TARGETS_DEBUG): $(BUILD_DIR-debug)
+$(TARGETS_DEBUG): $(BUILD_DIR-debug)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-debug) $(BUILD_JOBS) --target $(patsubst %-d,%,$@) $(BUILD_TOOL_OPTIONS)
 
-$(TARGETS_DEBUG_CLEAN_FIRST): $(BUILD_DIR-debug)
+$(TARGETS_DEBUG_CLEAN_FIRST): $(BUILD_DIR-debug)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-debug) $(BUILD_JOBS) --clean-first --target $(patsubst %-cd,%,$@) $(BUILD_TOOL_OPTIONS)
 
-$(TARGETS_RELWITHDEBINFO): $(BUILD_DIR-relwithdebinfo)
+$(TARGETS_RELWITHDEBINFO): $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-relwithdebinfo) $(BUILD_JOBS) --target $(patsubst %-p,%,$@) $(BUILD_TOOL_OPTIONS)
 
-$(TARGETS_RELWITHDEBINFO_CLEAN_FIRST): $(BUILD_DIR-relwithdebinfo)
+$(TARGETS_RELWITHDEBINFO_CLEAN_FIRST): $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt
 	cmake $(BUILD_OPTIONS-relwithdebinfo) $(BUILD_JOBS) --clean-first --target $(patsubst %-cp,%,$@) $(BUILD_TOOL_OPTIONS)
 
-.PHONY: install
+.PHONY: install clean realclean distclean all static shared $(DEFAULT_BUILD_TARGETS)  # command names, not files: run them even if a same-named file or directory exists
 
-ifeq ($(OS),Windows_NT)
 .NOTPARALLEL:
-endif
diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h
index 9cc4506aa..16c3c5a58 100644
--- a/include/vvenc/vvencCfg.h
+++ b/include/vvenc/vvencCfg.h
@@ -498,8 +498,8 @@ typedef struct vvenc_config
   int                 m_cfgUnused4[ 7 ];                                                 // TODO: remove unused memory from configuration
   int                 m_cfgUnused5[ 7 ];
   int                 m_cfgUnused6;
-  int                 m_cfgUnused7;
-  int                 m_cfgUnused8;
+  int                 m_maxPicWidth;
+  int                 m_maxPicHeight;
 
   bool                m_useSameChromaQPTables;
   vvencChromaQpMappingTableParams m_chromaQpMappingTableParams;
diff --git a/source/App/vvencFFapp/EncApp.cpp b/source/App/vvencFFapp/EncApp.cpp
index 74bf1b527..a6a449d45 100644
--- a/source/App/vvencFFapp/EncApp.cpp
+++ b/source/App/vvencFFapp/EncApp.cpp
@@ -305,6 +305,7 @@ int EncApp::encode()
 
     apputils::Stats cStats;
     int64_t frameCount =  apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput );
+    frameCount = std::max<int64_t>( 0, frameCount-appCfg.m_FrameSkip ); // frames covered by m_FrameSkip do not count towards the total; clamp at 0 if the skip exceeds the input length
     int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvencCfg.m_framesToBeEncoded;
     cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " );
     bool statsInfoReady = false;
diff --git a/source/App/vvencapp/vvencapp.cpp b/source/App/vvencapp/vvencapp.cpp
index bf86041b6..19612ab1a 100644
--- a/source/App/vvencapp/vvencapp.cpp
+++ b/source/App/vvencapp/vvencapp.cpp
@@ -353,6 +353,7 @@ int main( int argc, char* argv[] )
     }
 
     int64_t frameCount =  apputils::VVEncAppCfg::getFrameCount( vvencappCfg.m_inputFileName, vvenccfg.m_SourceWidth, vvenccfg.m_SourceHeight, vvenccfg.m_inputBitDepth[0], vvencappCfg.m_packedYUVInput );
+    frameCount = std::max<int64_t>( 0, frameCount-vvencappCfg.m_FrameSkip ); // frames covered by m_FrameSkip do not count towards the total; clamp at 0 if the skip exceeds the input length
     int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded;
 
     apputils::Stats cStats;
diff --git a/source/Lib/CommonLib/AffineGradientSearch.cpp b/source/Lib/CommonLib/AffineGradientSearch.cpp
index 435c62aa9..52acd1c1d 100644
--- a/source/Lib/CommonLib/AffineGradientSearch.cpp
+++ b/source/Lib/CommonLib/AffineGradientSearch.cpp
@@ -64,8 +64,9 @@ namespace vvenc {
   AffineGradientSearch::AffineGradientSearch()
   {
     m_HorizontalSobelFilter = xHorizontalSobelFilter;
-    m_VerticalSobelFilter = xVerticalSobelFilter;
-    m_EqualCoeffComputer = xEqualCoeffComputer;
+    m_VerticalSobelFilter   = xVerticalSobelFilter;
+    m_EqualCoeffComputer[0] = xEqualCoeffComputer<false>; // table is indexed by b6Param: [0] is the 4-parameter variant
+    m_EqualCoeffComputer[1] = xEqualCoeffComputer<true>;  // [1] is the 6-parameter variant
 
 #if ENABLE_SIMD_OPT_AFFINE_ME
 #ifdef TARGET_SIMD_X86
@@ -74,7 +75,7 @@ namespace vvenc {
 #endif
   }
 
-  void AffineGradientSearch::xHorizontalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height)
+  void AffineGradientSearch::xHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
   {
     for (int j = 1; j < height - 1; j++)
     {
@@ -106,7 +107,7 @@ namespace vvenc {
     }
   }
 
-  void AffineGradientSearch::xVerticalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height)
+  void AffineGradientSearch::xVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
   {
     for (int k = 1; k < width - 1; k++)
     {
@@ -139,7 +140,8 @@ namespace vvenc {
     }
   }
 
-  void AffineGradientSearch::xEqualCoeffComputer(Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param)
+  template<bool b6Param>
+  void AffineGradientSearch::xEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
   {
     int affineParamNum = b6Param ? 6 : 4;
 
diff --git a/source/Lib/CommonLib/AffineGradientSearch.h b/source/Lib/CommonLib/AffineGradientSearch.h
index 60c7904d2..f06995157 100644
--- a/source/Lib/CommonLib/AffineGradientSearch.h
+++ b/source/Lib/CommonLib/AffineGradientSearch.h
@@ -56,13 +56,14 @@ namespace vvenc {
   class AffineGradientSearch
   {
   public:
-    void  (*m_HorizontalSobelFilter)  (Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height);
-    void  (*m_VerticalSobelFilter)    (Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height);
-    void  (*m_EqualCoeffComputer)     (Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param);
-
-    static void xHorizontalSobelFilter( Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height);
-    static void xVerticalSobelFilter  ( Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height);
-    static void xEqualCoeffComputer   ( Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param);
+    void  (*m_HorizontalSobelFilter)  (Pel* const pPred, const int predStride, Pel *const pDerivate,   const int derivateBufStride, const int width, const int height);
+    void  (*m_VerticalSobelFilter)    (Pel* const pPred, const int predStride, Pel *const pDerivate,   const int derivateBufStride, const int width, const int height);
+    void  (*m_EqualCoeffComputer[2])  (Pel* const pResi, const int resiStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]); // indexed by b6Param (false -> [0], true -> [1])
+
+    static void xHorizontalSobelFilter( Pel* const pPred, const int predStride, Pel *const pDerivate,   const int derivateBufStride, const int width, const int height);
+    static void xVerticalSobelFilter  ( Pel* const pPred, const int predStride, Pel *const pDerivate,   const int derivateBufStride, const int width, const int height);
+    template<bool b6Param>
+    static void xEqualCoeffComputer   ( Pel* const pResi, const int resiStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]);
 
     AffineGradientSearch();
     ~AffineGradientSearch() {}
diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp
index 738c4086f..c229d2329 100644
--- a/source/Lib/CommonLib/DepQuant.cpp
+++ b/source/Lib/CommonLib/DepQuant.cpp
@@ -1661,7 +1661,7 @@ DepQuant::~DepQuant()
 
 void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff& uiAbsSum, const QpParam& cQP, const Ctx& ctx )
 {
-  if( tu.cs->picture->useScSelectiveRdoq && !xNeedRDOQ( tu, compID, pSrc, cQP ) )
+  if( tu.cs->picture->useSelectiveRdoq && !xNeedRDOQ( tu, compID, pSrc, cQP ) )
   {
     tu.lastPos[compID] = -1;
     uiAbsSum           =  0;
diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp
index c45ee0864..20b522582 100644
--- a/source/Lib/CommonLib/InterpolationFilter.cpp
+++ b/source/Lib/CommonLib/InterpolationFilter.cpp
@@ -356,7 +356,7 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel* src, int
 //
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template<int N, bool isVertical, bool isFirst, bool isLast>
-void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR)
+void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff)
 {
   int row, col;
 
@@ -389,19 +389,21 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt
   // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20
   CHECK(shift < 0, "Negative shift");
 
-  if ( isLast )
+  if( N != 2 ) // compile-time split on the tap count: N == 2 gets its own offset setup in the else branch below
   {
-    shift += (isFirst) ? 0 : headRoom;
-    offset = 1 << (shift - 1);
-    offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    if ( isLast )
+    {
+      shift += (isFirst) ? 0 : headRoom;
+      offset = 1 << (shift - 1);
+      offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    }
+    else
+    {
+      shift -= (isFirst) ? headRoom : 0;
+      offset = (isFirst) ? -IF_INTERNAL_OFFS *(1<<shift) : 0;
+    }
   }
   else
-  {
-    shift -= (isFirst) ? headRoom : 0;
-    offset = (isFirst) ? -IF_INTERNAL_OFFS *(1<<shift) : 0;
-  }
-
-  if (biMCForDMVR)
   {
     if( isFirst )
     {
@@ -414,6 +416,7 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt
       offset = 1 << (shift - 1);
     }
   }
+
   for (row = 0; row < height; row++)
   {
     for (col = 0; col < width; col++)
@@ -466,23 +469,23 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt
  * \param  coeff      Pointer to filter taps
  */
 template<int N>
-void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR)
+void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff)
 {
   if( N == 8 )
   {
-    m_filterHor[0][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterHor[0][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 4 )
   {
-    m_filterHor[1][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterHor[1][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 2 )
   {
-    m_filterHor[2][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterHor[2][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 6 )
   {
-    m_filterHor[3][1][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR );
+    m_filterHor[3][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else
   {
@@ -506,23 +509,23 @@ void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int sr
  * \param  coeff      Pointer to filter taps
  */
 template<int N>
-void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR)
+void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff)
 {
   if( N == 8 )
   {
-    m_filterVer[0][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterVer[0][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 4 )
   {
-    m_filterVer[1][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterVer[1][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 2 )
   {
-    m_filterVer[2][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR);
+    m_filterVer[2][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else if( N == 6 )
   {
-    m_filterVer[3][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR );
+    m_filterVer[3][isFirst][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff);
   }
   else{
     THROW( "Invalid tap number" );
@@ -562,7 +565,7 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in
     CHECK( frac < 0 || frac >= LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" );
     if( nFilterIdx == 1 )
     {
-      filterHor<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilterPrec4[frac], biMCForDMVR);
+      filterHor<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilterPrec4[frac]);
     }
     else
     {
@@ -570,24 +573,24 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in
       {
         if( useAltHpelIf && frac == 8 )
         {
-          filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaAltHpelIFilter, biMCForDMVR );
+          filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaAltHpelIFilter );
         }
         else if( ( width == 4 && height == 4 ) || ( width == 4 && height == ( 4 + NTAPS_LUMA - 1 ) ) )
         {
-          filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac], biMCForDMVR );
+          filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac] );
         }
         else
         {
-          filterHor<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac], biMCForDMVR );
+          filterHor<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac] );
         }
       }
       else if( reduceTap == 1 )
       {
-        filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac], biMCForDMVR );
+        filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac] );
       }
       else
       {
-        filterHor<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << 1], biMCForDMVR );
+        filterHor<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << 1] );
       }
     }
   }
@@ -595,7 +598,7 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in
   {
     const uint32_t csx = getComponentScaleX( compID, fmt );
     CHECK( frac < 0 || csx >= 2 || ( frac << ( 1 - csx ) ) >= CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" );
-    filterHor<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << ( 1 - csx )], biMCForDMVR);
+    filterHor<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << ( 1 - csx )] );
   }
 }
 
@@ -627,7 +630,7 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in
     CHECK( frac < 0 || frac >= LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" );
     if (nFilterIdx == 1)
     {
-      filterVer<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilterPrec4[frac], biMCForDMVR);
+      filterVer<NTAPS_BILINEAR>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilterPrec4[frac]);
     }
     else
     {
@@ -635,24 +638,24 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in
       {
         if( useAltHpelIf && frac == 8 )
         {
-          filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaAltHpelIFilter, biMCForDMVR );
+          filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaAltHpelIFilter );
         }
         else if( width == 4 && height == 4 )
         {
-          filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac], biMCForDMVR );
+          filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac] );
         }
         else
         {
-          filterVer<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac], biMCForDMVR );
+          filterVer<NTAPS_LUMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac] );
         }
       }
       else if( reduceTap == 1 )
       {
-        filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac], biMCForDMVR );
+        filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac] );
       }
       else
       {
-        filterVer<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << 1], biMCForDMVR );
+        filterVer<NTAPS_CHROMA>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << 1] );
       }
     }
   }
@@ -660,7 +663,7 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in
   {
     const uint32_t csy = getComponentScaleY( compID, fmt );
     CHECK( frac < 0 || csy >= 2 || ( frac << ( 1 - csy ) ) >= CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" );
-    filterVer<NTAPS_CHROMA>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << (1 - csy)], biMCForDMVR);
+    filterVer<NTAPS_CHROMA>(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << (1 - csy)]);
   }
 }
 
@@ -674,8 +677,8 @@ void InterpolationFilter::scalarFilterN2_2D( const ClpRng& clpRng, Pel const *sr
 {
   Pel *tmp = ( Pel* ) alloca( width * ( height + 1 ) * sizeof( Pel ) );
 
-  filter<2, false, true,  false>( clpRng, src, srcStride, tmp, width,     width, height + 1, ch, true );
-  filter<2, true , false, false>( clpRng, tmp, width,     dst, dstStride, width, height,     cv, true );
+  filter<2, false, true,  false>( clpRng, src, srcStride, tmp, width,     width, height + 1, ch );
+  filter<2, true , false, false>( clpRng, tmp, width,     dst, dstStride, width, height,     cv );
 }
 
 void InterpolationFilter::filter4x4( const ComponentID compID, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng,bool useAltHpelIf/*= false*/,int nFilterIdx /*= 0*/ )
diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h
index 6e2c09aee..6e3c83ee4 100644
--- a/source/Lib/CommonLib/InterpolationFilter.h
+++ b/source/Lib/CommonLib/InterpolationFilter.h
@@ -72,15 +72,15 @@ class InterpolationFilter
   static const TFilterCoeff m_bilinearFilterPrec4[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][NTAPS_BILINEAR]; ///< bilinear filter taps
 public:
   template<bool isFirst, bool isLast>
-  static void filterCopy(const ClpRng& clpRng, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height,                                                       bool biMCForDMVR);
+  static void filterCopy(const ClpRng& clpRng, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool biMCForDMVR);
 
   template<int N, bool isVertical, bool isFirst, bool isLast>
-  static void filter    (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height,                            TFilterCoeff const *coeff, bool biMCForDMVR);
+  static void filter    (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height,                            TFilterCoeff const *coeff);
   template<int N>
-  void filterHor        (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast,               TFilterCoeff const *coeff, bool biMCForDMVR);
+  void filterHor        (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast,               TFilterCoeff const *coeff);
 
   template<int N>
-  void filterVer        (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR);
+  void filterVer        (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff);
 
   template<bool isLast, int w>
   static void filterXxY_N2     (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV);
@@ -101,8 +101,8 @@ class InterpolationFilter
   ~InterpolationFilter() {}
 
   void( *m_filterN2_2D        )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *ch, TFilterCoeff const *cv );
-  void( *m_filterHor[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR);
-  void( *m_filterVer[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR);
+  void( *m_filterHor[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff);
+  void( *m_filterVer[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff);
   void( *m_filterCopy[2][2] )  ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool biMCForDMVR);
   void( *m_filter4x4  [2][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV );
   void( *m_filter8x8  [3][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV );
diff --git a/source/Lib/CommonLib/MCTF.cpp b/source/Lib/CommonLib/MCTF.cpp
index 8c8329f22..40e10ef78 100644
--- a/source/Lib/CommonLib/MCTF.cpp
+++ b/source/Lib/CommonLib/MCTF.cpp
@@ -468,6 +468,8 @@ void applyBlockCore( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const
         }
       }
     }
+    variance <<= 2*(10-clpRng.bd); // rescale by 2^(2*(10-bd)); NOTE(review): the shift count is negative (undefined behaviour) if bd > 10 - confirm bd <= 10 is guaranteed here
+    diffsum <<= 2*(10-clpRng.bd);  // same factor as for variance, so the ratio below changes only relative to its +5.0 terms
     const int cntV = w * h;
     const int cntD = 2 * cntV - w - h;
     vnoise[i] = ( int ) round( ( 15.0 * cntD / cntV * variance + 5.0 ) / ( diffsum + 5.0 ) );
@@ -691,7 +693,7 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
   int dropFramesFront = std::min( std::max(                                          filterIdx - filterFrames, 0 ), dropFrames );
   int dropFramesBack  = std::min( std::max( static_cast<int>( picFifo.size() ) - 1 - filterIdx - filterFrames, 0 ), dropFrames );
 
-  if( !pic->useScMCTF && !pic->gopEntry->m_isStartOfGop )
+  if( !pic->useMCTF && !pic->gopEntry->m_isStartOfGop )
   {
     isFilterThisFrame = false;
   }
@@ -749,7 +751,7 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
     }
 
     // filter
-    if( pic->useScMCTF )
+    if( pic->useMCTF )
     {
       fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding );
       bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength );
@@ -798,7 +800,7 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
 
       if( distFactor[0] < 3 && distFactor[1] < 3 && ( m_encCfg->m_usePerceptQPA || pic->gopEntry->m_isStartOfGop ) )
       {
-        const double bd12bScale = double (m_encCfg->m_internalBitDepth[CH_L] < 12 ? 1 << (12 - m_encCfg->m_internalBitDepth[CH_L]) : 1);
+        const double bd12bScale = double (m_encCfg->m_internalBitDepth[CH_L] < 12 ? 4 : 1); // NOTE(review): presumably 4 = 2^(12-10) because the motion errors are already on a 10-bit scale - confirm
 
         for( int i = 0; i < numCtu; i++ ) // start noise estimation with motion errors
         {
@@ -825,7 +827,7 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
         }
         pic->m_picShared->m_picMotEstError = uint16_t (0.5 + meanRmsAcrossPic / numCtu);
 
-        if( pic->gopEntry->m_isStartOfGop && !pic->useScMCTF && m_encCfg->m_vvencMCTF.MCTF > 0 && meanRmsAcrossPic > numCtu * 27.0 )
+        if( pic->gopEntry->m_isStartOfGop && !pic->useMCTF && m_encCfg->m_vvencMCTF.MCTF > 0 && meanRmsAcrossPic > numCtu * 27.0 )
         {
           // force filter
           fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding );
@@ -833,7 +835,7 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
         }
       }
 
-      if( !m_encCfg->m_blockImportanceMapping || !pic->useScMCTF )
+      if( !m_encCfg->m_blockImportanceMapping || !pic->useMCTF )
       {
         CHECKD( !pic->m_picShared->m_ctuBimQpOffset.empty(), "BIM disabled, but offset vector not empty!" );
         return;
@@ -1001,7 +1003,7 @@ int MCTF::motionErrorLuma(const PelStorage &orig,
 }
 
 bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX, Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize,
-  const Array2D<MotionVector> *previous, const int factor, const bool doubleRes, int blockY ) const
+  const Array2D<MotionVector> *previous, const int factor, const bool doubleRes, int blockY, int bitDepth ) const
 {
   PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_MCTF_SEARCH );
 
@@ -1145,10 +1147,12 @@ bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX,
     const int w = std::min<int>( blockSize, orig.Y().width  - blockX ) & ~7;
     const int h = std::min<int>( blockSize, orig.Y().height - blockY ) & ~7;
 
-    const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h );
-    const double mse  = best.error / double( w * h );
+    CHECKD(bitDepth>10, "unsupported internal bit depth (also in calcVar)" );
+    const double bdScale = double(1<<(2*(10-bitDepth)));
+    const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ) * bdScale;
+    const double mse  = best.error * bdScale / double( w * h );
 
-    best.error   = ( int ) ( 20 * ( ( best.error + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 );
+    best.error   = ( int ) ( 20 * ( ( best.error*bdScale + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 );
     best.rmsme   = uint16_t( 0.5 + sqrt( mse ) );
     best.overlap = ( ( double ) w * h ) / ( m_mctfUnitSize * m_mctfUnitSize );
 
@@ -1162,6 +1166,7 @@ void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &or
 {
   const int stepSize = blockSize;
   const int origHeight = orig.Y().height;
+  const int bitDepth = m_encCfg->m_internalBitDepth[CH_L];
 
   if( m_threadPool )
   {
@@ -1177,6 +1182,7 @@ void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &or
       int   factor; 
       bool  doubleRes;
       int   blockY;
+      int   bitDepth;
       const MCTF* mctf;
     };
 
@@ -1190,7 +1196,7 @@ void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &or
       {
         ITT_TASKSTART( itt_domain_MCTF_est, itt_handle_est );
 
-        bool ret = params->mctf->estimateLumaLn( params->blockX, params->prevLineX, *params->mvs, *params->orig, *params->buffer, params->blockSize, params->previous, params->factor, params->doubleRes, params->blockY );
+        bool ret = params->mctf->estimateLumaLn( params->blockX, params->prevLineX, *params->mvs, *params->orig, *params->buffer, params->blockSize, params->previous, params->factor, params->doubleRes, params->blockY, params->bitDepth );
 
         ITT_TASKEND( itt_domain_MCTF_est, itt_handle_est );
         return ret;
@@ -1208,6 +1214,7 @@ void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &or
       cEstParams.doubleRes = doubleRes;
       cEstParams.mctf = this;
       cEstParams.blockY = blockY;
+      cEstParams.bitDepth = bitDepth;
 
       m_threadPool->addBarrierTask<EstParams>( task, &cEstParams, &taskCounter);
     }
@@ -1218,7 +1225,7 @@ void MCTF::motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &or
     for( int blockY = 0; blockY + 7 <= origHeight; blockY += stepSize )
     {
       std::atomic_int blockX( 0 ), prevBlockX( orig.Y().width + stepSize );
-      estimateLumaLn( blockX, blockY ? &prevBlockX : nullptr, mvs, orig, buffer, blockSize, previous, factor, doubleRes, blockY );
+      estimateLumaLn( blockX, blockY ? &prevBlockX : nullptr, mvs, orig, buffer, blockSize, previous, factor, doubleRes, blockY, bitDepth );
     }
 
   }
diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h
index b93d473ae..cd60dae78 100644
--- a/source/Lib/CommonLib/MCTF.h
+++ b/source/Lib/CommonLib/MCTF.h
@@ -180,7 +180,7 @@ class MCTF : public EncStage
   int motionErrorLuma   (const PelStorage &orig, const PelStorage &buffer, const int x, const int y, int dx, int dy, const int bs, const int besterror) const;
 
   bool estimateLumaLn   ( std::atomic_int& blockX, std::atomic_int* prevLineX, Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize,
-    const Array2D<MotionVector> *previous, const int factor, const bool doubleRes, int blockY ) const;
+    const Array2D<MotionVector> *previous, const int factor, const bool doubleRes, int blockY, int bitDepth ) const;
 
   void motionEstimationLuma(Array2D<MotionVector> &mvs, const PelStorage &orig, const PelStorage &buffer, const int bs,
     const Array2D<MotionVector> *previous=0, const int factor = 1, const bool doubleRes = false) const;
diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp
index 5791fedee..03718cb6d 100644
--- a/source/Lib/CommonLib/Picture.cpp
+++ b/source/Lib/CommonLib/Picture.cpp
@@ -182,15 +182,15 @@ Picture::Picture()
     , picSpVisAct       ( 0 )
     , isSccWeak         ( false )
     , isSccStrong       ( false )
-    , useScME           ( false )
-    , useScMCTF         ( false )
-    , useScTS           ( false )
-    , useScBDPCM        ( false )
-    , useScIBC          ( false )
-    , useScLMCS         ( false )
-    , useScSAO          ( false )
-    , useScNumRefs      ( false )
-    , useScFastMrg      ( 0 )
+    , useME           ( false )
+    , useMCTF         ( false )
+    , useTS           ( false )
+    , useBDPCM        ( false )
+    , useIBC          ( false )
+    , useLMCS         ( false )
+    , useSAO          ( false )
+    , useNumRefs      ( false )
+    , useFastMrg      ( 0 )
     , useQtbttSpeedUpMode( 0 )
     , actualHeadBits    ( 0 )
     , actualTotalBits   ( 0 )
@@ -385,16 +385,16 @@ void Picture::finalInit( const VPS& _vps, const SPS& sps, const PPS& pps, PicHea
 
 void Picture::setSccFlags( const VVEncCfg* encCfg )
 {
-  useScME      = encCfg->m_motionEstimationSearchMethodSCC > 0                          && isSccStrong;
-  useScTS      = encCfg->m_TS == 1                || ( encCfg->m_TS == 2                && isSccWeak );
-  useScBDPCM   = encCfg->m_useBDPCM == 1          || ( encCfg->m_useBDPCM == 2          && isSccWeak );
-  useScMCTF    = encCfg->m_vvencMCTF.MCTF == 1    || ( encCfg->m_vvencMCTF.MCTF == 2    && ! isSccStrong );
-  useScLMCS    = encCfg->m_lumaReshapeEnable == 1 || ( encCfg->m_lumaReshapeEnable == 2 && ! isSccStrong );
-  useScIBC     = encCfg->m_IBCMode == 1           || ( encCfg->m_IBCMode == 2           && isSccStrong );
-  useScSAO     = encCfg->m_bUseSAO                && ( !encCfg->m_saoScc                || isSccWeak );
-  useScSelectiveRdoq = encCfg->m_useSelectiveRDOQ == 2 ? !isSccWeak : !!encCfg->m_useSelectiveRDOQ;
-  useScNumRefs = isSccStrong;
-  useScFastMrg = isSccStrong ? 0 : std::max(0, encCfg->m_useFastMrg - 2);
+  useME      = encCfg->m_motionEstimationSearchMethodSCC > 0                          && isSccStrong;
+  useTS      = encCfg->m_TS == 1                || ( encCfg->m_TS == 2                && isSccWeak );
+  useBDPCM   = encCfg->m_useBDPCM == 1          || ( encCfg->m_useBDPCM == 2          && isSccWeak );
+  useMCTF    = encCfg->m_vvencMCTF.MCTF == 1    || ( encCfg->m_vvencMCTF.MCTF == 2    && ! isSccStrong );
+  useLMCS    = encCfg->m_lumaReshapeEnable == 1 || ( encCfg->m_lumaReshapeEnable == 2 && ! isSccStrong );
+  useIBC     = encCfg->m_IBCMode == 1           || ( encCfg->m_IBCMode == 2           && isSccStrong );
+  useSAO     = encCfg->m_bUseSAO                && ( !encCfg->m_saoScc                || isSccWeak );
+  useSelectiveRdoq = encCfg->m_useSelectiveRDOQ == 2 ? !isSccWeak : !!encCfg->m_useSelectiveRDOQ;
+  useNumRefs = isSccStrong;
+  useFastMrg = isSccStrong ? 0 : std::max(0, encCfg->m_useFastMrg - 2);
   useQtbttSpeedUpMode = encCfg->m_qtbttSpeedUpMode;
 
   if( ( encCfg->m_qtbttSpeedUpMode & 2 ) && isSccStrong )
diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h
index 97b7d763c..b72b74016 100644
--- a/source/Lib/CommonLib/Picture.h
+++ b/source/Lib/CommonLib/Picture.h
@@ -252,16 +252,16 @@ struct Picture : public UnitArea
   StopClock                     encTime;
   bool                          isSccWeak;
   bool                          isSccStrong;
-  bool                          useScME;
-  bool                          useScMCTF;
-  bool                          useScTS;
-  bool                          useScBDPCM;
-  bool                          useScIBC;
-  bool                          useScLMCS;
-  bool                          useScSAO;
-  bool                          useScNumRefs;
-  bool                          useScSelectiveRdoq;
-  int                           useScFastMrg;
+  bool                          useME;
+  bool                          useMCTF;
+  bool                          useTS;
+  bool                          useBDPCM;
+  bool                          useIBC;
+  bool                          useLMCS;
+  bool                          useSAO;
+  bool                          useNumRefs;
+  bool                          useSelectiveRdoq;
+  int                           useFastMrg;
   int                           useQtbttSpeedUpMode;
   int                           actualHeadBits;
   int                           actualTotalBits;
diff --git a/source/Lib/CommonLib/QuantRDOQ.cpp b/source/Lib/CommonLib/QuantRDOQ.cpp
index 8928a7894..89990244a 100644
--- a/source/Lib/CommonLib/QuantRDOQ.cpp
+++ b/source/Lib/CommonLib/QuantRDOQ.cpp
@@ -470,7 +470,7 @@ void QuantRDOQ::quant(TransformUnit& tu, const ComponentID compID, const CCoeffB
 
   if( useRDOQ )
   {
-    if (!tu.cs->picture->useScSelectiveRdoq || xNeedRDOQ(tu, compID, piCoef, cQP))
+    if (!tu.cs->picture->useSelectiveRdoq || xNeedRDOQ(tu, compID, piCoef, cQP))
     {
       if( useTransformSkip )
       {
diff --git a/source/Lib/CommonLib/QuantRDOQ2.cpp b/source/Lib/CommonLib/QuantRDOQ2.cpp
index c93871e68..d9a32a99f 100644
--- a/source/Lib/CommonLib/QuantRDOQ2.cpp
+++ b/source/Lib/CommonLib/QuantRDOQ2.cpp
@@ -270,7 +270,7 @@ void QuantRDOQ2::quant( TransformUnit &tu, const ComponentID compID, const CCoef
 
   if( useRDOQ )
   {
-    if( !tu.cs->picture->useScSelectiveRdoq || xNeedRDOQ( tu, compID, piCoef, cQP ) )
+    if( !tu.cs->picture->useSelectiveRdoq || xNeedRDOQ( tu, compID, piCoef, cQP ) )
     {
       if( useTransformSkip )
       {
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index 807b2463e..8c93bf987 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -753,7 +753,7 @@ class static_vector
   template<class InputIt>
   iterator        insert( const_iterator _pos, InputIt first, InputIt last )
                                                 { const difference_type numEl = last - first;
-                                                  CHECKD( _size + numEl >= N, "capacity exceeded" );
+                                                  CHECKD( _size + numEl > N, "capacity exceeded" );
                                                   for( difference_type i = _size - 1; i >= _pos - _arr; i-- ) _arr[i + numEl] = _arr[i];
                                                   iterator it = _arr + ( _pos - _arr ); _size += numEl; iterator ret = it;
                                                   while( first != last ) *it++ = *first++;
@@ -761,7 +761,7 @@ class static_vector
 
   iterator        insert( const_iterator _pos, size_t numEl, const T& val )
                                                 { //const difference_type numEl = last - first;
-                                                  CHECKD( _size + numEl >= N, "capacity exceeded" );
+                                                  CHECKD( _size + numEl > N, "capacity exceeded" );
                                                   for( difference_type i = _size - 1; i >= _pos - _arr; i-- ) _arr[i + numEl] = _arr[i];
                                                   iterator it = _arr + ( _pos - _arr ); _size += numEl; iterator ret = it;
                                                   for ( int k = 0; k < numEl; k++) *it++ = val;
diff --git a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
index 2e9213500..6e77c7566 100644
--- a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
+++ b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
@@ -58,149 +58,121 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
-#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation)       \
-{                                                                                                              \
-inter0 = _mm_mul_epi32(x1, y1);                                                                                \
-inter1 = _mm_mul_epi32(tmp0, tmp2);                                                                            \
-inter2 = _mm_mul_epi32(x2, y2);                                                                                \
-inter3 = _mm_mul_epi32(tmp1, tmp3);                                                                            \
-inter2 = _mm_add_epi64(inter0, inter2);                                                                        \
-inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
-inter0 = _mm_loadl_epi64(loadLocation);                                                                        \
-inter3 = _mm_add_epi64(inter2, inter3);                                                                        \
-inter1 = _mm_srli_si128(inter3, 8);                                                                            \
-inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
-inter3 = _mm_add_epi64(inter0, inter3);                                                                        \
-}
-
   template<X86_VEXT vext>
-  static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height)
+  static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
   {
-    __m128i mmPred[4];
-    __m128i mm2xPred[2];
-    __m128i mmIntermediates[4];
-    __m128i mmDerivate[2];
+    CHECK( width % 8, "Invalid size!" );
 
-    assert(!(height % 2));
-    assert(!(width % 4));
-
-    /* Derivates of the rows and columns at the boundary are done at the end of this function */
-    /* The value of col and row indicate the columns and rows for which the derivates have already been computed */
-    for (int col = 1; (col + 2) < width; col += 2)
-    {
-      mmPred[0] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[0 * predStride + col - 1]));
-      mmPred[1] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[1 * predStride + col - 1]));
+    // pPred is 10-bit
 
-      mmPred[0] = _mm_cvtepi16_epi32(mmPred[0]);
-      mmPred[1] = _mm_cvtepi16_epi32(mmPred[1]);
+    // -1 0 1
+    // -2 0 2
+    // -1 0 1
+    // 
+    // sum( |sobel| ) = 8, i.e. at most 4-bit extension (sign included), so 10-bit input stays within the 16-bit lanes used below
 
-      for (int row = 1; row < (height - 1); row += 2)
+    for( int y = 1; y < ( height - 1 ); y++ )
+    {
+      int x = 1;
+      for( ; x < ( width - 8 ); x += 8 )
       {
-        mmPred[2] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[(row + 1) * predStride + col - 1]));
-        mmPred[3] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[(row + 2) * predStride + col - 1]));
-
-        mmPred[2] = _mm_cvtepi16_epi32(mmPred[2]);
-        mmPred[3] = _mm_cvtepi16_epi32(mmPred[3]);
-
-        mm2xPred[0] = _mm_slli_epi32(mmPred[1], 1);
-        mm2xPred[1] = _mm_slli_epi32(mmPred[2], 1);
-
-        mmIntermediates[0] = _mm_add_epi32(mm2xPred[0], mmPred[0]);
-        mmIntermediates[2] = _mm_add_epi32(mm2xPred[1], mmPred[1]);
-
-        mmIntermediates[0] = _mm_add_epi32(mmIntermediates[0], mmPred[2]);
-        mmIntermediates[2] = _mm_add_epi32(mmIntermediates[2], mmPred[3]);
-
-        mmPred[0] = mmPred[2];
-        mmPred[1] = mmPred[3];
-
-        mmIntermediates[1] = _mm_srli_si128(mmIntermediates[0], 8);
-        mmIntermediates[3] = _mm_srli_si128(mmIntermediates[2], 8);
+        __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
+        acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
+        acc         = _mm_slli_epi16( acc, 1 );
+        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
+        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
+        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
+        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
+
+        _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
+      }
 
-        mmDerivate[0] = _mm_sub_epi32(mmIntermediates[1], mmIntermediates[0]);
-        mmDerivate[1] = _mm_sub_epi32(mmIntermediates[3], mmIntermediates[2]);
+      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
+      acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
+      acc         = _mm_slli_epi16( acc, 1 );
+      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
+      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
+      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
+      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
 
-        _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0]);
-        _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1]);
-      }
-    }
+      _mm_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x],                         acc );
+      _mm_storeu_si32 (              &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );
 
-    for (int j = 1; j < (height - 1); j++)
-    {
-      pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-      pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
+      pDerivate[y * derivateBufStride]               = pDerivate[y * derivateBufStride + 1];
+      pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
     }
 
-    memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0]));
-    memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0])
-    );
+    memcpy( pDerivate,                                      pDerivate + derivateBufStride,                  width * sizeof( pDerivate[ 0 ] ) );
+    memcpy( pDerivate + ( height - 1 ) * derivateBufStride, pDerivate + ( height - 2 ) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
   }
 
   template<X86_VEXT vext>
-  static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height)
+  static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
   {
-    __m128i mmPred[4];
-    __m128i mmIntermediates[6];
-    __m128i mmDerivate[2];
+    CHECK( width % 8, "Invalid size!" );
 
-    assert(!(height % 2));
-    assert(!(width % 4));
-
-    /* Derivates of the rows and columns at the boundary are done at the end of this function */
-    /* The value of col and row indicate the columns and rows for which the derivates have already been computed */
-    for (int col = 1; col < (width - 1); col += 2)
-    {
-      mmPred[0] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[0 * predStride + col - 1]));
-      mmPred[1] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[1 * predStride + col - 1]));
+    // pPred is 10-bit
 
-      mmPred[0] = _mm_cvtepi16_epi32(mmPred[0]);
-      mmPred[1] = _mm_cvtepi16_epi32(mmPred[1]);
+    // -1 -2 -1
+    //  0  0  0
+    //  1  2  1
+    // 
+    // sum( |sobel| ) = 8, i.e. at most 4-bit extension (sign included), so 10-bit input stays within the 16-bit lanes used below
 
-      for (int row = 1; row < (height - 1); row += 2)
+    for( int y = 1; y < ( height - 1 ); y++ )
+    {
+      int x = 1;
+      for( ; x < ( width - 8 ); x += 8 )
       {
-        mmPred[2] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[(row + 1) * predStride + col - 1]));
-        mmPred[3] = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&pPred[(row + 2) * predStride + col - 1]));
-
-        mmPred[2] = _mm_cvtepi16_epi32(mmPred[2]);
-        mmPred[3] = _mm_cvtepi16_epi32(mmPred[3]);
-
-        mmIntermediates[0] = _mm_sub_epi32(mmPred[2], mmPred[0]);
-        mmIntermediates[3] = _mm_sub_epi32(mmPred[3], mmPred[1]);
-
-        mmPred[0] = mmPred[2];
-        mmPred[1] = mmPred[3];
-
-        mmIntermediates[1] = _mm_srli_si128(mmIntermediates[0], 4);
-        mmIntermediates[4] = _mm_srli_si128(mmIntermediates[3], 4);
-        mmIntermediates[2] = _mm_srli_si128(mmIntermediates[0], 8);
-        mmIntermediates[5] = _mm_srli_si128(mmIntermediates[3], 8);
-
-        mmIntermediates[1] = _mm_slli_epi32(mmIntermediates[1], 1);
-        mmIntermediates[4] = _mm_slli_epi32(mmIntermediates[4], 1);
-
-        mmIntermediates[0] = _mm_add_epi32(mmIntermediates[0], mmIntermediates[2]);
-        mmIntermediates[3] = _mm_add_epi32(mmIntermediates[3], mmIntermediates[5]);
-
-        mmDerivate[0] = _mm_add_epi32(mmIntermediates[0], mmIntermediates[1]);
-        mmDerivate[1] = _mm_add_epi32(mmIntermediates[3], mmIntermediates[4]);
-
-        _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0]);
-        _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1]);
+        __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
+        acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
+        acc         = _mm_slli_epi16( acc, 1 );
+        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
+        acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
+        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
+        acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
+
+        _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
       }
+      
+      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
+      acc         = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
+      acc         = _mm_slli_epi16( acc, 1 );
+      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
+      acc         = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
+      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
+      acc         = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );
+
+      _mm_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x],                         acc );
+      _mm_storeu_si32 (              &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );
+
+      pDerivate[y * derivateBufStride]               = pDerivate[y * derivateBufStride + 1];
+      pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
     }
 
-    for (int j = 1; j < (height - 1); j++)
-    {
-      pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-      pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
-    }
-
-    memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0]));
-    memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0]));
+    memcpy( pDerivate,                                    pDerivate + derivateBufStride,                width * sizeof( pDerivate[ 0 ] ) );
+    memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
   }
 
-  template<X86_VEXT vext>
-  static void simdEqualCoeffComputer(Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param)
+
+
+#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation)       \
+{                                                                                                              \
+inter0 = _mm_mul_epi32(x1, y1);                                                                                \
+inter1 = _mm_mul_epi32(tmp0, tmp2);                                                                            \
+inter2 = _mm_mul_epi32(x2, y2);                                                                                \
+inter3 = _mm_mul_epi32(tmp1, tmp3);                                                                            \
+inter2 = _mm_add_epi64(inter0, inter2);                                                                        \
+inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
+inter0 = _mm_loadl_epi64(loadLocation);                                                                        \
+inter3 = _mm_add_epi64(inter2, inter3);                                                                        \
+inter1 = _mm_srli_si128(inter3, 8);                                                                            \
+inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
+inter3 = _mm_add_epi64(inter0, inter3);                                                                        \
+}
+
+  template<X86_VEXT vext, bool b6Param>
+  static void simdEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
   {
     __m128i mmFour;
     __m128i mmTmp[4];
@@ -210,14 +182,13 @@ inter3 = _mm_add_epi64(inter0, inter3);
     __m128i mmC[12];
 
     // Add directly to indexes to get new index
-    mmFour = _mm_set1_epi32(4);
+    mmFour  = _mm_set1_epi32(4);
     mmIndxJ = _mm_set1_epi32(-2);
 
 
-    int n = b6Param ? 6 : 4;
-    int idx1 = 0, idx2 = 0;
-    idx1 = -2 * derivateBufStride - 4;
-    idx2 = -derivateBufStride - 4;
+    static constexpr int n = b6Param ? 6 : 4;
+    int idx1 = -2 * derivateBufStride - 4;
+    int idx2 = -    derivateBufStride - 4;
 
     for (int j = 0; j < height; j += 2)
     {
@@ -236,16 +207,16 @@ inter3 = _mm_add_epi64(inter0, inter3);
         if (b6Param)
         {
           // mmC[0-5] for iC[0-5] of 1st row of pixels
-          mmC[0] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]);
-          mmC[2] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]);
+          mmC[0] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
+          mmC[2] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
           mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
           mmC[3] = _mm_mullo_epi32(mmIndxK, mmC[2]);
           mmC[4] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
           mmC[5] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
 
           // mmC[6-11] for iC[0-5] of 2nd row of pixels
-          mmC[6] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]);
-          mmC[8] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]);
+          mmC[6] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
+          mmC[8] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
           mmC[7] = _mm_mullo_epi32(mmIndxK, mmC[6]);
           mmC[9] = _mm_mullo_epi32(mmIndxK, mmC[8]);
           mmC[10] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
@@ -254,8 +225,8 @@ inter3 = _mm_add_epi64(inter0, inter3);
         else
         {
           // mmC[0-3] for iC[0-3] of 1st row of pixels
-          mmC[0] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]);
-          mmC[2] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]);
+          mmC[0] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
+          mmC[2] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
           mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
           mmC[3] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
           mmTmp[0] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
@@ -264,8 +235,8 @@ inter3 = _mm_add_epi64(inter0, inter3);
           mmC[3] = _mm_sub_epi32(mmC[3], mmTmp[1]);
 
           // mmC[4-7] for iC[0-3] of 1st row of pixels
-          mmC[4] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]);
-          mmC[6] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]);
+          mmC[4] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
+          mmC[6] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
           mmC[5] = _mm_mullo_epi32(mmIndxK, mmC[4]);
           mmC[7] = _mm_mullo_epi32(mmIndxJ, mmC[4]);
           mmTmp[2] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
@@ -311,13 +282,147 @@ inter3 = _mm_add_epi64(inter0, inter3);
     }
   }
 
+#if USE_AVX2
+// Low 64 bits of res = 64-bit value at loadLocation + sum over all eight 32-bit elements of (x1*y1 + x2*y2); tmp0..tmp3 are expected to be x1, x2, y1, y2 byte-shifted right by 4 so the odd elements get multiplied too. inter0..inter3 are clobbered.
+#define CALC_EQUAL_COEFF_8PXLS_AVX2(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,res,loadLocation)  \
+{                                                                                                                  \
+inter0 = _mm256_mul_epi32(x1, y1);                                                                                 \
+inter1 = _mm256_mul_epi32(tmp0, tmp2);                                                                             \
+inter2 = _mm256_mul_epi32(x2, y2);                                                                                 \
+inter3 = _mm256_mul_epi32(tmp1, tmp3);                                                                             \
+inter2 = _mm256_add_epi64(inter0, inter2);                                                                         \
+inter3 = _mm256_add_epi64(inter1, inter3);                                                                         \
+res    = _mm_loadl_epi64(loadLocation);                                                                            \
+inter3 = _mm256_add_epi64(inter2, inter3);                                                                         \
+inter1 = _mm256_srli_si256(inter3, 8);                                                                             \
+inter3 = _mm256_add_epi64(inter1, inter3);                                                                         \
+res    = _mm_add_epi64(res, _mm256_castsi256_si128(inter3));                                                       \
+res    = _mm_add_epi64(res, _mm256_extracti128_si256(inter3, 1));                                                  \
+}
+  // AVX2 path: walks the block two rows and eight columns per step and adds 64-bit sums of products into pEqualCoeff[1..n][0..n] (n = 6 if b6Param, else 4). residueStride is never read; pResidue is indexed with the same offsets as ppDerivate.
+  template<bool b6Param>
+  static void simdEqualCoeffComputer_avx2(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
+  {
+    __m256i mmFour;
+    __m256i mmTmp[4];
+    __m256i mmIntermediate[4];
+    __m256i mmIndxK, mmIndxJ;
+    __m256i mmResidue[2];
+    __m256i mmC[12];
+    __m128i mmRes;
+
+    // Add directly to indexes to get new index
+    mmFour  = _mm256_set1_epi32(4);
+    mmIndxJ = _mm256_set1_epi32(-2);
+
+    static constexpr int n = b6Param ? 6 : 4;
+    int idx1 = -2 * derivateBufStride - 8; // pre-biased: the first row and column advance bring it to offset 0
+    int idx2 = -    derivateBufStride - 8; // always one derivateBufStride past idx1 (the row below)
+
+    for (int j = 0; j < height; j += 2)
+    {
+      if (!(j & 3))
+        mmIndxJ = _mm256_add_epi32(mmIndxJ, mmFour); // advances every 4 rows: 2, 6, 10, ...
+      mmIndxK = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( -6 ) ), _mm_set1_epi32( -2 ), 1 ); // low 128 bits: -6, high 128 bits: -2; each k step adds 8
+      idx1 += (derivateBufStride << 1);
+      idx2 += (derivateBufStride << 1);
+
+      for (int k = 0; k < width; k += 8)
+      {
+        idx1 += 8;
+        idx2 += 8;
+        mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);
+        mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);
+
+        if (b6Param)
+        {
+          // mmC[0-5] for iC[0-5] of 1st row of pixels
+          mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
+          mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
+          mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
+          mmC[3] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
+          mmC[4] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
+          mmC[5] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);
+        
+          // mmC[6-11] for iC[0-5] of 2nd row of pixels
+          mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
+          mmC[8] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
+          mmC[7] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
+          mmC[9] = _mm256_mullo_epi32(mmIndxK, mmC[8]);
+          mmC[10] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
+          mmC[11] = _mm256_mullo_epi32(mmIndxJ, mmC[8]);
+        }
+        else
+        {
+          // mmC[0-3] for iC[0-3] of 1st row of pixels
+          mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
+          mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
+          mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
+          mmC[3] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
+          mmTmp[0] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);
+          mmTmp[1] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
+          mmC[1] = _mm256_add_epi32(mmC[1], mmTmp[0]);
+          mmC[3] = _mm256_sub_epi32(mmC[3], mmTmp[1]);
+
+          // mmC[4-7] for iC[0-3] of 2nd row of pixels (loaded from idx2)
+          mmC[4] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
+          mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
+          mmC[5] = _mm256_mullo_epi32(mmIndxK, mmC[4]);
+          mmC[7] = _mm256_mullo_epi32(mmIndxJ, mmC[4]);
+          mmTmp[2] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
+          mmTmp[3] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
+          mmC[5] = _mm256_add_epi32(mmC[5], mmTmp[2]);
+          mmC[7] = _mm256_sub_epi32(mmC[7], mmTmp[3]);
+        }
+
+        // Residue
+        mmResidue[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[idx1]));
+        mmResidue[1] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[idx2]));
+        mmResidue[0] = _mm256_slli_epi32(mmResidue[0], 3); // residue scaled by 8
+        mmResidue[1] = _mm256_slli_epi32(mmResidue[1], 3);
+
+        // Calculation of coefficient matrix
+        for (int col = 0; col < n; col++)
+        {
+          mmTmp[0] = _mm256_srli_si256(mmC[0 + col], 4); // odd 32-bit elements moved to even positions, which is what _mm256_mul_epi32 reads
+          mmTmp[1] = _mm256_srli_si256(mmC[n + col], 4);
+          CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][col]);
+          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmRes);
+
+          for (int row = col + 1; row < n; row++)
+          {
+            mmTmp[2] = _mm256_srli_si256(mmC[0 + row], 4);
+            mmTmp[3] = _mm256_srli_si256(mmC[n + row], 4);
+            CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][row]);
+            _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmRes);
+            _mm_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmRes); // same value written to the transposed entry
+          }
+
+          mmTmp[2] = _mm256_srli_si256(mmResidue[0], 4);
+          mmTmp[3] = _mm256_srli_si256(mmResidue[1], 4);
+          CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][n]);
+          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmRes);
+        }
+      }
+
+      idx1 -= (width); // rewinds the column advance of the k loop; exact only if width is a multiple of 8 - TODO confirm callers guarantee this
+      idx2 -= (width);
+    }
+  }
+#endif
 
   template <X86_VEXT vext>
   void AffineGradientSearch::_initAffineGradientSearchX86()
   {
     m_HorizontalSobelFilter = simdHorizontalSobelFilter<vext>;
-    m_VerticalSobelFilter = simdVerticalSobelFilter<vext>;
-    m_EqualCoeffComputer = simdEqualCoeffComputer<vext>;
+    m_VerticalSobelFilter   = simdVerticalSobelFilter<vext>;
+#if USE_AVX2
+    m_EqualCoeffComputer[0] = simdEqualCoeffComputer_avx2<false>;    // slot 0: b6Param == false (n = 4)
+    m_EqualCoeffComputer[1] = simdEqualCoeffComputer_avx2<true>;     // slot 1: b6Param == true  (n = 6)
+#else
+    m_EqualCoeffComputer[0] = simdEqualCoeffComputer<vext, false>;   // same slot layout; second template argument presumably is b6Param as well - verify against simdEqualCoeffComputer
+    m_EqualCoeffComputer[1] = simdEqualCoeffComputer<vext, true>;
+#endif
   }
 
   template void AffineGradientSearch::_initAffineGradientSearchX86<SIMDX86>();
diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
index 8cc7f7efe..0ac280152 100644
--- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h
+++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
@@ -290,6 +290,103 @@ static void simdFilterCopy( const ClpRng& clpRng, const Pel* src, int srcStride,
 }
 
 
+
+// SIMD interpolation horizontal for two-sample-wide blocks: reads four taps from coeff and writes dst[0..1] per row; the width argument is not read.
+template<X86_VEXT vext, int N, bool shiftBack>
+static void simdInterpolateHorM2( const int16_t* src, ptrdiff_t srcStride, int16_t *dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
+{
+  CHECKD( N != 4, "Only allowing w=2 filtering for chroma blocks using 4-tap IF" );
+
+  _mm_prefetch( (const char*) src + srcStride, _MM_HINT_T0 );
+
+  const __m128i voffset  = _mm_set1_epi32( offset );
+  const __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
+  const __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
+  const __m128i vzero    = _mm_setzero_si128();
+  const __m128i vcoeffh  = _mm_set1_epi64x( *( int64_t const* ) coeff ); // coeff[0..3] read as one 64-bit word and replicated into both halves
+
+  __m128i vsum, vsrc, vsrc0, vsrc1;
+
+  for( int row = 0; row < height; row++ )
+  {
+    _mm_prefetch( (const char*)src + 2 * srcStride, _MM_HINT_T0 );
+
+    vsrc0 = _mm_loadl_epi64( ( __m128i const* )&src[0] ); // src[0..3]: input window of output column 0
+    vsrc1 = _mm_loadl_epi64( ( __m128i const* )&src[1] ); // src[1..4]: input window of output column 1
+    vsrc  = _mm_unpacklo_epi64( vsrc0, vsrc1 );
+
+    vsum  = _mm_madd_epi16( vsrc, vcoeffh );
+    vsum  = _mm_hadd_epi32( vsum, vsum ); // 32-bit lane 0: column 0 sum, lane 1: column 1 sum (repeated in lanes 2 and 3)
+
+    vsum  = _mm_add_epi32  ( vsum, voffset );
+    vsum  = _mm_srai_epi32 ( vsum, shift );
+    vsum  = _mm_packs_epi32( vsum, vzero ); // saturating narrow to 16 bit
+
+    if( shiftBack )
+    { //clip
+      vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
+    }
+    _mm_storeu_si32( ( __m128i * )&dst[0], vsum ); // stores 32 bits, i.e. dst[0] and dst[1]
+
+    src += srcStride;
+    dst += dstStride;
+  }
+}
+
+// SIMD interpolation vertical for two-sample-wide blocks: keeps a sliding window of four rows (two samples each) in one register and writes dst[0..1] per row; the width argument is not read.
+template<X86_VEXT vext, int N, bool shiftBack>
+static void simdInterpolateVerM2( const int16_t* src, ptrdiff_t srcStride, int16_t* dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
+{
+  CHECKD( N != 4, "Only allowing w=2 filtering for chroma blocks using 4-tap IF" );
+
+  _mm_prefetch( ( const char* ) &src[0 * srcStride], _MM_HINT_T0 );
+  _mm_prefetch( ( const char* ) &src[1 * srcStride], _MM_HINT_T0 );
+  _mm_prefetch( ( const char* ) &src[2 * srcStride], _MM_HINT_T0 );
+  _mm_prefetch( ( const char* ) &src[3 * srcStride], _MM_HINT_T0 );
+
+  const __m128i vcoeffv  = _mm_set1_epi64x( *( int64_t const* ) coeff ); // coeff[0..3] read as one 64-bit word and replicated into both halves
+  const __m128i vzero    = _mm_setzero_si128();
+  const __m128i voffset  = _mm_set1_epi32( offset );
+  const __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
+  const __m128i vibdimax = _mm_set1_epi16( clpRng.max() );
+  const __m128i vshuff   = _mm_set_epi8( 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); // even 16-bit words (column 0) to the low half, odd words (column 1) to the high half
+
+  __m128i vsrc, vnl, vsum, vtmp;
+
+  const ptrdiff_t nextLine = srcStride * ( N - 1 ); // offset of the newest row of the N-row window
+
+  vsrc = _mm_setr_epi16( src[0], src[1], src[1 * srcStride], src[1 * srcStride + 1], src[2 * srcStride], src[2 * srcStride + 1], 0, 0 ); // rows 0..2 preloaded; the top 32 bits stay zero for the next row
+
+  for( int row = 0; row < height; row++ )
+  {
+    _mm_prefetch( ( const char* ) &src[( N + 0 ) * srcStride], _MM_HINT_T0 );
+    _mm_prefetch( ( const char* ) &src[( N + 1 ) * srcStride], _MM_HINT_T0 );
+
+    vnl  = _mm_setr_epi16   ( src[nextLine], src[nextLine + 1], 0, 0, 0, 0, 0, 0 );
+    vnl  = _mm_slli_si128   ( vnl, 12 ); // move the new row into the top 4 bytes
+    vsrc = _mm_or_si128     ( vsrc, vnl );
+    vtmp = _mm_shuffle_epi8 ( vsrc, vshuff );
+    vsum = _mm_madd_epi16   ( vtmp, vcoeffv );
+    vsum = _mm_hadd_epi32   ( vsum, vzero ); // 32-bit lane 0: column 0 sum, lane 1: column 1 sum
+    vsrc = _mm_srli_si128   ( vsrc, 4 ); // drop the oldest row, freeing the top 4 bytes again
+
+    vsum = _mm_add_epi32    ( vsum, voffset );
+    vsum = _mm_srai_epi32   ( vsum, shift );
+    vsum = _mm_packs_epi32  ( vsum, vzero );
+
+    if( shiftBack ) //clip
+    {
+      vsum = _mm_min_epi16  ( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
+    }
+
+    _mm_storeu_si32( (__m128i*) &dst[0], vsum ); // stores 32 bits, i.e. dst[0] and dst[1]
+
+    src += srcStride;
+    dst += dstStride;
+  }
+}
+
+
 // SIMD interpolation horizontal, block width modulo 4
 template<X86_VEXT vext, int N, bool shiftBack>
 static void simdInterpolateHorM4( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
@@ -376,11 +473,8 @@ static void simdInterpolateHorM8( const int16_t* src, int srcStride, int16_t *ds
   __m128i vshuf0 = _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4, 0x5, 0x4, 0x3, 0x2, 0x3, 0x2, 0x1, 0x0 );
   __m128i vshuf1 = _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4 );
 
-#if __INTEL_COMPILER
   __m128i vcoeff[4];
-#else
-  __m128i vcoeff[N/2];
-#endif
+
   for( int i=0; i<N; i+=2 )
   {
     vcoeff[i/2] = _mm_unpacklo_epi16( _mm_set1_epi16( coeff[i] ), _mm_set1_epi16( coeff[i+1] ) );
@@ -394,7 +488,7 @@ static void simdInterpolateHorM8( const int16_t* src, int srcStride, int16_t *ds
 
     for( int col = 0; col < width; col+=8 )
     {
-      if( N != 4 )
+      if( N == 8 || N == 6 )
       {
         __m128i vsrca0, vsrca1, vsrcb0, vsrcb1;
         __m128i vsrc0 = _mm_loadu_si128( ( const __m128i* )&src[col] );
@@ -462,131 +556,153 @@ static void simdInterpolateHorM8( const int16_t* src, int srcStride, int16_t *ds
   }
 }
 
-template<X86_VEXT vext, bool clip>
-static void simdInterpolateHor_N8_singleCol(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff)
+// SIMD horizontal interpolation for single-column blocks (width == 1): N == 4 takes the 4-tap branch, every other N the 8-tap branch. Rows are handled four at a time, leftover rows one by one.
+template<X86_VEXT vext, int N, bool clip>
+static void simdInterpolateHorM1(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff)
 {
   CHECKD( width != 1, "Width needs to be '1'!" );
 
   cond_mm_prefetch((const char*)src, _MM_HINT_T0);
   cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0);
 
-  __m128i vcoeffh  = _mm_loadu_si128((__m128i const*)coeff);
-  __m128i voffset  = _mm_set1_epi32(offset);
-  __m128i vibdimin = _mm_set1_epi16(clpRng.min());
-  __m128i vibdimax = _mm_set1_epi16(clpRng.max());
+  if( N == 4 )
+  {
+    cond_mm_prefetch((const char*)src, _MM_HINT_T0);
+    cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0);
 
-  int row = 0;
+    __m128i vcoeffh  = _mm_loadl_epi64((__m128i const*)coeff);
+            vcoeffh  = _mm_unpacklo_epi64(vcoeffh, vcoeffh); // the four taps duplicated into both 64-bit halves
+    __m128i voffset  = _mm_set1_epi32(offset);
+    __m128i vibdimin = _mm_set1_epi16(clpRng.min());
+    __m128i vibdimax = _mm_set1_epi16(clpRng.max());
 
-  for (; row < ( height - 3 ); row += 4)
-  {
-    cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
+    int row = 0;
 
-    __m128i
-    vsrc0 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
-    vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh);
- 
-    __m128i
-    vsrc1 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
-    vsrc1 = _mm_madd_epi16 (vsrc1, vcoeffh);
-    
-    __m128i
-    vsrc2 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
-    vsrc2 = _mm_madd_epi16 (vsrc2, vcoeffh);
-    
-    __m128i
-    vsrc3 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
-    vsrc3 = _mm_madd_epi16 (vsrc3, vcoeffh);
+    for( ; row < ( height - 3 ); row += 4 )
+    {
+      cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
 
-    vsrc0 = _mm_hadd_epi32(vsrc0, vsrc1);
-    vsrc2 = _mm_hadd_epi32(vsrc2, vsrc3);
-    vsrc0 = _mm_hadd_epi32(vsrc0, vsrc2);
+      __m128i
+      vsrc0 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
 
-    vsrc0 = _mm_add_epi32 (vsrc0, voffset);
-    vsrc0 = _mm_srai_epi32(vsrc0, shift);
+      __m128i
+      vsrc1 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
 
-    if (clip) { //clip
-      vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0));
-    }
-    
-    *dst = _mm_cvtsi128_si32(vsrc0);    dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride;
-  }
+      vsrc1 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc0, vsrc1), vcoeffh); // two rows share one register, so one madd covers both
 
-  for( ; row < height; row++, dst += dstStride, src += srcStride )
-  {
-    int
-    sum  = src[0] * coeff[0];
-    sum += src[1] * coeff[1];
-    sum += src[2] * coeff[2];
-    sum += src[3] * coeff[3];
-    sum += src[4] * coeff[4];
-    sum += src[5] * coeff[5];
-    sum += src[6] * coeff[6];
-    sum += src[7] * coeff[7];
+      __m128i
+      vsrc2 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
+
+      __m128i
+      vsrc3 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
+
+      vsrc3 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc2, vsrc3), vcoeffh);
+
+      vsrc0 = _mm_hadd_epi32(vsrc1, vsrc3); // one 32-bit sum per row, in row order
+
+      vsrc0 = _mm_add_epi32 (vsrc0, voffset);
+      vsrc0 = _mm_srai_epi32(vsrc0, shift);
 
-    Pel val = ( sum + offset ) >> shift;
+      if (clip) { //clip
+        vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0));
+      }
+
+      *dst = _mm_cvtsi128_si32(vsrc0);    dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride;
+    }
 
-    if( clip )
+    for( ; row < height; row++, dst += dstStride, src += srcStride )
     {
-      val = ClipPel( val, clpRng );
+      cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
+
+      __m128i
+      vsrc0 = _mm_loadl_epi64((__m128i const*) src);
+      vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh);
+      vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0);
+
+      vsrc0 = _mm_add_epi32 (vsrc0, voffset);
+      vsrc0 = _mm_srai_epi32(vsrc0, shift);
+
+      if (clip) { //clip
+        vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0));
+      }
+
+      *dst = _mm_cvtsi128_si32(vsrc0);
     }
-    *dst = val;
   }
-}
+  else
+  {
+    CHECKD( N != 8, "N has to 8" );
 
-template<X86_VEXT vext, bool shiftBack>
-static void simdInterpolateHor_N4_singleCol(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff)
-{
-  CHECKD( width != 1 || ( height & 3 ), "Windth needs to be '1'!" );
+    __m128i vcoeffh  = _mm_loadu_si128((__m128i const*)coeff);
+    __m128i voffset  = _mm_set1_epi32(offset);
+    __m128i vibdimin = _mm_set1_epi16(clpRng.min());
+    __m128i vibdimax = _mm_set1_epi16(clpRng.max());
 
-  cond_mm_prefetch((const char*)src, _MM_HINT_T0);
-  cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0);
+    int row = 0;
 
-  __m128i vcoeffh  = _mm_loadl_epi64((__m128i const*)coeff);
-          vcoeffh  = _mm_unpacklo_epi64(vcoeffh, vcoeffh);
-  __m128i voffset  = _mm_set1_epi32(offset);
-  __m128i vibdimin = _mm_set1_epi16(clpRng.min());
-  __m128i vibdimax = _mm_set1_epi16(clpRng.max());
+    for( ; row < ( height - 3 ); row += 4 )
+    {
+      cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
 
-  for (int row = 0; row < height; row += 4)
-  {
-    cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
+      __m128i
+      vsrc0 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
+      vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh);
 
-    __m128i
-    vsrc0 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
- 
-    __m128i
-    vsrc1 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
+      __m128i
+      vsrc1 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
+      vsrc1 = _mm_madd_epi16 (vsrc1, vcoeffh);
 
-    vsrc1 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc0, vsrc1), vcoeffh);
-    
-    __m128i
-    vsrc2 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
-    
-    __m128i
-    vsrc3 = _mm_loadl_epi64((__m128i const*) src); src += srcStride;
+      __m128i
+      vsrc2 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
+      vsrc2 = _mm_madd_epi16 (vsrc2, vcoeffh);
+
+      __m128i
+      vsrc3 = _mm_loadu_si128((__m128i const*) src); src += srcStride;
+      vsrc3 = _mm_madd_epi16 (vsrc3, vcoeffh);
 
-    vsrc3 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc2, vsrc3), vcoeffh);
+      vsrc0 = _mm_hadd_epi32(vsrc0, vsrc1);
+      vsrc2 = _mm_hadd_epi32(vsrc2, vsrc3);
+      vsrc0 = _mm_hadd_epi32(vsrc0, vsrc2);
 
-    vsrc0 = _mm_hadd_epi32(vsrc1, vsrc3);
+      vsrc0 = _mm_add_epi32 (vsrc0, voffset);
+      vsrc0 = _mm_srai_epi32(vsrc0, shift);
 
-    vsrc0 = _mm_add_epi32 (vsrc0, voffset);
-    vsrc0 = _mm_srai_epi32(vsrc0, shift);
+      if (clip) { //clip
+        vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0));
+      }
 
-    if (shiftBack) { //clip
-      vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0));
+      *dst = _mm_cvtsi128_si32(vsrc0);    dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride;
+      *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride;
+    }
+
+    for( ; row < height; row++, dst += dstStride, src += srcStride )
+    {
+      _mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0);
+
+      __m128i
+        vsrc0 = N == 8 ? _mm_loadu_si128((const __m128i*) src) : _mm_loadl_epi64((const __m128i*) src);
+      vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh);
+
+      vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0);
+      if( N == 8 ) vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0);
+
+      vsrc0 = _mm_add_epi32 (vsrc0, voffset);
+      vsrc0 = _mm_srai_epi32(vsrc0, shift);
+
+      if (clip) { //clip
+        vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); // 16-bit compares: the bounds are 16-bit broadcasts, so 32-bit compares would never apply the upper clamp
+      }
+
+      *dst = _mm_cvtsi128_si32(vsrc0);
     }
-    
-    *dst = _mm_cvtsi128_si32(vsrc0);    dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride;
-    *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride;
   }
 }
 
-
 template<X86_VEXT vext, int N, bool shiftBack>
 static void simdInterpolateHorM8_AVX2( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
 {
@@ -605,11 +721,8 @@ static void simdInterpolateHorM8_AVX2( const int16_t* src, int srcStride, int16_
   __m256i vshuf1 = _mm256_set_epi8( 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4,
                                     0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4 );
 
-#if __INTEL_COMPILER
   __m256i vcoeff[4];
-#else
-  __m256i vcoeff[N/2];
-#endif
+
   for( int i=0; i<N; i+=2 )
   {
     vcoeff[i/2] = _mm256_unpacklo_epi16( _mm256_set1_epi16( coeff[i] ), _mm256_set1_epi16( coeff[i+1] ) );
@@ -716,11 +829,9 @@ static void simdInterpolateHorM16_AVX2( const int16_t* src, int srcStride, int16
                                     0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4, 0x5, 0x4, 0x3, 0x2, 0x3, 0x2, 0x1, 0x0 );
   __m256i vshuf1 = _mm256_set_epi8( 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4,
                                     0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4 );
-#if __INTEL_COMPILER
+
   __m256i vcoeff[4];
-#else
-  __m256i vcoeff[N/2];
-#endif
+
   for( int i=0; i<N; i+=2 )
   {
     vcoeff[i/2] = _mm256_unpacklo_epi16( _mm256_set1_epi16( coeff[i] ), _mm256_set1_epi16( coeff[i+1] ) );
@@ -836,7 +947,7 @@ static void simdInterpolateVerM4( const int16_t *src, int srcStride, int16_t *ds
   const int16_t *srcOrig = src;
   int16_t *dstOrig = dst;
 
-  __m128i vcoeff[N / 2], vsrc[N];
+  __m128i vcoeff[N], vsrc[N];
   __m128i vzero = _mm_setzero_si128();
   __m128i voffset = _mm_set1_epi32( offset );
   __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
@@ -918,7 +1029,7 @@ static void simdInterpolateVerM8( const int16_t *src, int srcStride, int16_t *ds
   const Pel* srcOrig = src;
   int16_t *dstOrig = dst;
 
-  __m128i vcoeff[N / 2], vsrc[N];
+  __m128i vcoeff[N], vsrc[N];
   __m128i vzero = _mm_setzero_si128();
   __m128i voffset = _mm_set1_epi32( offset );
   __m128i vibdimin = _mm_set1_epi16( clpRng.min() );
@@ -1012,7 +1123,8 @@ static void simdInterpolateVerM8_AVX2( const int16_t *src, int srcStride, int16_
 
   __m256i vsum;
   __m128i vsrc[N];
-  __m256i vcoeff[N/2];
+  __m256i vcoeff[N];
+
   for( int i=0; i<N; i+=2 )
   {
     vcoeff[i/2] = _mm256_unpacklo_epi16( _mm256_set1_epi16( coeff[i] ), _mm256_set1_epi16( coeff[i+1] ) );
@@ -1096,7 +1208,7 @@ static void simdInterpolateVerM16_AVX2( const int16_t *src, int srcStride, int16
   __m256i vsum, vsuma, vsumb;
 
   __m256i vsrc[N];
-  __m256i vcoeff[N/2];
+  __m256i vcoeff[N];
   for( int i=0; i<N; i+=2 )
   {
     vcoeff[i/2] = _mm256_unpacklo_epi16( _mm256_set1_epi16( coeff[i] ), _mm256_set1_epi16( coeff[i+1] ) );
@@ -1243,7 +1355,7 @@ static void simdInterpolateN2_10BIT_M4(const int16_t* src, int srcStride, int16_
 }
 
 template<X86_VEXT vext, int N, bool isVertical, bool isFirst, bool isLast>
-static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR)
+static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
 {
   int row, col;
 
@@ -1275,199 +1387,146 @@ static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel
   // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
   // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20
   CHECK( shift < 0, "Negative shift" );
+  
 
 #define USE_M16_AVX2_IF 1
 
-  if( isLast )
+  if( N != 2 )
   {
-    shift += ( isFirst ) ? 0 : headRoom;
-    offset = 1 << ( shift - 1 );
-    offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    if( isLast )
+    {
+      shift  += ( isFirst ) ? 0 : headRoom;
+      offset  = 1 << ( shift - 1 );
+      offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    }
+    else
+    {
+      shift -= ( isFirst ) ? headRoom : 0;
+      offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0;
+    }
   }
   else
-  {
-    shift -= ( isFirst ) ? headRoom : 0;
-    offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0;
-  }
-
-  if (biMCForDMVR)
   {
     if( isFirst )
     {
-      shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd);
+      shift  = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd);
       offset = 1 << (shift - 1);
     }
     else
     {
-      shift = 4;
+      shift  = 4;
       offset = 1 << (shift - 1);
     }
   }
-  if( clpRng.bd <= 10 )
-  {
-    if( N == 6 )
-    {
-      c[6] = coeff[6];
-      c[7] = coeff[7];
-      int src8tOff = cStride;
-
-      if( !( width & 7 ) )
-      {
-        if( !isVertical )
-        {
-          if( vext >= AVX2 )
-#if USE_M16_AVX2_IF
-            if( !( width & 15 ) )
-              simdInterpolateHorM16_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-            else
-#endif
-              simdInterpolateHorM8_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-          else
-            simdInterpolateHorM8<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-        }
-        else
-        {
-          if( vext >= AVX2 )
-#if USE_M16_AVX2_IF
-            if( !( width & 15 ) )
-              simdInterpolateVerM16_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-            else
-#endif
-              simdInterpolateVerM8_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-          else
-            simdInterpolateVerM8<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
-        }
-
-        return;
-      }
-      else if( !( width & 3 ) )
-      {
-        if( !isVertical )
-        {
-          simdInterpolateHorM4<vext, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-        }
-        else
-          simdInterpolateVerM4<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
 
-        return;
-      }
-      else if( width == 1 && !isVertical )
-      {
-        simdInterpolateHor_N8_singleCol<vext, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+  CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" );
 
-        return;
-      }
-      else if( width == 1 && isVertical )
-      {
-        // for vertical width of '1' filtering, use 8-tap functionality
-        src += ( N/2 - 1 ) * cStride;
-        simdFilter<vext, 8, true, isFirst, isLast>( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR );
-
-        return;
-      }
+  if( N == 6 )
+  {
+    c[6] = coeff[6];
+    c[7] = coeff[7];
+    int src8tOff = cStride;
 
-      THROW( "Unhandled case!" );
-    }
-    else if( N == 8 && !( width & 0x07 ) )
+    if( !( width & 7 ) )
     {
       if( !isVertical )
       {
-        if( vext>= AVX2 )
+        if( vext >= AVX2 )
 #if USE_M16_AVX2_IF
           if( !( width & 15 ) )
-            simdInterpolateHorM16_AVX2<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+            simdInterpolateHorM16_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
           else
 #endif
-          simdInterpolateHorM8_AVX2<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+            simdInterpolateHorM8_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
         else
-          simdInterpolateHorM8<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+          simdInterpolateHorM8<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
       }
       else
       {
-        if( vext>= AVX2 )
+        if( vext >= AVX2 )
 #if USE_M16_AVX2_IF
           if( !( width & 15 ) )
-            simdInterpolateVerM16_AVX2<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+            simdInterpolateVerM16_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
           else
 #endif
-          simdInterpolateVerM8_AVX2<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+            simdInterpolateVerM8_AVX2<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
         else
-          simdInterpolateVerM8<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+          simdInterpolateVerM8<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
       }
-      return;
     }
-    else if( N == 8 && !( width & 0x03 ) )
+    else if( !( width & 3 ) )
     {
       if( !isVertical )
       {
-        simdInterpolateHorM4<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+        simdInterpolateHorM4<vext, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
       }
       else
-        simdInterpolateVerM4<vext, 8, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-      return;
-    }
-    else if( N == 4 && !( width & 0x03 ) )
-    {
-      if( !isVertical )
-      {
-        if( ( width % 8 ) == 0 )
-        {
-          if( vext>= AVX2 )
-#if USE_M16_AVX2_IF
-            if( !( width & 15 ) )
-              simdInterpolateHorM16_AVX2<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-            else
-#endif
-            simdInterpolateHorM8_AVX2<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-          else
-            simdInterpolateHorM8<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-        }
-        else
-          simdInterpolateHorM4<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-      }
-      else
-      {
-        if( ( width % 8 ) == 0 )
-        {
-          if( vext >= AVX2 )
-#if USE_M16_AVX2_IF
-            if( !( width & 15 ) )
-              simdInterpolateVerM16_AVX2<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-            else
-#endif
-              simdInterpolateVerM8_AVX2 <vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-          else
-            simdInterpolateVerM8<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-        }
-        else
-          simdInterpolateVerM4<vext, 4, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-      }
-      return;
+        simdInterpolateVerM4<vext, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
     }
-    else if( biMCForDMVR )
+    else if( width == 1 && !isVertical )
     {
-      if( N == 2 && !( width & 0x03 ) )
-      {
-        simdInterpolateN2_10BIT_M4<vext, isLast>( src, srcStride, dst, dstStride, cStride, width, height, shift, offset, clpRng, c );
-        return;
-      }
+      simdInterpolateHorM1<vext, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
     }
-    else if( N == 2 )
+    else if( width == 1 && isVertical )
     {
-      THROW( "Should have already been handled!" );
+      c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; c[3] = c[4]; c[4] = c[5]; c[5] = coeff[6];
+      goto scalar_if;
     }
-    else if( N == 8 && width == 1 && ( height & 3 ) == 0 && !isVertical )
+
+    return;
+  }
+
+  if( !isVertical && N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
     {
-      simdInterpolateHor_N8_singleCol<vext, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-      return;
+      if( vext >= AVX2 )
+#if USE_M16_AVX2_IF
+        if( !( width & 15 ) )
+          simdInterpolateHorM16_AVX2<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+        else
+#endif
+          simdInterpolateHorM8_AVX2 <vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+      else
+        simdInterpolateHorM8<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
     }
-    else if( N == 4 && width == 1 && ( height & 3 ) == 0 && !isVertical )
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateHorM4<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateHorM2<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      simdInterpolateHorM1<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    return;
+  }
+  else if( N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
     {
-      simdInterpolateHor_N4_singleCol<vext, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
-      return;
+      if( vext >= AVX2 )
+#if USE_M16_AVX2_IF
+        if( !( width & 15 ) )
+          simdInterpolateVerM16_AVX2<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+        else
+#endif
+          simdInterpolateVerM8_AVX2 <vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+      else
+        simdInterpolateVerM8<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
     }
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateVerM4<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateVerM2<vext, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      goto scalar_if;
+    return;
+  }
+  else// if( N == 2 )
+  {
+    simdInterpolateN2_10BIT_M4<vext, isLast>( src, srcStride, dst, dstStride, cStride, width, height, shift, offset, clpRng, c );
+    return;
   }
 
+scalar_if:
   for( row = 0; row < height; row++ )
   {
     for( col = 0; col < width; col++ )
diff --git a/source/Lib/CommonLib/x86/MCTFX86.h b/source/Lib/CommonLib/x86/MCTFX86.h
index 94a463a39..8342909f3 100644
--- a/source/Lib/CommonLib/x86/MCTFX86.h
+++ b/source/Lib/CommonLib/x86/MCTFX86.h
@@ -1325,6 +1325,8 @@ void applyBlockSIMD( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const
     xvar = _mm_hadd_epi32( xvar, xvar );
     int64_t variance = _mm_cvtsi128_si32( xvar );
     int64_t diffsum  = _mm_extract_epi32( xvar, 1 );
+    variance <<= 2*(10-clpRng.bd);
+    diffsum <<= 2*(10-clpRng.bd);
 
     const int cntV = w * h;
     const int cntD = 2 * cntV - w - h;
diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
index f8ea2214b..c41ec7283 100644
--- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
+++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp
@@ -1092,7 +1092,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW
     m_alfCovarianceFrameCcAlf[compIdx - 1] = new AlfCovariance[numFilters];
     for (int k = 0; k < numFilters; k++)
     {
-      m_alfCovarianceFrameCcAlf[compIdx - 1][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, numBins);
+      m_alfCovarianceFrameCcAlf[compIdx - 1][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, 1);
     }
 
     m_alfCovarianceCcAlf[compIdx - 1] = new AlfCovariance *[numFilters];
@@ -1101,7 +1101,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW
       m_alfCovarianceCcAlf[compIdx - 1][j] = new AlfCovariance[m_numCTUsInPic];
       for (int k = 0; k < m_numCTUsInPic; k++)
       {
-        m_alfCovarianceCcAlf[compIdx - 1][j][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, numBins);
+        m_alfCovarianceCcAlf[compIdx - 1][j][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, 1);
       }
     }
   }
@@ -1965,7 +1965,6 @@ void EncAdaptiveLoopFilter::reconstructCTU( Picture& pic, CodingStructure& cs, c
 
 void EncAdaptiveLoopFilter::initEncProcess( Slice& slice )
 {
-  m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = false;
   if( isSkipAlfForFrame( *slice.pic ) )
   {
     return;
@@ -1973,8 +1972,8 @@ void EncAdaptiveLoopFilter::initEncProcess( Slice& slice )
 
   // NOTE: ALF is here enabled per default. However it can be disabled during filter derivation part.
   //       In line synchronized FPP mode, it cannot be disabled.
-  slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = true;
-  m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = slice.alfEnabled[COMP_Y] ? true : false;
+  slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = slice.sps->alfEnabled;
+  m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = slice.sps->ccalfEnabled;
 
   if( m_encCfg->m_fppLinesSynchro )
   {
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index 3b862f3da..29b558dcd 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -721,7 +721,7 @@ void EncCu::xCompressCU( CodingStructure*& tempCS, CodingStructure*& bestCS, Par
 
       bool isReuseCU = m_modeCtrl.isReusingCuValid( cs, partitioner, qp );
 
-      bool checkIbc = m_pcEncCfg->m_IBCMode && bestCS->picture->useScIBC && (partitioner.chType == CH_L);
+      bool checkIbc = m_pcEncCfg->m_IBCMode && bestCS->picture->useIBC && (partitioner.chType == CH_L);
       if ((m_pcEncCfg->m_IBCFastMethod>3) && (cs.area.lwidth() * cs.area.lheight()) > (16 * 16))
       {
         checkIbc = false;
@@ -1954,7 +1954,7 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC
       uiNumMrgSATDCand = (m_pcEncCfg->m_useFastMrg >= 2) ? (unsigned)candCostList.size() : uiNumMrgSATDCand;
       for( uint32_t i = 1; i < uiNumMrgSATDCand; i++ )
       {
-        if( candCostList[i] > MRG_FAST_RATIO[tempCS->picture->useScFastMrg] * candCostList[0] )
+        if( candCostList[i] > MRG_FAST_RATIO[tempCS->picture->useFastMrg] * candCostList[0] )
         {
           uiNumMrgSATDCand = i;
           break;
diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp
index 71d61e546..9823149aa 100644
--- a/source/Lib/EncoderLib/EncGOP.cpp
+++ b/source/Lib/EncoderLib/EncGOP.cpp
@@ -857,13 +857,40 @@ void EncGOP::xInitSPS(SPS &sps) const
   profileTierLevel->subProfileIdc.clear();
   profileTierLevel->subProfileIdc.push_back( m_pcEncCfg->m_subProfile );
 
-  sps.maxPicWidthInLumaSamples      = m_pcEncCfg->m_PadSourceWidth;
-  sps.maxPicHeightInLumaSamples     = m_pcEncCfg->m_PadSourceHeight;
-  sps.conformanceWindow.setWindow( m_pcEncCfg->m_confWinLeft, m_pcEncCfg->m_confWinRight, m_pcEncCfg->m_confWinTop, m_pcEncCfg->m_confWinBottom );
+  if( m_pcEncCfg->m_maxPicWidth != 0 && m_pcEncCfg->m_maxPicHeight != 0 )
+  {
+    const int minCuSize = std::max( 1 << ( vvenc::MIN_CU_LOG2 + 1 ), 1 << m_pcEncCfg->m_log2MinCodingBlockSize );
+    int padRight = 0, padBottom = 0;
+    if( m_pcEncCfg->m_maxPicWidth % minCuSize )
+    {
+      padRight = ( ( m_pcEncCfg->m_maxPicWidth / minCuSize) + 1 ) * minCuSize - m_pcEncCfg->m_maxPicWidth;
+    }
+    if( m_pcEncCfg->m_maxPicHeight % minCuSize )
+    {
+      padBottom = ( ( m_pcEncCfg->m_maxPicHeight / minCuSize) + 1 ) * minCuSize - m_pcEncCfg->m_maxPicHeight;
+    }
+    sps.maxPicWidthInLumaSamples      = m_pcEncCfg->m_maxPicWidth + padRight;
+    sps.maxPicHeightInLumaSamples     = m_pcEncCfg->m_maxPicHeight + padBottom;
+    
+    sps.conformanceWindow.setWindow( 0, padRight, 0, padBottom );
+  }
+  else
+  {
+    sps.maxPicWidthInLumaSamples      = m_pcEncCfg->m_PadSourceWidth;
+    sps.maxPicHeightInLumaSamples     = m_pcEncCfg->m_PadSourceHeight;
+    sps.conformanceWindow.setWindow( m_pcEncCfg->m_confWinLeft, m_pcEncCfg->m_confWinRight, m_pcEncCfg->m_confWinTop, m_pcEncCfg->m_confWinBottom );
+  }
   sps.chromaFormatIdc               = m_pcEncCfg->m_internChromaFormat;
   sps.CTUSize                       = m_pcEncCfg->m_CTUSize;
   sps.maxMTTDepth[0]                = m_pcEncCfg->m_maxMTTDepthI;
-  sps.maxMTTDepth[1]                = m_pcEncCfg->m_maxMTTDepth >= 10 ? 3 : m_pcEncCfg->m_maxMTTDepth;
+  int maxMTTDepthVal = m_pcEncCfg->m_maxMTTDepth;
+  int minMaxMttD = maxMTTDepthVal % 10;
+  while( maxMTTDepthVal )
+  {
+    minMaxMttD      = std::min( minMaxMttD, maxMTTDepthVal % 10 );
+    maxMTTDepthVal /= 10;
+  }
+  sps.maxMTTDepth[1]                = minMaxMttD;
   sps.maxMTTDepth[2]                = m_pcEncCfg->m_maxMTTDepthIChroma;
   for( int i = 0; i < 3; i++)
   {
@@ -1462,7 +1489,7 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod
     if( ( i == 1 ) && ( m_pcEncCfg->m_maxMTTDepth >= 10 ) )
     {
       slice->picHeader->maxMTTDepth[i]    = int( m_pcEncCfg->m_maxMTTDepth / pow( 10, sps.maxTLayers - slice->TLayer - 1 ) ) % 10;
-      slice->picHeader->splitConsOverride = true;
+      slice->picHeader->splitConsOverride = slice->picHeader->maxMTTDepth[i] != sps.maxMTTDepth[i];
     }
   }
 
@@ -1492,7 +1519,7 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod
   }
 
   const int maxTLayer  = m_pcEncCfg->m_picReordering && m_pcEncCfg->m_GOPSize > 1 ? vvenc::ceilLog2( m_pcEncCfg->m_GOPSize ) : 0;
-  const int numRefCode = pic.useScNumRefs ? m_pcEncCfg->m_numRefPicsSCC : m_pcEncCfg->m_numRefPics;
+  const int numRefCode = pic.useNumRefs ? m_pcEncCfg->m_numRefPicsSCC : m_pcEncCfg->m_numRefPics;
   const int tLayer     = slice->TLayer;
   const int numRefs    = numRefCode < 10 ? numRefCode : ( int( numRefCode / pow( 10, maxTLayer - tLayer ) ) % 10 );
 
@@ -1775,7 +1802,7 @@ void EncGOP::xInitLMCS( Picture& pic )
   Slice* slice = pic.cs->slice;
   const SliceType sliceType = slice->sliceType;
 
-  if( ! pic.useScLMCS || (!slice->isIntra() && m_disableLMCSIP) )
+  if( ! pic.useLMCS || (!slice->isIntra() && m_disableLMCSIP) )
   {
     pic.reshapeData.copyReshapeData( m_Reshaper );
     m_Reshaper.setCTUFlag     ( false );
diff --git a/source/Lib/EncoderLib/EncPicture.cpp b/source/Lib/EncoderLib/EncPicture.cpp
index 6ae96d16c..90e19aa71 100644
--- a/source/Lib/EncoderLib/EncPicture.cpp
+++ b/source/Lib/EncoderLib/EncPicture.cpp
@@ -93,7 +93,7 @@ void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder )
   pic.cs->createTempBuffers( true );
   pic.cs->initStructData( MAX_INT, false, nullptr );
 
-  if( pic.useScLMCS && m_pcEncCfg->m_reshapeSignalType == RESHAPE_SIGNAL_PQ && m_pcEncCfg->m_alf )
+  if( pic.useLMCS && m_pcEncCfg->m_reshapeSignalType == RESHAPE_SIGNAL_PQ && m_pcEncCfg->m_alf )
   {
     const double *weights = gopEncoder.getReshaper().getlumaLevelToWeightPLUT();
     auto& vec = m_ALF.getLumaLevelWeightTable();
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index aea699c8c..d3e3a4df2 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -313,7 +313,7 @@ void EncSlice::initPic( Picture* pic )
   }
 
   m_ctuEncDelay = 1;
-  if( pic->useScIBC )
+  if( pic->useIBC )
   {
     // IBC needs unfiltered samples up to max IBC search range
     // therefore ensure that numCtuDelayLUT CTU's have been enocded first
@@ -552,7 +552,7 @@ void EncSlice::compressSlice( Picture* pic )
     lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
     lnRsrc->m_CachedBvs         .resetIbcBvCand();
 
-    if( slice->sps->saoEnabled && pic->useScSAO )
+    if( slice->sps->saoEnabled && pic->useSAO )
     {
       lnRsrc->m_encSao          .initSlice( slice );
     }
@@ -711,7 +711,7 @@ void EncSlice::finishCompressSlice( Picture* pic, Slice& slice )
   CodingStructure& cs = *pic->cs;
 
   // finalize
-  if( slice.sps->saoEnabled && pic->useScSAO )
+  if( slice.sps->saoEnabled && pic->useSAO )
   {
     // store disabled statistics
     if( !m_pcEncCfg->m_numThreads )
@@ -740,7 +740,7 @@ void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const
     setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() );
   }
 
-  if( slice.sps->saoEnabled && pic->useScSAO )
+  if( slice.sps->saoEnabled && pic->useSAO )
   {
     // check SAO enabled or disabled
     EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat );
@@ -1073,7 +1073,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam )
         ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
 
         // SAO filter
-        if( slice.sps->saoEnabled && pic->useScSAO )
+        if( slice.sps->saoEnabled && pic->useSAO )
         {
           PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
           TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp
index fbc3e0654..9fa022ff3 100644
--- a/source/Lib/EncoderLib/InterSearch.cpp
+++ b/source/Lib/EncoderLib/InterSearch.cpp
@@ -261,10 +261,10 @@ void InterSearch::init( const VVEncCfg& encCfg, TrQuant* pTrQuant, RdCost* pRdCo
   }
   m_tmpStorageLCU.create( UnitArea( cform, Area( 0, 0, MAX_CU_SIZE, MAX_CU_SIZE ) ) );
   m_pTempPel = new Pel[ encCfg.m_CTUSize * encCfg.m_CTUSize ];
-  m_tmpAffiStorage.create(UnitArea(cform, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE)));
+  m_tmpAffiStorage.create(UnitArea(cform, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE + 2)));  // allow overread by 2 samples
   m_tmpAffiError = new Pel[MAX_CU_SIZE * MAX_CU_SIZE];
-  m_tmpAffiDeri[0] = new int[MAX_CU_SIZE * MAX_CU_SIZE];
-  m_tmpAffiDeri[1] = new int[MAX_CU_SIZE * MAX_CU_SIZE];
+  m_tmpAffiDeri[0] = new Pel[MAX_CU_SIZE * MAX_CU_SIZE];
+  m_tmpAffiDeri[1] = new Pel[MAX_CU_SIZE * MAX_CU_SIZE];
 
   CompArea chromaArea( COMP_Cb, cform, Area( 0, 0, encCfg.m_CTUSize, encCfg.m_CTUSize ), true );
   for( int i = 0; i < 4; i++ )
@@ -2197,7 +2197,7 @@ void InterSearch::xPatternSearchFast( const CodingUnit& cu,
                                       Mv&                   rcMv,
                                       Distortion&           ruiSAD )
 {
-  if( cu.cs->picture->useScME )
+  if( cu.cs->picture->useME )
   {
     switch ( m_motionEstimationSearchMethodSCC )
     {
@@ -3446,7 +3446,7 @@ void InterSearch::xEstimateInterResidualQT(CodingStructure &cs, Partitioner &par
   const uint32_t numTBlocks    = getNumberValidTBlocks   ( *cs.pcv );
   CodingUnit& cu               = *cs.getCU(partitioner.chType, partitioner.treeType);
   const unsigned currDepth = partitioner.currTrDepth;
-  const bool useTS = cs.picture->useScTS;
+  const bool useTS = cs.picture->useTS;
 
   bool bCheckFull  = !partitioner.canSplit( TU_MAX_TR_SPLIT, cs );
   if( cu.sbtInfo && partitioner.canSplit( CU::getSbtTuSplit( cu.sbtInfo ), cs ) )
@@ -5315,7 +5315,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
 
   int64_t  i64EqualCoeff[7][7];
   Pel    *piError = m_tmpAffiError;
-  int    *pdDerivate[2];
+  Pel    *pdDerivate[2];
   pdDerivate[0] = m_tmpAffiDeri[0];
   pdDerivate[1] = m_tmpAffiDeri[1];
 
@@ -5354,7 +5354,6 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
 
   ::memcpy(acMv, acMvTemp, sizeof(Mv) * 3);
 
-  const int bufStride = pBuf->Y().stride;
   const int predBufStride = predBuf.Y().stride;
   Mv prevIterMv[7][3];
   int iIterTime;
@@ -5379,23 +5378,13 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
     *                         use gradient to update mv
     *********************************************************************************/
     // get Error Matrix
-    const Pel* pOrg = pBuf->Y().buf;
-    Pel* pPred = predBuf.Y().buf;
-    for (int j = 0; j< height; j++)
-    {
-      for (int i = 0; i< width; i++)
-      {
-        piError[i + j * width] = pOrg[i] - pPred[i];
-      }
-      pOrg += bufStride;
-      pPred += predBufStride;
-    }
+    PelBuf( piError, width, height ).subtract( pBuf->Y(), predBuf.Y() );
 
     // sobel x direction
     // -1 0 1
     // -2 0 2
     // -1 0 1
-    pPred = predBuf.Y().buf;
+    Pel* pPred = predBuf.Y().buf;
     m_HorizontalSobelFilter(pPred, predBufStride, pdDerivate[0], width, width, height);
 
     // sobel y direction
@@ -5410,9 +5399,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu,
       memset(&i64EqualCoeff[row][0], 0, iParaNum * sizeof(int64_t));
     }
 
-    m_EqualCoeffComputer(piError, width, pdDerivate, width, i64EqualCoeff, width, height
-      , (cu.affineType == AFFINEMODEL_6PARAM)
-    );
+    m_EqualCoeffComputer[cu.affineType]( piError, width, pdDerivate, width, width, height, i64EqualCoeff );
 
     for (int row = 0; row < iParaNum; row++)
     {
diff --git a/source/Lib/EncoderLib/InterSearch.h b/source/Lib/EncoderLib/InterSearch.h
index 70012cf39..277408584 100644
--- a/source/Lib/EncoderLib/InterSearch.h
+++ b/source/Lib/EncoderLib/InterSearch.h
@@ -377,7 +377,7 @@ class InterSearch : public InterPrediction, AffineGradientSearch
   EncAffineMotion   m_affineMotion;
   PelStorage        m_tmpAffiStorage;
   Pel*              m_tmpAffiError;
-  int*              m_tmpAffiDeri[2];
+  Pel*              m_tmpAffiDeri[2];
   MotionInfo        m_subPuMiBuf[(MAX_CU_SIZE * MAX_CU_SIZE) >> (MIN_CU_LOG2 << 1)];
   // Misc.
   Pel*              m_pTempPel;
diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp
index ae7a57e23..ba10c2753 100644
--- a/source/Lib/EncoderLib/IntraSearch.cpp
+++ b/source/Lib/EncoderLib/IntraSearch.cpp
@@ -528,7 +528,7 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, d
   csBest->initStructData();
 
   int   bestLfnstIdx  = 0;
-  const bool useBDPCM = cs.picture->useScBDPCM;
+  const bool useBDPCM = cs.picture->useBDPCM;
   int   NumBDPCMCand  = (useBDPCM && sps.BDPCM && CU::bdpcmAllowed(cu, ComponentID(partitioner.chType))) ? 2 : 0;
   int   bestbdpcmMode = 0;
   int   bestISP       = 0;
@@ -729,7 +729,7 @@ void IntraSearch::estIntraPredChromaQT( CodingUnit& cu, Partitioner& partitioner
   PartSplit ispType     = lumaUsesISP ? CU::getISPType(cu, COMP_Y) : TU_NO_ISP;
   double bestCostSoFar  = maxCostAllowed;
   const uint32_t numberValidComponents = getNumberValidComponents( cu.chromaFormat );
-  const bool useBDPCM   = cs.picture->useScBDPCM;
+  const bool useBDPCM   = cs.picture->useBDPCM;
 
   uint32_t   uiBestMode = 0;
   Distortion uiBestDist = 0;
@@ -1514,7 +1514,7 @@ void IntraSearch::xIntraCodingLumaQT(CodingStructure& cs, Partitioner& partition
   double dSingleCost        = MAX_DOUBLE;
   int endLfnstIdx           = (partitioner.isSepTree(cs) && partitioner.chType == CH_C && (currArea.lwidth() < 8 || currArea.lheight() < 8))
                            || (currArea.lwidth() > sps.getMaxTbSize() || currArea.lheight() > sps.getMaxTbSize()) || !sps.LFNST || (numMode < 0) ? 0 : 2;
-  const bool useTS          = cs.picture->useScTS;
+  const bool useTS          = cs.picture->useTS;
   numMode                   = (numMode < 0) ? -numMode : numMode;
 
   if (cu.mipFlag && !allowLfnstWithMip(cu.lumaSize()))
@@ -1994,7 +1994,7 @@ ChromaCbfs IntraSearch::xIntraChromaCodingQT(CodingStructure& cs, Partitioner& p
   const CodingUnit& cu  = *cs.getCU( currArea.chromaPos(), CH_C, TREE_D );
   ChromaCbfs cbfs(false);
   uint32_t   currDepth = partitioner.currTrDepth;
-  const bool useTS = cs.picture->useScTS;
+  const bool useTS = cs.picture->useTS;
   if (currDepth == currTU.depth)
   {
     if (!currArea.Cb().valid() || !currArea.Cr().valid())
diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp
index 85c82a73b..cf8454bac 100644
--- a/source/Lib/EncoderLib/PreProcess.cpp
+++ b/source/Lib/EncoderLib/PreProcess.cpp
@@ -87,7 +87,7 @@ void PreProcess::init( const VVEncCfg& encCfg, bool isFinalPass )
   m_doTempDown  = m_encCfg->m_FirstPassMode == 2 || m_encCfg->m_FirstPassMode == 4;
   m_doVisAct    = m_encCfg->m_usePerceptQPA
                   || (m_encCfg->m_LookAhead && m_encCfg->m_RCTargetBitrate)
-                  || (m_encCfg->m_RCNumPasses > 1 && ((!isFinalPass) || (m_encCfg->m_FirstPassMode > 2)));
+                  || (m_encCfg->m_RCNumPasses > 1 && (!isFinalPass));
   m_doVisActQpa = m_encCfg->m_usePerceptQPA;
 
 
diff --git a/source/Lib/EncoderLib/RateCtrl.cpp b/source/Lib/EncoderLib/RateCtrl.cpp
index 61f992392..0f5e4afc8 100644
--- a/source/Lib/EncoderLib/RateCtrl.cpp
+++ b/source/Lib/EncoderLib/RateCtrl.cpp
@@ -178,7 +178,7 @@ void EncRCPic::destroy()
   encRCSeq = NULL;
 }
 
-void EncRCPic::clipTargetQP (std::list<EncRCPic*>& listPreviousPictures, const int baseQP, const int maxTL, const double resRatio, int &qp, int* qpAvg)
+void EncRCPic::clipTargetQP (std::list<EncRCPic*>& listPreviousPictures, const int baseQP, const int refrIncrFac, const int maxTL, const double resRatio, int &qp, int* qpAvg)
 {
   const int rShift = (resRatio < 0.03125 ? 12 : (resRatio < 0.125 ? 13 : (resRatio < 0.5 ? 14 : 15)));
   const int initQP = qp;
@@ -222,7 +222,7 @@ void EncRCPic::clipTargetQP (std::list<EncRCPic*>& listPreviousPictures, const i
   {
     const int clipRange = (refreshParams ? 5 + (encRCSeq->intraPeriod + (encRCSeq->gopSize >> 1)) / encRCSeq->gopSize : std::max (3, 6 - (frameLevel >> 1)));
 
-    qp = Clip3 (lastCurrTLQP - clipRange, std::min (MAX_QP, lastCurrTLQP + clipRange), qp);
+    qp = Clip3 (lastCurrTLQP - clipRange, std::min (MAX_QP, lastCurrTLQP + (refreshParams ? (refrIncrFac * clipRange) >> 1 : clipRange)), qp);
   }
   if (lastPrevTLQP >= 0) // prevent QP from being lower than QPs at lower temporal level
   {
@@ -353,6 +353,20 @@ int RateCtrl::getBaseQP()
     {
       sumFrBits += stats.numBits;
     }
+    if (m_pcEncCfg->m_usePerceptQPA && m_pcEncCfg->m_LookAhead) // account for very low visual activity
+    {
+      const double hpEnerPic = sqrt (32.0 * double (1 << (2 * encRCSeq->bitDepth - 10)) * sqrt (d));
+      uint32_t hpEner = 0;
+
+      for (auto& stats : firstPassData)
+      {
+        hpEner += stats.visActY;
+      }
+      if (hpEner > 0 && hpEner < hpEnerPic * firstPassData.size()) // similar to applyQPAdaptationSlice
+      {
+        sumFrBits = uint64_t (0.5 + sumFrBits * sqrt (hpEner / (hpEnerPic * firstPassData.size())));
+      }
+    }
     baseQP = int (24.5 - log (d) / log (2.0)); // QPstart, equivalent to round (24 + 2*log2 (resRatio))
     d = (double) m_pcEncCfg->m_RCTargetBitrate * (double) firstPassData.size() / (encRCSeq->frameRate * sumFrBits);
     d = firstPassBaseQP - (105.0 / 128.0) * sqrt ((double) std::max (1, firstPassBaseQP)) * log (d) / log (2.0);
@@ -475,6 +489,8 @@ void RateCtrl::storeStatsData( TRCPassStats statsData )
       CHECK( statsData.poc - srcData.poc >= m_pcEncCfg->m_GOPSize, "miss stats data from previous frame for temporal down-sampling" );
       statsData.qp        = srcData.qp;
       statsData.lambda    = srcData.lambda;
+      if( statsData.visActY == 0 && statsData.spVisAct == 0 )
+        statsData.spVisAct = srcData.spVisAct;
       if( statsData.visActY == 0 )
         statsData.visActY = srcData.visActY;
       statsData.numBits   = srcData.numBits;
@@ -682,6 +698,7 @@ void RateCtrl::adjustStatsDownsample()
       {
         value_gopcur += statValue;
         num_gopcur++;
+        doChangeBits = false;
         if (stat.gopNum != 0)
         {
           int var_cur = abs(statValue - meanValue);
@@ -689,20 +706,19 @@ void RateCtrl::adjustStatsDownsample()
           {
             doChangeBits = true;
           }
-          int rate1 = (((value_gopcur / num_gopcur) * 100) / meanValue);
-          int rate2 = (value_gopbefore == 0) ? 100 : (((value_gopcur / num_gopcur) * 100) / value_gopbefore);
-          if ((rate1 > 140) || (rate1 < 60)
-            || (rate2 > 140) || (rate2 < 60))
-          {
-            doChangeBits = true;
-          }
-          else if (doChangeBits)
+          else
           {
-            doChangeBits = false;
+            int rate1 = (((value_gopcur / num_gopcur) * 100) / meanValue);
+            int rate2 = (value_gopbefore == 0) ? 100 : (((value_gopcur / num_gopcur) * 100) / value_gopbefore);
+            if ((rate1 > 140) || (rate1 < 60)
+              || (rate2 > 140) || (rate2 < 60))
+            {
+              doChangeBits = true;
+            }
           }
         }
       }
-      if ((stat.gopNum != 0) && doChangeBits)
+      if ((stat.gopNum != 0) && doChangeBits && (stat.tempLayer > 1))
       {
         stat.numBits = (stat.numBits * 3) >> 1;
       }
@@ -833,7 +849,7 @@ double RateCtrl::getAverageBitsFromFirstPass()
     }
 
     totalBitsFirstPass = (2 * tlBits[0] + (tlCount[0] >> 1)) / std::max (1u, tlCount[0]) +
-        ((gopsInIp - l) * tlBits[1] + (tlCount[1] >> 1)) / std::max (1u, tlCount[1]);
+            ((gopsInIp - l) * tlBits[1] + (tlCount[1] >> 1)) / std::max (1u, tlCount[1]);
     for (l = 2; l <= 7; l++)
     {
       totalBitsFirstPass += ((gopsInIp << (l - 2)) * tlBits[l] + (tlCount[l] >> 1)) / std::max (1u, tlCount[l]);
@@ -927,6 +943,7 @@ void RateCtrl::processGops()
       vecIdx++;
     }
   }
+
   vecIdx = 0;
   fac = 1.0 / gopBits[vecIdx];
   gopTempVal[vecIdx] = 1.0f;
@@ -1160,34 +1177,17 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double&
         {
           const double sqrOfResRatio = double( m_pcEncCfg->m_SourceWidth * m_pcEncCfg->m_SourceHeight ) / ( 3840.0 * 2160.0 );
           const int firstPassSliceQP = it->qp;
-          const int secondPassBaseQP = ( m_pcEncCfg->m_LookAhead ? ( m_pcEncCfg->m_QP + getBaseQP() ) >> 1 : m_pcEncCfg->m_QP );
           const int budgetRelaxScale = ( encRCSeq->maxGopRate + 0.5 < 2.0 * (double)encRCSeq->targetRate * encRCSeq->gopSize / encRCSeq->frameRate ? 2 : 3 ); // quarters
           const bool isRateCapperMax = ( encRCSeq->maxGopRate + 0.5 >= 3.0 * (double)encRCSeq->targetRate * encRCSeq->gopSize / encRCSeq->frameRate );
           const bool isEndOfSequence = ( it->poc >= flushPOC && flushPOC >= 0 );
           const double dLimit = ( isRateCapperMax ? 3.0 : 0.5 * budgetRelaxScale + 0.5 );
           double d = (double)it->targetBits, tmpVal;
-          uint16_t visAct = it->visActY;
-
-          if ( it->isNewScene ) // spatiotemporal visual activity is transient at camera/scene change, find next steady-state activity
-          {
-            std::list<TRCPassStats>::iterator itNext = it;
 
-            itNext++;
-            while ( itNext != encRCSeq->firstPassData.end() && !itNext->isIntra )
-            {
-              if ( itNext->poc == it->poc + 2 )
-              {
-                visAct = itNext->visActY;
-                break;
-              }
-              itNext++;
-            }
-          }
-          encRcPic->visActSteady  = visAct; // TODO: try removing all visAct(Y) related code except for the one in detectSceneCuts()
+          encRcPic->visActSteady = it->visActY;
 
           if ( it->refreshParameters ) // reset counters for budget usage in subsequent frames
           {
-            encRCSeq->qpCorrection[ frameLevel ] = ( it->poc == 0 && d < it->numBits ? std::max( -1.0 * it->visActY / double( 1 << ( encRCSeq->bitDepth - 3 ) ), 1.0 - it->numBits / d ) : 0.0 );
+            encRCSeq->qpCorrection[ frameLevel ] = ( it->poc == 0 && it->isIntra && d < it->numBits ? std::max( -1.0 * it->visActY / double( 1 << ( encRCSeq->bitDepth - 3 ) ), 1.0 - it->numBits / d ) : 0.0 );
             if ( !m_pcEncCfg->m_LookAhead )
             {
               encRCSeq->actualBitCnt[ frameLevel ] = encRCSeq->targetBitCnt[ frameLevel ] = 0;
@@ -1270,7 +1270,7 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double&
           d = firstPassSliceQP - ( 105.0 / 128.0 ) * sqrt( (double)std::max( 1, firstPassSliceQP ) ) * log( d ) / log( 2.0 );
           sliceQP = int( 0.5 + d + 0.5 * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] );
 
-          encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : secondPassBaseQP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ),
+          encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : m_pcEncCfg->m_QP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ), 5 - budgetRelaxScale,
                                   ( it->poc < encRCSeq->gopSize ? 0 : ( m_pcEncCfg->m_maxTLayer + 1 ) >> 1 ), sqrOfResRatio, sliceQP, &encRCSeq->lastAverageQP );
           lambda = it->lambda * pow( 2.0, double( sliceQP - firstPassSliceQP ) / 3.0 );
           lambda = Clip3( encRCSeq->minEstLambda, encRCSeq->maxEstLambda, lambda );
diff --git a/source/Lib/EncoderLib/RateCtrl.h b/source/Lib/EncoderLib/RateCtrl.h
index 3ed7425f6..c870ebf57 100644
--- a/source/Lib/EncoderLib/RateCtrl.h
+++ b/source/Lib/EncoderLib/RateCtrl.h
@@ -142,7 +142,7 @@ namespace vvenc {
 
     void   create( EncRCSeq* encRCSeq, int frameLevel, int framePoc );
     void   destroy();
-    void   clipTargetQP (std::list<EncRCPic*>& listPreviousPictures, const int baseQP, const int maxTL, const double resRatio, int &qp, int* qpAvg);
+    void   clipTargetQP (std::list<EncRCPic*>& listPreviousPictures, const int baseQP, const int refrIncrFac, const int maxTL, const double resRatio, int &qp, int* qpAvg);
     void   updateAfterPicture (const int picActualBits, const int averageQP);
     void   addToPictureList( std::list<EncRCPic*>& listPreviousPictures );
 
diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp
index 26e2fe3ae..0db624682 100644
--- a/source/Lib/EncoderLib/VLCWriter.cpp
+++ b/source/Lib/EncoderLib/VLCWriter.cpp
@@ -1863,7 +1863,7 @@ void HLSWriter::codeSliceHeader( const Slice* slice )
     }
 
     //Write L1 related syntax elements
-      if (!slice->pps->rpl1IdxPresent && slice->pps->rpl1IdxPresent)
+      if (slice->sps->getNumRPL(1) > 1 && slice->pps->rpl1IdxPresent)
       {
         WRITE_FLAG(slice->rplIdx[1] != -1 ? 1 : 0, "ref_pic_list_sps_flag[1]");
       }
diff --git a/source/Lib/apputils/LogoRenderer.h b/source/Lib/apputils/LogoRenderer.h
index f09dc47ac..45fb562f0 100644
--- a/source/Lib/apputils/LogoRenderer.h
+++ b/source/Lib/apputils/LogoRenderer.h
@@ -166,7 +166,7 @@ class LogoRenderer
     if( m_bInitialized ){ uninit(); }
   }
   
-  int init( const std::string &fileName, vvencChromaFormat chromaFormat, int internalBitdepth, std::ostream& rcOstr )
+  int init( const std::string &fileName, vvencChromaFormat chromaFormat, int inputBitdepth, std::ostream& rcOstr )
   {
     if( m_bInitialized )
     { 
@@ -212,7 +212,13 @@ class LogoRenderer
     {
       rcOstr << "Logo input file error: invalid size " << m_cLogo.inputOpts.sourceWidth  << "x" << m_cLogo.inputOpts.sourceHeight << std::endl;
       return -1; 
-    } 
+    }
+    
+    if( inputBitdepth == 8 && m_cLogo.inputOpts.bitdepth == 10 ) // 10-bit logo on 8-bit input: scale the bgColorMin/bgColorMax bounds down to 8 bits
+    { // NOTE(review): ( v + 2 ) >> 2 yields 256 for v = 1022 or 1023, one above the 8-bit maximum of 255 - confirm no clamp is needed
+      m_cLogo.inputOpts.bgColorMin = ( m_cLogo.inputOpts.bgColorMin + 2) >> 2; // divide by 4 with rounding
+      m_cLogo.inputOpts.bgColorMax = ( m_cLogo.inputOpts.bgColorMax + 2) >> 2;      
+    }
        
     vvenc_YUVBuffer_default( &m_cYuvBufLogo );
     vvenc_YUVBuffer_alloc_buffer( &m_cYuvBufLogo, chromaFormat, m_cLogo.inputOpts.sourceWidth, m_cLogo.inputOpts.sourceHeight );
@@ -257,8 +263,9 @@ class LogoRenderer
 
     // read the logo int yuvBuffer
     bool is16bit       = m_cLogo.inputOpts.bitdepth > 8 ? true : false;
-    int  bitdepthShift = internalBitdepth  - m_cLogo.inputOpts.bitdepth;
-    const LPel maxVal = ( 1 << m_cLogo.inputOpts.bitdepth ) - 1;
+    int  bitdepthShift = inputBitdepth  - m_cLogo.inputOpts.bitdepth; // input bit depth minus logo bit depth; negative when the logo file is deeper
+    const LPel maxVal = ( 1 << inputBitdepth ) - 1; // largest value representable in inputBitdepth bits
+    
     for( int comp = 0; comp < 3; comp++ )
     {
       vvencYUVPlane yuvPlane = m_cYuvBufLogo.planes[ comp ];   
diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h
index 4c4f18409..0b63e095c 100644
--- a/source/Lib/apputils/VVEncAppCfg.h
+++ b/source/Lib/apputils/VVEncAppCfg.h
@@ -479,6 +479,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
   IStreamToEnum<vvencMsgLevel>      toMsgLevel                   ( &c->m_verbosity,   &MsgLevelToEnumMap );
   IStreamToFunc<vvencPresetMode>    toPreset                     ( setPresets, this, c, &PresetToEnumMap,vvencPresetMode::VVENC_MEDIUM);
   IStreamToRefVec<int>              toSourceSize                 ( { &c->m_SourceWidth, &c->m_SourceHeight }, true, 'x' );
+  IStreamToRefVec<int>              toMaxPicSize                 ( { &c->m_maxPicWidth, &c->m_maxPicHeight }, true, 'x' );
   IStreamToRefVec<int>              toFps                        ( { &c->m_FrameRate, &c->m_FrameScale }, false, '/' );
 
   IStreamToEnum<vvencProfile>       toProfile                    ( &c->m_profile,                     &ProfileToEnumMap      );
@@ -754,6 +755,10 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("VerticalPadding",                                 c->m_aiPad[1],                                       "Vertical source padding for conformance window mode 2")
     ("InputChromaFormat",                               toInputFileChromaFormat,                             "input file chroma format (400, 420, 422, 444)")
     ("PackedInput",                                     m_packedYUVInput,                                    "Enable 10-bit packed YUV input data ( pack 4 samples( 8-byte) into 5-bytes consecutively.")
+
+    ("MaxPicSize",                                      toMaxPicSize,                                        "Maximum resolution (maxWidth x maxHeight)")
+    ("MaxPicWidth",                                     c->m_maxPicWidth,                                    "Maximum picture width")
+    ("MaxPicHeight",                                    c->m_maxPicHeight,                                   "Maximum picture height")
     ;
 
     opts.setSubSection("Profile, Level, Tier");
@@ -1203,6 +1208,11 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
       err.warn( "Bitstream file" ) << cErr;
     }
 
+    if ( m_FrameSkip < 0 ) // a negative number of frames to skip is reported through err.error
+    {
+      err.error( "number of frames to skip" ) << (m_easyMode ? "frameskip must be >= 0\n" : "FrameSkip must be >= 0\n"); // option spelling in the message depends on m_easyMode
+    }
+
     // check for y4m input
     bool isY4m = ( m_forceY4mInput || apputils::FileIOHelper::isY4mInputFilename( m_inputFileName ) ) ? true : false;
     if( !isY4m && apputils::FileIOHelper::isY4mHeaderAvailable( m_inputFileName ) )
diff --git a/source/Lib/apputils/YuvFileIO.h b/source/Lib/apputils/YuvFileIO.h
index f8386ada5..a8a6267e2 100644
--- a/source/Lib/apputils/YuvFileIO.h
+++ b/source/Lib/apputils/YuvFileIO.h
@@ -102,6 +102,10 @@ class YuvFileIO
     m_fileBitdepth        = std::min<unsigned>( fileBitDepth, 16 );
     m_MSBExtendedBitDepth = MSBExtendedBitDepth;
     m_bitdepthShift       = internalBitDepth - m_MSBExtendedBitDepth;
+    if( internalBitDepth == 8 && fileBitDepth == 10 && MSBExtendedBitDepth == fileBitDepth ) // 10-bit file, 8-bit internal depth, MSB-extended depth equal to the file depth
+    {
+      m_bitdepthShift     = 0; // replaces the internalBitDepth - m_MSBExtendedBitDepth value (-2 in this case) assigned just above
+    } // NOTE(review): presumably the 10-to-8-bit reduction is then performed elsewhere - confirm
     m_fileChrFmt          = fileChrFmt;
     m_bufferChrFmt        = bufferChrFmt;
     m_clipToRec709        = clipToRec709;
@@ -143,7 +147,7 @@ class YuvFileIO
       if( !cLogoFilename.empty() )
       {
         std::stringstream strstr;
-        if ( 0 != m_cLogoRenderer.init( cLogoFilename, m_bufferChrFmt, internalBitDepth, strstr ) )
+        if ( 0 != m_cLogoRenderer.init( cLogoFilename, m_bufferChrFmt, fileBitDepth, strstr ) )
         {
           if( !strstr.str().empty() )
             m_lastError = strstr.str();
diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp
index 7291a50b1..2779c6e12 100644
--- a/source/Lib/vvenc/vvencCfg.cpp
+++ b/source/Lib/vvenc/vvencCfg.cpp
@@ -400,6 +400,9 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c )
   c->m_PadSourceWidth                          = 0;                                     ///< source width in pixel
   c->m_PadSourceHeight                         = 0;                                     ///< source height in pixel (when interlaced = field height)
 
+  c->m_maxPicWidth                             = 0;                                     ///< maximum picture width; defaults to 0
+  c->m_maxPicHeight                            = 0;                                     ///< maximum picture height; defaults to 0
+
   memset(&c->m_aiPad,0, sizeof(c->m_aiPad));                                    ///< number of padded pixels for width and height
   c->m_enablePictureHeaderInSliceHeader        = true;
   c->m_AccessUnitDelimiter                     = -1;                                    ///< add Access Unit Delimiter NAL units, default: auto (only enable if needed by dependent options)
@@ -1209,6 +1212,11 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
     c->m_craAPSreset            = true;
     c->m_rprRASLtoolSwitch      = true;
   }
+  
+  if( c->m_maxPicWidth > 0 && c->m_maxPicHeight > 0 ) // the check applies only when both maximum dimensions are positive
+  {
+    vvenc_confirmParameter( c, !c->m_rprEnabledFlag || !c->m_resChangeInClvsEnabled, "if a maxSize is set, both RPR and resChangeInClvsEnabled have to be enabled" );
+  }
 
   if( c->m_IntraPeriod == 0 && c->m_IntraPeriodSec > 0 )
   {  
diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp
index 43ee76992..2406a550e 100644
--- a/source/Lib/vvenc/vvencimpl.cpp
+++ b/source/Lib/vvenc/vvencimpl.cpp
@@ -338,7 +338,7 @@ int VVEncImpl::encode( vvencYUVBuffer* pcYUVBuffer, vvencAccessUnit* pcAccessUni
       }
     }
 
-    if ( ! xVerifyYUVBuffer( pcYUVBuffer ) )
+    if ( ! xConvertVerifyYUVBuffer( pcYUVBuffer ) )
     {     
       m_cErrorString = "InputPicture: Source image contains values outside the specified bit range";
       return VVENC_ERR_UNSPECIFIED;
@@ -557,10 +557,17 @@ int VVEncImpl::printSummary() const
   return 0;
 }
 
-bool VVEncImpl::xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer )
+bool VVEncImpl::xConvertVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer )
 {
   if( pcYUVBuffer == nullptr ){ return false; }
 
+  // 10-bit input for an 8-bit internal depth, with the MSB-extended depth equal to the
+  // input depth: in that case the samples are reduced to 8 bits in the loop further below.
+  bool conv8bit =
+       m_cVVEncCfg.m_internalBitDepth[0]    == 8  &&
+       m_cVVEncCfg.m_inputBitDepth[0]       == 10 &&
+       m_cVVEncCfg.m_MSBExtendedBitDepth[0] == m_cVVEncCfg.m_inputBitDepth[0];
+
   const int numComp  = (m_cVVEncCfg.m_internChromaFormat==VVENC_CHROMA_400) ? 1 : 3;
   const int16_t mask = ~( ( 1 << m_cVVEncCfg.m_internalBitDepth[0] ) - 1 );
   int dstSum = 0;
@@ -568,11 +575,26 @@ bool VVEncImpl::xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer )
   {
     vvencYUVPlane& plane = pcYUVBuffer->planes[ comp ];
     int16_t* dst     = plane.ptr;
-    for( int y = 0; y < plane.height; y++, dst += plane.stride )
+
+    if ( conv8bit )
     {
-      for( int x = 0; x < plane.width; x++ )
+      for( int y = 0; y < plane.height; y++, dst += plane.stride )
       {
-        dstSum |= dst[ x ] & mask;
+        for( int x = 0; x < plane.width; x++ )
+        {
+          dst[ x ] = (Pel)std::min<Pel>( 255, ( dst[x] + 2 ) >> 2 ); // in-place reduction: add 2, shift right by 2, cap at 255
+          dstSum |= dst[ x ] & mask; // collect any bits of the converted sample outside the low m_internalBitDepth[0] bits
+        }
+      }
+    }
+    else
+    {
+      for( int y = 0; y < plane.height; y++, dst += plane.stride )
+      {
+        for( int x = 0; x < plane.width; x++ )
+        {
+          dstSum |= dst[ x ] & mask;
+        }
       }
     }
   }
diff --git a/source/Lib/vvenc/vvencimpl.h b/source/Lib/vvenc/vvencimpl.h
index a98a62eb8..fbbba7363 100644
--- a/source/Lib/vvenc/vvencimpl.h
+++ b/source/Lib/vvenc/vvencimpl.h
@@ -130,7 +130,7 @@ class VVEncImpl
 private:
   int xGetAccessUnitsSize( const vvenc::AccessUnitList& rcAuList );
   int xCopyAu( vvencAccessUnit& rcAccessUnit, const AccessUnitList& rcAu );
-  bool xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer );
+  bool xConvertVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer );
 
 private:
   VVEncInternalState     m_eState               = INTERNAL_STATE_UNINITIALIZED;