diff --git a/Makefile b/Makefile index 5641208d6..d0da05c8e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -# VVCEnc/Makefile +# VVenC/Makefile # # How to build a single target: # make -r => build variant=release @@ -163,22 +163,22 @@ DEFAULT_BUILD_TARGETS_SHARED := $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),$(t) DEFAULT_BUILD_TARGETS := $(DEFAULT_BUILD_TARGETS_STATIC) $(DEFAULT_BUILD_TARGETS_SHARED) -release: $(BUILD_DIR-release) +release: $(BUILD_DIR-release)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) -debug: $(BUILD_DIR-debug) +debug: $(BUILD_DIR-debug)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) -relwithdebinfo: $(BUILD_DIR-relwithdebinfo) +relwithdebinfo: $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) -release-shared: $(BUILD_DIR-release-shared) +release-shared: $(BUILD_DIR-release-shared)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) -debug-shared: $(BUILD_DIR-debug-shared) +debug-shared: $(BUILD_DIR-debug-shared)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) -relwithdebinfo-shared: $(BUILD_DIR-relwithdebinfo-shared) +relwithdebinfo-shared: $(BUILD_DIR-relwithdebinfo-shared)/CMakeCache.txt cmake $(BUILD_OPTIONS-$@) $(BUILD_JOBS) $(BUILD_TOOL_OPTIONS) $(foreach t,$(DEFAULT_BUILD_TARGETS),clean-$(t)): @@ -204,37 +204,37 @@ install-relwithdebinfo-shared: relwithdebinfo-shared ifeq ($(CMAKE_MCONFIG),) -$(BUILD_DIR-release) configure-release: +$(BUILD_DIR-release)/CMakeCache.txt configure-release: cmake -S . -B $(BUILD_DIR-release) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Release -$(BUILD_DIR-debug) configure-debug: +$(BUILD_DIR-debug)/CMakeCache.txt configure-debug: cmake -S . -B $(BUILD_DIR-debug) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Debug -$(BUILD_DIR-relwithdebinfo) configure-relwithdebinfo: +$(BUILD_DIR-relwithdebinfo)/CMakeCache.txt configure-relwithdebinfo: cmake -S . -B $(BUILD_DIR-relwithdebinfo) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=RelWithDebInfo -$(BUILD_DIR-release-shared) configure-release-shared: +$(BUILD_DIR-release-shared)/CMakeCache.txt configure-release-shared: cmake -S . -B $(BUILD_DIR-release-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=1 -$(BUILD_DIR-debug-shared) configure-debug-shared: +$(BUILD_DIR-debug-shared)/CMakeCache.txt configure-debug-shared: cmake -S . -B $(BUILD_DIR-debug-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=1 -$(BUILD_DIR-relwithdebinfo-shared) configure-relwithdebinfo-shared: +$(BUILD_DIR-relwithdebinfo-shared)/CMakeCache.txt configure-relwithdebinfo-shared: cmake -S . -B $(BUILD_DIR-relwithdebinfo-shared) $(CONFIG_OPTIONS) -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_SHARED_LIBS=1 configure-static: $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t)) configure-shared: $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t)) else -$(BUILD_DIR_STATIC) configure-static $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t)): +$(BUILD_DIR_STATIC)/CMakeCache.txt configure-static $(foreach t,$(DEFAULT_BUILD_TARGETS_STATIC),configure-$(t)): cmake -S . -B $(BUILD_DIR_STATIC) $(CONFIG_OPTIONS) -$(BUILD_DIR_SHARED) configure-shared $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t)): +$(BUILD_DIR_SHARED)/CMakeCache.txt configure-shared $(foreach t,$(DEFAULT_BUILD_TARGETS_SHARED),configure-$(t)): cmake -S . -B $(BUILD_DIR_SHARED) $(CONFIG_OPTIONS) -DBUILD_SHARED_LIBS=1 endif static: $(DEFAULT_BUILD_TARGETS_STATIC) -shared: $(DEFAULT_BUILD_TARGETS_SHARED) +shared: $(DEFAULT_BUILD_TARGETS_SHARED) all: static shared @@ -303,26 +303,24 @@ TARGETS_RELEASE_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cr) TARGETS_DEBUG_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cd) TARGETS_RELWITHDEBINFO_CLEAN_FIRST := $(foreach t,$(TARGETS),$(t)-cp) -$(TARGETS_RELEASE): $(BUILD_DIR-release) +$(TARGETS_RELEASE): $(BUILD_DIR-release)/CMakeCache.txt cmake $(BUILD_OPTIONS-release) $(BUILD_JOBS) --target $(patsubst %-r,%,$@) $(BUILD_TOOL_OPTIONS) -$(TARGETS_RELEASE_CLEAN_FIRST): $(BUILD_DIR-release) +$(TARGETS_RELEASE_CLEAN_FIRST): $(BUILD_DIR-release)/CMakeCache.txt cmake $(BUILD_OPTIONS-release) $(BUILD_JOBS) --clean-first --target $(patsubst %-cr,%,$@) $(BUILD_TOOL_OPTIONS) -$(TARGETS_DEBUG): $(BUILD_DIR-debug) +$(TARGETS_DEBUG): $(BUILD_DIR-debug)/CMakeCache.txt cmake $(BUILD_OPTIONS-debug) $(BUILD_JOBS) --target $(patsubst %-d,%,$@) $(BUILD_TOOL_OPTIONS) -$(TARGETS_DEBUG_CLEAN_FIRST): $(BUILD_DIR-debug) +$(TARGETS_DEBUG_CLEAN_FIRST): $(BUILD_DIR-debug)/CMakeCache.txt cmake $(BUILD_OPTIONS-debug) $(BUILD_JOBS) --clean-first --target $(patsubst %-cd,%,$@) $(BUILD_TOOL_OPTIONS) -$(TARGETS_RELWITHDEBINFO): $(BUILD_DIR-relwithdebinfo) +$(TARGETS_RELWITHDEBINFO): $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt cmake $(BUILD_OPTIONS-relwithdebinfo) $(BUILD_JOBS) --target $(patsubst %-p,%,$@) $(BUILD_TOOL_OPTIONS) -$(TARGETS_RELWITHDEBINFO_CLEAN_FIRST): $(BUILD_DIR-relwithdebinfo) +$(TARGETS_RELWITHDEBINFO_CLEAN_FIRST): $(BUILD_DIR-relwithdebinfo)/CMakeCache.txt cmake $(BUILD_OPTIONS-relwithdebinfo) $(BUILD_JOBS) --clean-first --target $(patsubst %-cp,%,$@) $(BUILD_TOOL_OPTIONS) -.PHONY: install +.PHONY: install clean realclean distclean -ifeq ($(OS),Windows_NT) .NOTPARALLEL: -endif diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h index 9cc4506aa..16c3c5a58 100644 --- a/include/vvenc/vvencCfg.h +++ b/include/vvenc/vvencCfg.h @@ -498,8 +498,8 @@ typedef struct vvenc_config int m_cfgUnused4[ 7 ]; // TODO: remove unused memory from configuration int m_cfgUnused5[ 7 ]; int m_cfgUnused6; - int m_cfgUnused7; - int m_cfgUnused8; + int m_maxPicWidth; + int m_maxPicHeight; bool m_useSameChromaQPTables; vvencChromaQpMappingTableParams m_chromaQpMappingTableParams; diff --git a/source/App/vvencFFapp/EncApp.cpp b/source/App/vvencFFapp/EncApp.cpp index 74bf1b527..a6a449d45 100644 --- a/source/App/vvencFFapp/EncApp.cpp +++ b/source/App/vvencFFapp/EncApp.cpp @@ -305,6 +305,7 @@ int EncApp::encode() apputils::Stats cStats; int64_t frameCount = apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput ); + frameCount = std::max( 0, frameCount-appCfg.m_FrameSkip ); int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvencCfg.m_framesToBeEncoded; cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " ); bool statsInfoReady = false; diff --git a/source/App/vvencapp/vvencapp.cpp b/source/App/vvencapp/vvencapp.cpp index bf86041b6..19612ab1a 100644 --- a/source/App/vvencapp/vvencapp.cpp +++ b/source/App/vvencapp/vvencapp.cpp @@ -353,6 +353,7 @@ int main( int argc, char* argv[] ) } int64_t frameCount = apputils::VVEncAppCfg::getFrameCount( vvencappCfg.m_inputFileName, vvenccfg.m_SourceWidth, vvenccfg.m_SourceHeight, vvenccfg.m_inputBitDepth[0], vvencappCfg.m_packedYUVInput ); + frameCount = std::max( 0, frameCount-vvencappCfg.m_FrameSkip ); int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded; apputils::Stats cStats; diff --git a/source/Lib/CommonLib/AffineGradientSearch.cpp b/source/Lib/CommonLib/AffineGradientSearch.cpp index 435c62aa9..52acd1c1d 100644 --- a/source/Lib/CommonLib/AffineGradientSearch.cpp +++ b/source/Lib/CommonLib/AffineGradientSearch.cpp @@ -64,8 +64,9 @@ namespace vvenc { AffineGradientSearch::AffineGradientSearch() { m_HorizontalSobelFilter = xHorizontalSobelFilter; - m_VerticalSobelFilter = xVerticalSobelFilter; - m_EqualCoeffComputer = xEqualCoeffComputer; + m_VerticalSobelFilter = xVerticalSobelFilter; + m_EqualCoeffComputer[0] = xEqualCoeffComputer; + m_EqualCoeffComputer[1] = xEqualCoeffComputer; #if ENABLE_SIMD_OPT_AFFINE_ME #ifdef TARGET_SIMD_X86 @@ -74,7 +75,7 @@ namespace vvenc { #endif } - void AffineGradientSearch::xHorizontalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height) + void AffineGradientSearch::xHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height) { for (int j = 1; j < height - 1; j++) { @@ -106,7 +107,7 @@ namespace vvenc { } } - void AffineGradientSearch::xVerticalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height) + void AffineGradientSearch::xVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height) { for (int k = 1; k < width - 1; k++) { @@ -139,7 +140,8 @@ namespace vvenc { } } - void AffineGradientSearch::xEqualCoeffComputer(Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param) + template + void AffineGradientSearch::xEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]) { int affineParamNum = b6Param ? 6 : 4; diff --git a/source/Lib/CommonLib/AffineGradientSearch.h b/source/Lib/CommonLib/AffineGradientSearch.h index 60c7904d2..f06995157 100644 --- a/source/Lib/CommonLib/AffineGradientSearch.h +++ b/source/Lib/CommonLib/AffineGradientSearch.h @@ -56,13 +56,14 @@ namespace vvenc { class AffineGradientSearch { public: - void (*m_HorizontalSobelFilter) (Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height); - void (*m_VerticalSobelFilter) (Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height); - void (*m_EqualCoeffComputer) (Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param); - - static void xHorizontalSobelFilter( Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height); - static void xVerticalSobelFilter ( Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height); - static void xEqualCoeffComputer ( Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param); + void (*m_HorizontalSobelFilter) (Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height); + void (*m_VerticalSobelFilter) (Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height); + void (*m_EqualCoeffComputer[2]) (Pel* const pResi, const int resiStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]); + + static void xHorizontalSobelFilter( Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height); + static void xVerticalSobelFilter ( Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height); + template + static void xEqualCoeffComputer ( Pel* const pResi, const int resiStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]); AffineGradientSearch(); ~AffineGradientSearch() {} diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 738c4086f..c229d2329 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -1661,7 +1661,7 @@ DepQuant::~DepQuant() void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff& uiAbsSum, const QpParam& cQP, const Ctx& ctx ) { - if( tu.cs->picture->useScSelectiveRdoq && !xNeedRDOQ( tu, compID, pSrc, cQP ) ) + if( tu.cs->picture->useSelectiveRdoq && !xNeedRDOQ( tu, compID, pSrc, cQP ) ) { tu.lastPos[compID] = -1; uiAbsSum = 0; diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index c45ee0864..20b522582 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -356,7 +356,7 @@ void InterpolationFilter::filterCopy( const ClpRng& clpRng, const Pel* src, int // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template -void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR) +void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff) { int row, col; @@ -389,19 +389,21 @@ void InterpolationFilter::filter(const ClpRng& clpRng, Pel const *src, int srcSt // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 CHECK(shift < 0, "Negative shift"); - if ( isLast ) + if( N != 2 ) { - shift += (isFirst) ? 0 : headRoom; - offset = 1 << (shift - 1); - offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC; + if ( isLast ) + { + shift += (isFirst) ? 0 : headRoom; + offset = 1 << (shift - 1); + offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC; + } + else + { + shift -= (isFirst) ? headRoom : 0; + offset = (isFirst) ? -IF_INTERNAL_OFFS *(1< -void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR) +void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff) { if( N == 8 ) { - m_filterHor[0][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterHor[0][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 4 ) { - m_filterHor[1][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterHor[1][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 2 ) { - m_filterHor[2][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterHor[2][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 6 ) { - m_filterHor[3][1][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR ); + m_filterHor[3][1][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else { @@ -506,23 +509,23 @@ void InterpolationFilter::filterHor(const ClpRng& clpRng, Pel const *src, int sr * \param coeff Pointer to filter taps */ template -void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR) +void InterpolationFilter::filterVer(const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff) { if( N == 8 ) { - m_filterVer[0][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterVer[0][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 4 ) { - m_filterVer[1][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterVer[1][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 2 ) { - m_filterVer[2][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR); + m_filterVer[2][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else if( N == 6 ) { - m_filterVer[3][isFirst][isLast]( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR ); + m_filterVer[3][isFirst][isLast](clpRng, src, srcStride, dst, dstStride, width, height, coeff); } else{ THROW( "Invalid tap number" ); @@ -562,7 +565,7 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in CHECK( frac < 0 || frac >= LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" ); if( nFilterIdx == 1 ) { - filterHor(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilterPrec4[frac], biMCForDMVR); + filterHor(clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_bilinearFilterPrec4[frac]); } else { @@ -570,24 +573,24 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in { if( useAltHpelIf && frac == 8 ) { - filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaAltHpelIFilter, biMCForDMVR ); + filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaAltHpelIFilter ); } else if( ( width == 4 && height == 4 ) || ( width == 4 && height == ( 4 + NTAPS_LUMA - 1 ) ) ) { - filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac], biMCForDMVR ); + filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac] ); } else { - filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac], biMCForDMVR ); + filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac] ); } } else if( reduceTap == 1 ) { - filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac], biMCForDMVR ); + filterHor<6>( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter4x4[frac] ); } else { - filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << 1], biMCForDMVR ); + filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << 1] ); } } } @@ -595,7 +598,7 @@ void InterpolationFilter::filterHor(const ComponentID compID, Pel const *src, in { const uint32_t csx = getComponentScaleX( compID, fmt ); CHECK( frac < 0 || csx >= 2 || ( frac << ( 1 - csx ) ) >= CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" ); - filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << ( 1 - csx )], biMCForDMVR); + filterHor( clpRng, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac << ( 1 - csx )] ); } } @@ -627,7 +630,7 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in CHECK( frac < 0 || frac >= LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" ); if (nFilterIdx == 1) { - filterVer(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilterPrec4[frac], biMCForDMVR); + filterVer(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_bilinearFilterPrec4[frac]); } else { @@ -635,24 +638,24 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in { if( useAltHpelIf && frac == 8 ) { - filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaAltHpelIFilter, biMCForDMVR ); + filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaAltHpelIFilter ); } else if( width == 4 && height == 4 ) { - filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac], biMCForDMVR ); + filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac] ); } else { - filterVer( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac], biMCForDMVR ); + filterVer( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac] ); } } else if( reduceTap == 1 ) { - filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac], biMCForDMVR ); + filterVer<6>( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter4x4[frac] ); } else { - filterVer( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << 1], biMCForDMVR ); + filterVer( clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << 1] ); } } } @@ -660,7 +663,7 @@ void InterpolationFilter::filterVer(const ComponentID compID, Pel const *src, in { const uint32_t csy = getComponentScaleY( compID, fmt ); CHECK( frac < 0 || csy >= 2 || ( frac << ( 1 - csy ) ) >= CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS, "Invalid fraction" ); - filterVer(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << (1 - csy)], biMCForDMVR); + filterVer(clpRng, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac << (1 - csy)]); } } @@ -674,8 +677,8 @@ void InterpolationFilter::scalarFilterN2_2D( const ClpRng& clpRng, Pel const *sr { Pel *tmp = ( Pel* ) alloca( width * ( height + 1 ) * sizeof( Pel ) ); - filter<2, false, true, false>( clpRng, src, srcStride, tmp, width, width, height + 1, ch, true ); - filter<2, true , false, false>( clpRng, tmp, width, dst, dstStride, width, height, cv, true ); + filter<2, false, true, false>( clpRng, src, srcStride, tmp, width, width, height + 1, ch ); + filter<2, true , false, false>( clpRng, tmp, width, dst, dstStride, width, height, cv ); } void InterpolationFilter::filter4x4( const ComponentID compID, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng,bool useAltHpelIf/*= false*/,int nFilterIdx /*= 0*/ ) diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 6e2c09aee..6e3c83ee4 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -72,15 +72,15 @@ class InterpolationFilter static const TFilterCoeff m_bilinearFilterPrec4[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][NTAPS_BILINEAR]; ///< bilinear filter taps public: template - static void filterCopy(const ClpRng& clpRng, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool biMCForDMVR); + static void filterCopy(const ClpRng& clpRng, const Pel* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool biMCForDMVR); template - static void filter (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR); + static void filter (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff); template - void filterHor (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR); + void filterHor (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isLast, TFilterCoeff const *coeff); template - void filterVer (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff, bool biMCForDMVR); + void filterVer (const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width, int height, bool isFirst, bool isLast, TFilterCoeff const *coeff); template static void filterXxY_N2 (const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV); @@ -101,8 +101,8 @@ class InterpolationFilter ~InterpolationFilter() {} void( *m_filterN2_2D )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *ch, TFilterCoeff const *cv ); - void( *m_filterHor[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR); - void( *m_filterVer[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR); + void( *m_filterHor[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff); + void( *m_filterVer[4][2][2] )( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff); void( *m_filterCopy[2][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, bool biMCForDMVR); void( *m_filter4x4 [2][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV ); void( *m_filter8x8 [3][2] ) ( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV ); diff --git a/source/Lib/CommonLib/MCTF.cpp b/source/Lib/CommonLib/MCTF.cpp index 8c8329f22..40e10ef78 100644 --- a/source/Lib/CommonLib/MCTF.cpp +++ b/source/Lib/CommonLib/MCTF.cpp @@ -468,6 +468,8 @@ void applyBlockCore( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const } } } + variance <<= 2*(10-clpRng.bd); + diffsum <<= 2*(10-clpRng.bd); const int cntV = w * h; const int cntD = 2 * cntV - w - h; vnoise[i] = ( int ) round( ( 15.0 * cntD / cntV * variance + 5.0 ) / ( diffsum + 5.0 ) ); @@ -691,7 +693,7 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) int dropFramesFront = std::min( std::max( filterIdx - filterFrames, 0 ), dropFrames ); int dropFramesBack = std::min( std::max( static_cast( picFifo.size() ) - 1 - filterIdx - filterFrames, 0 ), dropFrames ); - if( !pic->useScMCTF && !pic->gopEntry->m_isStartOfGop ) + if( !pic->useMCTF && !pic->gopEntry->m_isStartOfGop ) { isFilterThisFrame = false; } @@ -749,7 +751,7 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) } // filter - if( pic->useScMCTF ) + if( pic->useMCTF ) { fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding ); bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength ); @@ -798,7 +800,7 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) if( distFactor[0] < 3 && distFactor[1] < 3 && ( m_encCfg->m_usePerceptQPA || pic->gopEntry->m_isStartOfGop ) ) { - const double bd12bScale = double (m_encCfg->m_internalBitDepth[CH_L] < 12 ? 1 << (12 - m_encCfg->m_internalBitDepth[CH_L]) : 1); + const double bd12bScale = double (m_encCfg->m_internalBitDepth[CH_L] < 12 ? 4 : 1); for( int i = 0; i < numCtu; i++ ) // start noise estimation with motion errors { @@ -825,7 +827,7 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) } pic->m_picShared->m_picMotEstError = uint16_t (0.5 + meanRmsAcrossPic / numCtu); - if( pic->gopEntry->m_isStartOfGop && !pic->useScMCTF && m_encCfg->m_vvencMCTF.MCTF > 0 && meanRmsAcrossPic > numCtu * 27.0 ) + if( pic->gopEntry->m_isStartOfGop && !pic->useMCTF && m_encCfg->m_vvencMCTF.MCTF > 0 && meanRmsAcrossPic > numCtu * 27.0 ) { // force filter fltrBuf.create( m_encCfg->m_internChromaFormat, m_area, 0, m_padding ); @@ -833,7 +835,7 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) } } - if( !m_encCfg->m_blockImportanceMapping || !pic->useScMCTF ) + if( !m_encCfg->m_blockImportanceMapping || !pic->useMCTF ) { CHECKD( !pic->m_picShared->m_ctuBimQpOffset.empty(), "BIM disabled, but offset vector not empty!" ); return; @@ -1001,7 +1003,7 @@ int MCTF::motionErrorLuma(const PelStorage &orig, } bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX, Array2D &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize, - const Array2D *previous, const int factor, const bool doubleRes, int blockY ) const + const Array2D *previous, const int factor, const bool doubleRes, int blockY, int bitDepth ) const { PROFILER_SCOPE_AND_STAGE( 1, _TPROF, P_MCTF_SEARCH ); @@ -1145,10 +1147,12 @@ bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX, const int w = std::min( blockSize, orig.Y().width - blockX ) & ~7; const int h = std::min( blockSize, orig.Y().height - blockY ) & ~7; - const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ); - const double mse = best.error / double( w * h ); + CHECKD(bitDepth>10, "unsupported internal bit depth (also in calcVar)" ); + const double bdScale = double(1<<(2*(10-bitDepth))); + const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ) * bdScale; + const double mse = best.error * bdScale / double( w * h ); - best.error = ( int ) ( 20 * ( ( best.error + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 ); + best.error = ( int ) ( 20 * ( ( best.error*bdScale + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 ); best.rmsme = uint16_t( 0.5 + sqrt( mse ) ); best.overlap = ( ( double ) w * h ) / ( m_mctfUnitSize * m_mctfUnitSize ); @@ -1162,6 +1166,7 @@ void MCTF::motionEstimationLuma(Array2D &mvs, const PelStorage &or { const int stepSize = blockSize; const int origHeight = orig.Y().height; + const int bitDepth = m_encCfg->m_internalBitDepth[CH_L]; if( m_threadPool ) { @@ -1177,6 +1182,7 @@ void MCTF::motionEstimationLuma(Array2D &mvs, const PelStorage &or int factor; bool doubleRes; int blockY; + int bitDepth; const MCTF* mctf; }; @@ -1190,7 +1196,7 @@ void MCTF::motionEstimationLuma(Array2D &mvs, const PelStorage &or { ITT_TASKSTART( itt_domain_MCTF_est, itt_handle_est ); - bool ret = params->mctf->estimateLumaLn( params->blockX, params->prevLineX, *params->mvs, *params->orig, *params->buffer, params->blockSize, params->previous, params->factor, params->doubleRes, params->blockY ); + bool ret = params->mctf->estimateLumaLn( params->blockX, params->prevLineX, *params->mvs, *params->orig, *params->buffer, params->blockSize, params->previous, params->factor, params->doubleRes, params->blockY, params->bitDepth ); ITT_TASKEND( itt_domain_MCTF_est, itt_handle_est ); return ret; @@ -1208,6 +1214,7 @@ void MCTF::motionEstimationLuma(Array2D &mvs, const PelStorage &or cEstParams.doubleRes = doubleRes; cEstParams.mctf = this; cEstParams.blockY = blockY; + cEstParams.bitDepth = bitDepth; m_threadPool->addBarrierTask( task, &cEstParams, &taskCounter); } @@ -1218,7 +1225,7 @@ void MCTF::motionEstimationLuma(Array2D &mvs, const PelStorage &or for( int blockY = 0; blockY + 7 <= origHeight; blockY += stepSize ) { std::atomic_int blockX( 0 ), prevBlockX( orig.Y().width + stepSize ); - estimateLumaLn( blockX, blockY ? &prevBlockX : nullptr, mvs, orig, buffer, blockSize, previous, factor, doubleRes, blockY ); + estimateLumaLn( blockX, blockY ? &prevBlockX : nullptr, mvs, orig, buffer, blockSize, previous, factor, doubleRes, blockY, bitDepth ); } } diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h index b93d473ae..cd60dae78 100644 --- a/source/Lib/CommonLib/MCTF.h +++ b/source/Lib/CommonLib/MCTF.h @@ -180,7 +180,7 @@ class MCTF : public EncStage int motionErrorLuma (const PelStorage &orig, const PelStorage &buffer, const int x, const int y, int dx, int dy, const int bs, const int besterror) const; bool estimateLumaLn ( std::atomic_int& blockX, std::atomic_int* prevLineX, Array2D &mvs, const PelStorage &orig, const PelStorage &buffer, const int blockSize, - const Array2D *previous, const int factor, const bool doubleRes, int blockY ) const; + const Array2D *previous, const int factor, const bool doubleRes, int blockY, int bitDepth ) const; void motionEstimationLuma(Array2D &mvs, const PelStorage &orig, const PelStorage &buffer, const int bs, const Array2D *previous=0, const int factor = 1, const bool doubleRes = false) const; diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp index 5791fedee..03718cb6d 100644 --- a/source/Lib/CommonLib/Picture.cpp +++ b/source/Lib/CommonLib/Picture.cpp @@ -182,15 +182,15 @@ Picture::Picture() , picSpVisAct ( 0 ) , isSccWeak ( false ) , isSccStrong ( false ) - , useScME ( false ) - , useScMCTF ( false ) - , useScTS ( false ) - , useScBDPCM ( false ) - , useScIBC ( false ) - , useScLMCS ( false ) - , useScSAO ( false ) - , useScNumRefs ( false ) - , useScFastMrg ( 0 ) + , useME ( false ) + , useMCTF ( false ) + , useTS ( false ) + , useBDPCM ( false ) + , useIBC ( false ) + , useLMCS ( false ) + , useSAO ( false ) + , useNumRefs ( false ) + , useFastMrg ( 0 ) , useQtbttSpeedUpMode( 0 ) , actualHeadBits ( 0 ) , actualTotalBits ( 0 ) @@ -385,16 +385,16 @@ void Picture::finalInit( const VPS& _vps, const SPS& sps, const PPS& pps, PicHea void Picture::setSccFlags( const VVEncCfg* encCfg ) { - useScME = encCfg->m_motionEstimationSearchMethodSCC > 0 && isSccStrong; - useScTS = encCfg->m_TS == 1 || ( encCfg->m_TS == 2 && isSccWeak ); - useScBDPCM = encCfg->m_useBDPCM == 1 || ( encCfg->m_useBDPCM == 2 && isSccWeak ); - useScMCTF = encCfg->m_vvencMCTF.MCTF == 1 || ( encCfg->m_vvencMCTF.MCTF == 2 && ! isSccStrong ); - useScLMCS = encCfg->m_lumaReshapeEnable == 1 || ( encCfg->m_lumaReshapeEnable == 2 && ! isSccStrong ); - useScIBC = encCfg->m_IBCMode == 1 || ( encCfg->m_IBCMode == 2 && isSccStrong ); - useScSAO = encCfg->m_bUseSAO && ( !encCfg->m_saoScc || isSccWeak ); - useScSelectiveRdoq = encCfg->m_useSelectiveRDOQ == 2 ? !isSccWeak : !!encCfg->m_useSelectiveRDOQ; - useScNumRefs = isSccStrong; - useScFastMrg = isSccStrong ? 0 : std::max(0, encCfg->m_useFastMrg - 2); + useME = encCfg->m_motionEstimationSearchMethodSCC > 0 && isSccStrong; + useTS = encCfg->m_TS == 1 || ( encCfg->m_TS == 2 && isSccWeak ); + useBDPCM = encCfg->m_useBDPCM == 1 || ( encCfg->m_useBDPCM == 2 && isSccWeak ); + useMCTF = encCfg->m_vvencMCTF.MCTF == 1 || ( encCfg->m_vvencMCTF.MCTF == 2 && ! isSccStrong ); + useLMCS = encCfg->m_lumaReshapeEnable == 1 || ( encCfg->m_lumaReshapeEnable == 2 && ! isSccStrong ); + useIBC = encCfg->m_IBCMode == 1 || ( encCfg->m_IBCMode == 2 && isSccStrong ); + useSAO = encCfg->m_bUseSAO && ( !encCfg->m_saoScc || isSccWeak ); + useSelectiveRdoq = encCfg->m_useSelectiveRDOQ == 2 ? !isSccWeak : !!encCfg->m_useSelectiveRDOQ; + useNumRefs = isSccStrong; + useFastMrg = isSccStrong ? 0 : std::max(0, encCfg->m_useFastMrg - 2); useQtbttSpeedUpMode = encCfg->m_qtbttSpeedUpMode; if( ( encCfg->m_qtbttSpeedUpMode & 2 ) && isSccStrong ) diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h index 97b7d763c..b72b74016 100644 --- a/source/Lib/CommonLib/Picture.h +++ b/source/Lib/CommonLib/Picture.h @@ -252,16 +252,16 @@ struct Picture : public UnitArea StopClock encTime; bool isSccWeak; bool isSccStrong; - bool useScME; - bool useScMCTF; - bool useScTS; - bool useScBDPCM; - bool useScIBC; - bool useScLMCS; - bool useScSAO; - bool useScNumRefs; - bool useScSelectiveRdoq; - int useScFastMrg; + bool useME; + bool useMCTF; + bool useTS; + bool useBDPCM; + bool useIBC; + bool useLMCS; + bool useSAO; + bool useNumRefs; + bool useSelectiveRdoq; + int useFastMrg; int useQtbttSpeedUpMode; int actualHeadBits; int actualTotalBits; diff --git a/source/Lib/CommonLib/QuantRDOQ.cpp b/source/Lib/CommonLib/QuantRDOQ.cpp index 8928a7894..89990244a 100644 --- a/source/Lib/CommonLib/QuantRDOQ.cpp +++ b/source/Lib/CommonLib/QuantRDOQ.cpp @@ -470,7 +470,7 @@ void QuantRDOQ::quant(TransformUnit& tu, const ComponentID compID, const CCoeffB if( useRDOQ ) { - if (!tu.cs->picture->useScSelectiveRdoq || xNeedRDOQ(tu, compID, piCoef, cQP)) + if (!tu.cs->picture->useSelectiveRdoq || xNeedRDOQ(tu, compID, piCoef, cQP)) { if( useTransformSkip ) { diff --git a/source/Lib/CommonLib/QuantRDOQ2.cpp b/source/Lib/CommonLib/QuantRDOQ2.cpp index c93871e68..d9a32a99f 100644 --- a/source/Lib/CommonLib/QuantRDOQ2.cpp +++ b/source/Lib/CommonLib/QuantRDOQ2.cpp @@ -270,7 +270,7 @@ void QuantRDOQ2::quant( TransformUnit &tu, const ComponentID compID, const CCoef if( useRDOQ ) { - if( !tu.cs->picture->useScSelectiveRdoq || xNeedRDOQ( tu, compID, piCoef, cQP ) ) + if( !tu.cs->picture->useSelectiveRdoq || xNeedRDOQ( tu, compID, piCoef, cQP ) ) { if( useTransformSkip ) { diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index 807b2463e..8c93bf987 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -753,7 +753,7 @@ class static_vector template iterator insert( const_iterator _pos, InputIt first, InputIt last ) { const difference_type numEl = last - first; - CHECKD( _size + numEl >= N, "capacity exceeded" ); + CHECKD( _size + numEl > N, "capacity exceeded" ); for( difference_type i = _size - 1; i >= _pos - _arr; i-- ) _arr[i + numEl] = _arr[i]; iterator it = _arr + ( _pos - _arr ); _size += numEl; iterator ret = it; while( first != last ) *it++ = *first++; @@ -761,7 +761,7 @@ class static_vector iterator insert( const_iterator _pos, size_t numEl, const T& val ) { //const difference_type numEl = last - first; - CHECKD( _size + numEl >= N, "capacity exceeded" ); + CHECKD( _size + numEl > N, "capacity exceeded" ); for( difference_type i = _size - 1; i >= _pos - _arr; i-- ) _arr[i + numEl] = _arr[i]; iterator it = _arr + ( _pos - _arr ); _size += numEl; iterator ret = it; for ( int k = 0; k < numEl; k++) *it++ = val; diff --git a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h index 2e9213500..6e77c7566 100644 --- a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h +++ b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h @@ -58,149 +58,121 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { -#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation) \ -{ \ -inter0 = _mm_mul_epi32(x1, y1); \ -inter1 = _mm_mul_epi32(tmp0, tmp2); \ -inter2 = _mm_mul_epi32(x2, y2); \ -inter3 = _mm_mul_epi32(tmp1, tmp3); \ -inter2 = _mm_add_epi64(inter0, inter2); \ -inter3 = _mm_add_epi64(inter1, inter3); \ -inter0 = _mm_loadl_epi64(loadLocation); \ -inter3 = _mm_add_epi64(inter2, inter3); \ -inter1 = _mm_srli_si128(inter3, 8); \ -inter3 = _mm_add_epi64(inter1, inter3); \ -inter3 = _mm_add_epi64(inter0, inter3); \ -} - template - static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height) + static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height) { - __m128i mmPred[4]; - __m128i mm2xPred[2]; - __m128i mmIntermediates[4]; - __m128i mmDerivate[2]; + CHECK( width % 8, "Invalid size!" ); - assert(!(height % 2)); - assert(!(width % 4)); - - /* Derivates of the rows and columns at the boundary are done at the end of this function */ - /* The value of col and row indicate the columns and rows for which the derivates have already been computed */ - for (int col = 1; (col + 2) < width; col += 2) - { - mmPred[0] = _mm_loadl_epi64(reinterpret_cast(&pPred[0 * predStride + col - 1])); - mmPred[1] = _mm_loadl_epi64(reinterpret_cast(&pPred[1 * predStride + col - 1])); + // pPred is 10-bit - mmPred[0] = _mm_cvtepi16_epi32(mmPred[0]); - mmPred[1] = _mm_cvtepi16_epi32(mmPred[1]); + // -1 0 1 + // -2 0 2 + // -1 0 1 + // + // sum( sobel ) = 8, i.e. 4-bit extension - for (int row = 1; row < (height - 1); row += 2) + for( int y = 1; y < ( height - 1 ); y++ ) + { + int x = 1; + for( ; x < ( width - 8 ); x += 8 ) { - mmPred[2] = _mm_loadl_epi64(reinterpret_cast(&pPred[(row + 1) * predStride + col - 1])); - mmPred[3] = _mm_loadl_epi64(reinterpret_cast(&pPred[(row + 2) * predStride + col - 1])); - - mmPred[2] = _mm_cvtepi16_epi32(mmPred[2]); - mmPred[3] = _mm_cvtepi16_epi32(mmPred[3]); - - mm2xPred[0] = _mm_slli_epi32(mmPred[1], 1); - mm2xPred[1] = _mm_slli_epi32(mmPred[2], 1); - - mmIntermediates[0] = _mm_add_epi32(mm2xPred[0], mmPred[0]); - mmIntermediates[2] = _mm_add_epi32(mm2xPred[1], mmPred[1]); - - mmIntermediates[0] = _mm_add_epi32(mmIntermediates[0], mmPred[2]); - mmIntermediates[2] = _mm_add_epi32(mmIntermediates[2], mmPred[3]); - - mmPred[0] = mmPred[2]; - mmPred[1] = mmPred[3]; - - mmIntermediates[1] = _mm_srli_si128(mmIntermediates[0], 8); - mmIntermediates[3] = _mm_srli_si128(mmIntermediates[2], 8); + __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] ); + acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc ); + acc = _mm_slli_epi16( acc, 1 ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) ); + + _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc ); + } - mmDerivate[0] = _mm_sub_epi32(mmIntermediates[1], mmIntermediates[0]); - mmDerivate[1] = _mm_sub_epi32(mmIntermediates[3], mmIntermediates[2]); + __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] ); + acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc ); + acc = _mm_slli_epi16( acc, 1 ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) ); - _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0]); - _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1]); - } - } + _mm_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc ); + _mm_storeu_si32 ( &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) ); - for (int j = 1; j < (height - 1); j++) - { - pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1]; - pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)]; + pDerivate[y * derivateBufStride] = pDerivate[y * derivateBufStride + 1]; + pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)]; } - memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0])); - memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0]) - ); + memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[ 0 ] ) ); + memcpy( pDerivate + ( height - 1 ) * derivateBufStride, pDerivate + ( height - 2 ) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) ); } template - static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, int *const pDerivate, const int derivateBufStride, const int width, const int height) + static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height) { - __m128i mmPred[4]; - __m128i mmIntermediates[6]; - __m128i mmDerivate[2]; + CHECK( width % 8, "Invalid size!" ); - assert(!(height % 2)); - assert(!(width % 4)); - - /* Derivates of the rows and columns at the boundary are done at the end of this function */ - /* The value of col and row indicate the columns and rows for which the derivates have already been computed */ - for (int col = 1; col < (width - 1); col += 2) - { - mmPred[0] = _mm_loadl_epi64(reinterpret_cast(&pPred[0 * predStride + col - 1])); - mmPred[1] = _mm_loadl_epi64(reinterpret_cast(&pPred[1 * predStride + col - 1])); + // pPred is 10-bit - mmPred[0] = _mm_cvtepi16_epi32(mmPred[0]); - mmPred[1] = _mm_cvtepi16_epi32(mmPred[1]); + // -1 -2 -1 + // 0 0 0 + // 1 2 1 + // + // sum( sobel ) = 8, i.e. 4-bit extension - for (int row = 1; row < (height - 1); row += 2) + for( int y = 1; y < ( height - 1 ); y++ ) + { + int x = 1; + for( ; x < ( width - 8 ); x += 8 ) { - mmPred[2] = _mm_loadl_epi64(reinterpret_cast(&pPred[(row + 1) * predStride + col - 1])); - mmPred[3] = _mm_loadl_epi64(reinterpret_cast(&pPred[(row + 2) * predStride + col - 1])); - - mmPred[2] = _mm_cvtepi16_epi32(mmPred[2]); - mmPred[3] = _mm_cvtepi16_epi32(mmPred[3]); - - mmIntermediates[0] = _mm_sub_epi32(mmPred[2], mmPred[0]); - mmIntermediates[3] = _mm_sub_epi32(mmPred[3], mmPred[1]); - - mmPred[0] = mmPred[2]; - mmPred[1] = mmPred[3]; - - mmIntermediates[1] = _mm_srli_si128(mmIntermediates[0], 4); - mmIntermediates[4] = _mm_srli_si128(mmIntermediates[3], 4); - mmIntermediates[2] = _mm_srli_si128(mmIntermediates[0], 8); - mmIntermediates[5] = _mm_srli_si128(mmIntermediates[3], 8); - - mmIntermediates[1] = _mm_slli_epi32(mmIntermediates[1], 1); - mmIntermediates[4] = _mm_slli_epi32(mmIntermediates[4], 1); - - mmIntermediates[0] = _mm_add_epi32(mmIntermediates[0], mmIntermediates[2]); - mmIntermediates[3] = _mm_add_epi32(mmIntermediates[3], mmIntermediates[5]); - - mmDerivate[0] = _mm_add_epi32(mmIntermediates[0], mmIntermediates[1]); - mmDerivate[1] = _mm_add_epi32(mmIntermediates[3], mmIntermediates[4]); - - _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0]); - _mm_storel_epi64(reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1]); + __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] ); + acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc ); + acc = _mm_slli_epi16( acc, 1 ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) ); + + _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc ); } + + __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] ); + acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc ); + acc = _mm_slli_epi16( acc, 1 ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) ); + acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) ); + acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) ); + + _mm_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc ); + _mm_storeu_si32 ( &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) ); + + pDerivate[y * derivateBufStride] = pDerivate[y * derivateBufStride + 1]; + pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)]; } - for (int j = 1; j < (height - 1); j++) - { - pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1]; - pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)]; - } - - memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0])); - memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0])); + memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[ 0 ] ) ); + memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) ); } - template - static void simdEqualCoeffComputer(Pel* pResidue, int residueStride, int **ppDerivate, int derivateBufStride, int64_t(*pEqualCoeff)[7], int width, int height, bool b6Param) + + +#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation) \ +{ \ +inter0 = _mm_mul_epi32(x1, y1); \ +inter1 = _mm_mul_epi32(tmp0, tmp2); \ +inter2 = _mm_mul_epi32(x2, y2); \ +inter3 = _mm_mul_epi32(tmp1, tmp3); \ +inter2 = _mm_add_epi64(inter0, inter2); \ +inter3 = _mm_add_epi64(inter1, inter3); \ +inter0 = _mm_loadl_epi64(loadLocation); \ +inter3 = _mm_add_epi64(inter2, inter3); \ +inter1 = _mm_srli_si128(inter3, 8); \ +inter3 = _mm_add_epi64(inter1, inter3); \ +inter3 = _mm_add_epi64(inter0, inter3); \ +} + + template + static void simdEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]) { __m128i mmFour; __m128i mmTmp[4]; @@ -210,14 +182,13 @@ inter3 = _mm_add_epi64(inter0, inter3); __m128i mmC[12]; // Add directly to indexes to get new index - mmFour = _mm_set1_epi32(4); + mmFour = _mm_set1_epi32(4); mmIndxJ = _mm_set1_epi32(-2); - int n = b6Param ? 6 : 4; - int idx1 = 0, idx2 = 0; - idx1 = -2 * derivateBufStride - 4; - idx2 = -derivateBufStride - 4; + static constexpr int n = b6Param ? 6 : 4; + int idx1 = -2 * derivateBufStride - 4; + int idx2 = - derivateBufStride - 4; for (int j = 0; j < height; j += 2) { @@ -236,16 +207,16 @@ inter3 = _mm_add_epi64(inter0, inter3); if (b6Param) { // mmC[0-5] for iC[0-5] of 1st row of pixels - mmC[0] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]); - mmC[2] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]); + mmC[0] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx1])); + mmC[2] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx1])); mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]); mmC[3] = _mm_mullo_epi32(mmIndxK, mmC[2]); mmC[4] = _mm_mullo_epi32(mmIndxJ, mmC[0]); mmC[5] = _mm_mullo_epi32(mmIndxJ, mmC[2]); // mmC[6-11] for iC[0-5] of 2nd row of pixels - mmC[6] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]); - mmC[8] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]); + mmC[6] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx2])); + mmC[8] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx2])); mmC[7] = _mm_mullo_epi32(mmIndxK, mmC[6]); mmC[9] = _mm_mullo_epi32(mmIndxK, mmC[8]); mmC[10] = _mm_mullo_epi32(mmIndxJ, mmC[6]); @@ -254,8 +225,8 @@ inter3 = _mm_add_epi64(inter0, inter3); else { // mmC[0-3] for iC[0-3] of 1st row of pixels - mmC[0] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]); - mmC[2] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]); + mmC[0] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx1])); + mmC[2] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx1])); mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]); mmC[3] = _mm_mullo_epi32(mmIndxJ, mmC[0]); mmTmp[0] = _mm_mullo_epi32(mmIndxJ, mmC[2]); @@ -264,8 +235,8 @@ inter3 = _mm_add_epi64(inter0, inter3); mmC[3] = _mm_sub_epi32(mmC[3], mmTmp[1]); // mmC[4-7] for iC[0-3] of 1st row of pixels - mmC[4] = _mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]); - mmC[6] = _mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]); + mmC[4] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[0][idx2])); + mmC[6] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&ppDerivate[1][idx2])); mmC[5] = _mm_mullo_epi32(mmIndxK, mmC[4]); mmC[7] = _mm_mullo_epi32(mmIndxJ, mmC[4]); mmTmp[2] = _mm_mullo_epi32(mmIndxJ, mmC[6]); @@ -311,13 +282,147 @@ inter3 = _mm_add_epi64(inter0, inter3); } } +#if USE_AVX2 + +#define CALC_EQUAL_COEFF_8PXLS_AVX2(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,res,loadLocation) \ +{ \ +inter0 = _mm256_mul_epi32(x1, y1); \ +inter1 = _mm256_mul_epi32(tmp0, tmp2); \ +inter2 = _mm256_mul_epi32(x2, y2); \ +inter3 = _mm256_mul_epi32(tmp1, tmp3); \ +inter2 = _mm256_add_epi64(inter0, inter2); \ +inter3 = _mm256_add_epi64(inter1, inter3); \ +res = _mm_loadl_epi64(loadLocation); \ +inter3 = _mm256_add_epi64(inter2, inter3); \ +inter1 = _mm256_srli_si256(inter3, 8); \ +inter3 = _mm256_add_epi64(inter1, inter3); \ +res = _mm_add_epi64(res, _mm256_castsi256_si128(inter3)); \ +res = _mm_add_epi64(res, _mm256_extracti128_si256(inter3, 1)); \ +} + + template + static void simdEqualCoeffComputer_avx2(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7]) + { + __m256i mmFour; + __m256i mmTmp[4]; + __m256i mmIntermediate[4]; + __m256i mmIndxK, mmIndxJ; + __m256i mmResidue[2]; + __m256i mmC[12]; + __m128i mmRes; + + // Add directly to indexes to get new index + mmFour = _mm256_set1_epi32(4); + mmIndxJ = _mm256_set1_epi32(-2); + + static constexpr int n = b6Param ? 6 : 4; + int idx1 = -2 * derivateBufStride - 8; + int idx2 = - derivateBufStride - 8; + + for (int j = 0; j < height; j += 2) + { + if (!(j & 3)) + mmIndxJ = _mm256_add_epi32(mmIndxJ, mmFour); + mmIndxK = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( -6 ) ), _mm_set1_epi32( -2 ), 1 ); + idx1 += (derivateBufStride << 1); + idx2 += (derivateBufStride << 1); + + for (int k = 0; k < width; k += 8) + { + idx1 += 8; + idx2 += 8; + mmIndxK = _mm256_add_epi32(mmIndxK, mmFour); + mmIndxK = _mm256_add_epi32(mmIndxK, mmFour); + + if (b6Param) + { + // mmC[0-5] for iC[0-5] of 1st row of pixels + mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1])); + mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1])); + mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]); + mmC[3] = _mm256_mullo_epi32(mmIndxK, mmC[2]); + mmC[4] = _mm256_mullo_epi32(mmIndxJ, mmC[0]); + mmC[5] = _mm256_mullo_epi32(mmIndxJ, mmC[2]); + + // mmC[6-11] for iC[0-5] of 2nd row of pixels + mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2])); + mmC[8] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2])); + mmC[7] = _mm256_mullo_epi32(mmIndxK, mmC[6]); + mmC[9] = _mm256_mullo_epi32(mmIndxK, mmC[8]); + mmC[10] = _mm256_mullo_epi32(mmIndxJ, mmC[6]); + mmC[11] = _mm256_mullo_epi32(mmIndxJ, mmC[8]); + } + else + { + // mmC[0-3] for iC[0-3] of 1st row of pixels + mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1])); + mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1])); + mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]); + mmC[3] = _mm256_mullo_epi32(mmIndxJ, mmC[0]); + mmTmp[0] = _mm256_mullo_epi32(mmIndxJ, mmC[2]); + mmTmp[1] = _mm256_mullo_epi32(mmIndxK, mmC[2]); + mmC[1] = _mm256_add_epi32(mmC[1], mmTmp[0]); + mmC[3] = _mm256_sub_epi32(mmC[3], mmTmp[1]); + + // mmC[4-7] for iC[0-3] of 1st row of pixels + mmC[4] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2])); + mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2])); + mmC[5] = _mm256_mullo_epi32(mmIndxK, mmC[4]); + mmC[7] = _mm256_mullo_epi32(mmIndxJ, mmC[4]); + mmTmp[2] = _mm256_mullo_epi32(mmIndxJ, mmC[6]); + mmTmp[3] = _mm256_mullo_epi32(mmIndxK, mmC[6]); + mmC[5] = _mm256_add_epi32(mmC[5], mmTmp[2]); + mmC[7] = _mm256_sub_epi32(mmC[7], mmTmp[3]); + } + + // Residue + mmResidue[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[idx1])); + mmResidue[1] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[idx2])); + mmResidue[0] = _mm256_slli_epi32(mmResidue[0], 3); + mmResidue[1] = _mm256_slli_epi32(mmResidue[1], 3); + + // Calculation of coefficient matrix + for (int col = 0; col < n; col++) + { + mmTmp[0] = _mm256_srli_si256(mmC[0 + col], 4); + mmTmp[1] = _mm256_srli_si256(mmC[n + col], 4); + CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][col]); + _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmRes); + + for (int row = col + 1; row < n; row++) + { + mmTmp[2] = _mm256_srli_si256(mmC[0 + row], 4); + mmTmp[3] = _mm256_srli_si256(mmC[n + row], 4); + CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][row]); + _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmRes); + _mm_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmRes); + } + + mmTmp[2] = _mm256_srli_si256(mmResidue[0], 4); + mmTmp[3] = _mm256_srli_si256(mmResidue[1], 4); + CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][n]); + _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmRes); + } + } + + idx1 -= (width); + idx2 -= (width); + } + } +#endif template void AffineGradientSearch::_initAffineGradientSearchX86() { m_HorizontalSobelFilter = simdHorizontalSobelFilter; - m_VerticalSobelFilter = simdVerticalSobelFilter; - m_EqualCoeffComputer = simdEqualCoeffComputer; + m_VerticalSobelFilter = simdVerticalSobelFilter; +#if USE_AVX2 + m_EqualCoeffComputer[0] = simdEqualCoeffComputer_avx2; + m_EqualCoeffComputer[1] = simdEqualCoeffComputer_avx2; +#else + m_EqualCoeffComputer[0] = simdEqualCoeffComputer; + m_EqualCoeffComputer[1] = simdEqualCoeffComputer; +#endif } template void AffineGradientSearch::_initAffineGradientSearchX86(); diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h index 8cc7f7efe..0ac280152 100644 --- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h +++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h @@ -290,6 +290,103 @@ static void simdFilterCopy( const ClpRng& clpRng, const Pel* src, int srcStride, } + +// SIMD interpolation horizontal, block width modulo 2 +template +static void simdInterpolateHorM2( const int16_t* src, ptrdiff_t srcStride, int16_t *dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff ) +{ + CHECKD( N != 4, "Only allowing w=2 filtering for chroma blocks using 4-tap IF" ); + + _mm_prefetch( (const char*) src + srcStride, _MM_HINT_T0 ); + + const __m128i voffset = _mm_set1_epi32( offset ); + const __m128i vibdimin = _mm_set1_epi16( clpRng.min() ); + const __m128i vibdimax = _mm_set1_epi16( clpRng.max() ); + const __m128i vzero = _mm_setzero_si128(); + const __m128i vcoeffh = _mm_set1_epi64x( *( int64_t const* ) coeff ); + + __m128i vsum, vsrc, vsrc0, vsrc1; + + for( int row = 0; row < height; row++ ) + { + _mm_prefetch( (const char*)src + 2 * srcStride, _MM_HINT_T0 ); + + vsrc0 = _mm_loadl_epi64( ( __m128i const* )&src[0] ); + vsrc1 = _mm_loadl_epi64( ( __m128i const* )&src[1] ); + vsrc = _mm_unpacklo_epi64( vsrc0, vsrc1 ); + + vsum = _mm_madd_epi16( vsrc, vcoeffh ); + vsum = _mm_hadd_epi32( vsum, vsum ); + + vsum = _mm_add_epi32 ( vsum, voffset ); + vsum = _mm_srai_epi32 ( vsum, shift ); + vsum = _mm_packs_epi32( vsum, vzero ); + + if( shiftBack ) + { //clip + vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) ); + } + _mm_storeu_si32( ( __m128i * )&dst[0], vsum ); + + src += srcStride; + dst += dstStride; + } +} + + +template +static void simdInterpolateVerM2( const int16_t* src, ptrdiff_t srcStride, int16_t* dst, ptrdiff_t dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff ) +{ + CHECKD( N != 4, "Only allowing w=2 filtering for chroma blocks using 4-tap IF" ); + + _mm_prefetch( ( const char* ) &src[0 * srcStride], _MM_HINT_T0 ); + _mm_prefetch( ( const char* ) &src[1 * srcStride], _MM_HINT_T0 ); + _mm_prefetch( ( const char* ) &src[2 * srcStride], _MM_HINT_T0 ); + _mm_prefetch( ( const char* ) &src[3 * srcStride], _MM_HINT_T0 ); + + const __m128i vcoeffv = _mm_set1_epi64x( *( int64_t const* ) coeff ); + const __m128i vzero = _mm_setzero_si128(); + const __m128i voffset = _mm_set1_epi32( offset ); + const __m128i vibdimin = _mm_set1_epi16( clpRng.min() ); + const __m128i vibdimax = _mm_set1_epi16( clpRng.max() ); + const __m128i vshuff = _mm_set_epi8( 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); + + __m128i vsrc, vnl, vsum, vtmp; + + const ptrdiff_t nextLine = srcStride * ( N - 1 ); + + vsrc = _mm_setr_epi16( src[0], src[1], src[1 * srcStride], src[1 * srcStride + 1], src[2 * srcStride], src[2 * srcStride + 1], 0, 0 ); + + for( int row = 0; row < height; row++ ) + { + _mm_prefetch( ( const char* ) &src[( N + 0 ) * srcStride], _MM_HINT_T0 ); + _mm_prefetch( ( const char* ) &src[( N + 1 ) * srcStride], _MM_HINT_T0 ); + + vnl = _mm_setr_epi16 ( src[nextLine], src[nextLine + 1], 0, 0, 0, 0, 0, 0 ); + vnl = _mm_slli_si128 ( vnl, 12 ); + vsrc = _mm_or_si128 ( vsrc, vnl ); + vtmp = _mm_shuffle_epi8 ( vsrc, vshuff ); + vsum = _mm_madd_epi16 ( vtmp, vcoeffv ); + vsum = _mm_hadd_epi32 ( vsum, vzero ); + vsrc = _mm_srli_si128 ( vsrc, 4 ); + + vsum = _mm_add_epi32 ( vsum, voffset ); + vsum = _mm_srai_epi32 ( vsum, shift ); + vsum = _mm_packs_epi32 ( vsum, vzero ); + + if( shiftBack ) //clip + { + vsum = _mm_min_epi16 ( vibdimax, _mm_max_epi16( vibdimin, vsum ) ); + } + + _mm_storeu_si32( (__m128i*) &dst[0], vsum ); + + src += srcStride; + dst += dstStride; + } +} + + // SIMD interpolation horizontal, block width modulo 4 template static void simdInterpolateHorM4( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff ) @@ -376,11 +473,8 @@ static void simdInterpolateHorM8( const int16_t* src, int srcStride, int16_t *ds __m128i vshuf0 = _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4, 0x5, 0x4, 0x3, 0x2, 0x3, 0x2, 0x1, 0x0 ); __m128i vshuf1 = _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4 ); -#if __INTEL_COMPILER __m128i vcoeff[4]; -#else - __m128i vcoeff[N/2]; -#endif + for( int i=0; i -static void simdInterpolateHor_N8_singleCol(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff) + +template +static void simdInterpolateHorM1(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff) { CHECKD( width != 1, "Width needs to be '1'!" ); cond_mm_prefetch((const char*)src, _MM_HINT_T0); cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0); - __m128i vcoeffh = _mm_loadu_si128((__m128i const*)coeff); - __m128i voffset = _mm_set1_epi32(offset); - __m128i vibdimin = _mm_set1_epi16(clpRng.min()); - __m128i vibdimax = _mm_set1_epi16(clpRng.max()); + if( N == 4 ) + { + cond_mm_prefetch((const char*)src, _MM_HINT_T0); + cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0); - int row = 0; + __m128i vcoeffh = _mm_loadl_epi64((__m128i const*)coeff); + vcoeffh = _mm_unpacklo_epi64(vcoeffh, vcoeffh); + __m128i voffset = _mm_set1_epi32(offset); + __m128i vibdimin = _mm_set1_epi16(clpRng.min()); + __m128i vibdimax = _mm_set1_epi16(clpRng.max()); - for (; row < ( height - 3 ); row += 4) - { - cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); + int row = 0; - __m128i - vsrc0 = _mm_loadu_si128((__m128i const*) src); src += srcStride; - vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh); - - __m128i - vsrc1 = _mm_loadu_si128((__m128i const*) src); src += srcStride; - vsrc1 = _mm_madd_epi16 (vsrc1, vcoeffh); - - __m128i - vsrc2 = _mm_loadu_si128((__m128i const*) src); src += srcStride; - vsrc2 = _mm_madd_epi16 (vsrc2, vcoeffh); - - __m128i - vsrc3 = _mm_loadu_si128((__m128i const*) src); src += srcStride; - vsrc3 = _mm_madd_epi16 (vsrc3, vcoeffh); + for( ; row < ( height - 3 ); row += 4 ) + { + cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); - vsrc0 = _mm_hadd_epi32(vsrc0, vsrc1); - vsrc2 = _mm_hadd_epi32(vsrc2, vsrc3); - vsrc0 = _mm_hadd_epi32(vsrc0, vsrc2); + __m128i + vsrc0 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; - vsrc0 = _mm_add_epi32 (vsrc0, voffset); - vsrc0 = _mm_srai_epi32(vsrc0, shift); + __m128i + vsrc1 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; - if (clip) { //clip - vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); - } - - *dst = _mm_cvtsi128_si32(vsrc0); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride; - } + vsrc1 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc0, vsrc1), vcoeffh); - for( ; row < height; row++, dst += dstStride, src += srcStride ) - { - int - sum = src[0] * coeff[0]; - sum += src[1] * coeff[1]; - sum += src[2] * coeff[2]; - sum += src[3] * coeff[3]; - sum += src[4] * coeff[4]; - sum += src[5] * coeff[5]; - sum += src[6] * coeff[6]; - sum += src[7] * coeff[7]; + __m128i + vsrc2 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; + + __m128i + vsrc3 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; + + vsrc3 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc2, vsrc3), vcoeffh); + + vsrc0 = _mm_hadd_epi32(vsrc1, vsrc3); + + vsrc0 = _mm_add_epi32 (vsrc0, voffset); + vsrc0 = _mm_srai_epi32(vsrc0, shift); - Pel val = ( sum + offset ) >> shift; + if (clip) { //clip + vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); + } + + *dst = _mm_cvtsi128_si32(vsrc0); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride; + } - if( clip ) + for( ; row < height; row++, dst += dstStride, src += srcStride ) { - val = ClipPel( val, clpRng ); + cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); + + __m128i + vsrc0 = _mm_loadl_epi64((__m128i const*) src); + vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh); + vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0); + + vsrc0 = _mm_add_epi32 (vsrc0, voffset); + vsrc0 = _mm_srai_epi32(vsrc0, shift); + + if (clip) { //clip + vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); + } + + *dst = _mm_cvtsi128_si32(vsrc0); } - *dst = val; } -} + else + { + CHECKD( N != 8, "N has to 8" ); -template -static void simdInterpolateHor_N4_singleCol(const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const* coeff) -{ - CHECKD( width != 1 || ( height & 3 ), "Windth needs to be '1'!" ); + __m128i vcoeffh = _mm_loadu_si128((__m128i const*)coeff); + __m128i voffset = _mm_set1_epi32(offset); + __m128i vibdimin = _mm_set1_epi16(clpRng.min()); + __m128i vibdimax = _mm_set1_epi16(clpRng.max()); - cond_mm_prefetch((const char*)src, _MM_HINT_T0); - cond_mm_prefetch((const char*)src + srcStride, _MM_HINT_T0); + int row = 0; - __m128i vcoeffh = _mm_loadl_epi64((__m128i const*)coeff); - vcoeffh = _mm_unpacklo_epi64(vcoeffh, vcoeffh); - __m128i voffset = _mm_set1_epi32(offset); - __m128i vibdimin = _mm_set1_epi16(clpRng.min()); - __m128i vibdimax = _mm_set1_epi16(clpRng.max()); + for( ; row < ( height - 3 ); row += 4 ) + { + cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); - for (int row = 0; row < height; row += 4) - { - cond_mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); + __m128i + vsrc0 = _mm_loadu_si128((__m128i const*) src); src += srcStride; + vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh); - __m128i - vsrc0 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; - - __m128i - vsrc1 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; + __m128i + vsrc1 = _mm_loadu_si128((__m128i const*) src); src += srcStride; + vsrc1 = _mm_madd_epi16 (vsrc1, vcoeffh); - vsrc1 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc0, vsrc1), vcoeffh); - - __m128i - vsrc2 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; - - __m128i - vsrc3 = _mm_loadl_epi64((__m128i const*) src); src += srcStride; + __m128i + vsrc2 = _mm_loadu_si128((__m128i const*) src); src += srcStride; + vsrc2 = _mm_madd_epi16 (vsrc2, vcoeffh); + + __m128i + vsrc3 = _mm_loadu_si128((__m128i const*) src); src += srcStride; + vsrc3 = _mm_madd_epi16 (vsrc3, vcoeffh); - vsrc3 = _mm_madd_epi16 (_mm_unpacklo_epi64(vsrc2, vsrc3), vcoeffh); + vsrc0 = _mm_hadd_epi32(vsrc0, vsrc1); + vsrc2 = _mm_hadd_epi32(vsrc2, vsrc3); + vsrc0 = _mm_hadd_epi32(vsrc0, vsrc2); - vsrc0 = _mm_hadd_epi32(vsrc1, vsrc3); + vsrc0 = _mm_add_epi32 (vsrc0, voffset); + vsrc0 = _mm_srai_epi32(vsrc0, shift); - vsrc0 = _mm_add_epi32 (vsrc0, voffset); - vsrc0 = _mm_srai_epi32(vsrc0, shift); + if (clip) { //clip + vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); + } - if (shiftBack) { //clip - vsrc0 = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, vsrc0)); + *dst = _mm_cvtsi128_si32(vsrc0); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride; + *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride; + } + + for( ; row < height; row++, dst += dstStride, src += srcStride ) + { + _mm_prefetch((const char*)src + 2 * srcStride, _MM_HINT_T0); + + __m128i + vsrc0 = N == 8 ? _mm_loadu_si128((const __m128i*) src) : _mm_loadl_epi64((const __m128i*) src); + vsrc0 = _mm_madd_epi16 (vsrc0, vcoeffh); + + vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0); + if( N == 8 ) vsrc0 = _mm_hadd_epi32(vsrc0, vsrc0); + + vsrc0 = _mm_add_epi32 (vsrc0, voffset); + vsrc0 = _mm_srai_epi32(vsrc0, shift); + + if (clip) { //clip + vsrc0 = _mm_min_epi32(vibdimax, _mm_max_epi32(vibdimin, vsrc0)); + } + + *dst = _mm_cvtsi128_si32(vsrc0); } - - *dst = _mm_cvtsi128_si32(vsrc0); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 1); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 2); dst += dstStride; - *dst = _mm_extract_epi32(vsrc0, 3); dst += dstStride; } } - template static void simdInterpolateHorM8_AVX2( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff ) { @@ -605,11 +721,8 @@ static void simdInterpolateHorM8_AVX2( const int16_t* src, int srcStride, int16_ __m256i vshuf1 = _mm256_set_epi8( 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4, 0xd, 0xc, 0xb, 0xa, 0xb, 0xa, 0x9, 0x8, 0x9, 0x8, 0x7, 0x6, 0x7, 0x6, 0x5, 0x4 ); -#if __INTEL_COMPILER __m256i vcoeff[4]; -#else - __m256i vcoeff[N/2]; -#endif + for( int i=0; i -static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff, bool biMCForDMVR) +static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff ) { int row, col; @@ -1275,199 +1387,146 @@ static void simdFilter( const ClpRng& clpRng, Pel const *src, int srcStride, Pel // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 CHECK( shift < 0, "Negative shift" ); + #define USE_M16_AVX2_IF 1 - if( isLast ) + if( N != 2 ) { - shift += ( isFirst ) ? 0 : headRoom; - offset = 1 << ( shift - 1 ); - offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC; + if( isLast ) + { + shift += ( isFirst ) ? 0 : headRoom; + offset = 1 << ( shift - 1 ); + offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC; + } + else + { + shift -= ( isFirst ) ? headRoom : 0; + offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0; + } } else - { - shift -= ( isFirst ) ? headRoom : 0; - offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0; - } - - if (biMCForDMVR) { if( isFirst ) { - shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); + shift = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd); offset = 1 << (shift - 1); } else { - shift = 4; + shift = 4; offset = 1 << (shift - 1); } } - if( clpRng.bd <= 10 ) - { - if( N == 6 ) - { - c[6] = coeff[6]; - c[7] = coeff[7]; - int src8tOff = cStride; - - if( !( width & 7 ) ) - { - if( !isVertical ) - { - if( vext >= AVX2 ) -#if USE_M16_AVX2_IF - if( !( width & 15 ) ) - simdInterpolateHorM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - else -#endif - simdInterpolateHorM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - else - simdInterpolateHorM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - } - else - { - if( vext >= AVX2 ) -#if USE_M16_AVX2_IF - if( !( width & 15 ) ) - simdInterpolateVerM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - else -#endif - simdInterpolateVerM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - else - simdInterpolateVerM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - } - - return; - } - else if( !( width & 3 ) ) - { - if( !isVertical ) - { - simdInterpolateHorM4( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - } - else - simdInterpolateVerM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); - return; - } - else if( width == 1 && !isVertical ) - { - simdInterpolateHor_N8_singleCol( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" ); - return; - } - else if( width == 1 && isVertical ) - { - // for vertical width of '1' filtering, use 8-tap functionality - src += ( N/2 - 1 ) * cStride; - simdFilter( clpRng, src, srcStride, dst, dstStride, width, height, coeff, biMCForDMVR ); - - return; - } + if( N == 6 ) + { + c[6] = coeff[6]; + c[7] = coeff[7]; + int src8tOff = cStride; - THROW( "Unhandled case!" ); - } - else if( N == 8 && !( width & 0x07 ) ) + if( !( width & 7 ) ) { if( !isVertical ) { - if( vext>= AVX2 ) + if( vext >= AVX2 ) #if USE_M16_AVX2_IF if( !( width & 15 ) ) - simdInterpolateHorM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateHorM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); else #endif - simdInterpolateHorM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateHorM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); else - simdInterpolateHorM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateHorM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); } else { - if( vext>= AVX2 ) + if( vext >= AVX2 ) #if USE_M16_AVX2_IF if( !( width & 15 ) ) - simdInterpolateVerM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateVerM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); else #endif - simdInterpolateVerM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateVerM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); else - simdInterpolateVerM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateVerM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); } - return; } - else if( N == 8 && !( width & 0x03 ) ) + else if( !( width & 3 ) ) { if( !isVertical ) { - simdInterpolateHorM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + simdInterpolateHorM4( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); } else - simdInterpolateVerM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - return; - } - else if( N == 4 && !( width & 0x03 ) ) - { - if( !isVertical ) - { - if( ( width % 8 ) == 0 ) - { - if( vext>= AVX2 ) -#if USE_M16_AVX2_IF - if( !( width & 15 ) ) - simdInterpolateHorM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - else -#endif - simdInterpolateHorM8_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - else - simdInterpolateHorM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - } - else - simdInterpolateHorM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - } - else - { - if( ( width % 8 ) == 0 ) - { - if( vext >= AVX2 ) -#if USE_M16_AVX2_IF - if( !( width & 15 ) ) - simdInterpolateVerM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - else -#endif - simdInterpolateVerM8_AVX2 ( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - else - simdInterpolateVerM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - } - else - simdInterpolateVerM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - } - return; + simdInterpolateVerM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 ); } - else if( biMCForDMVR ) + else if( width == 1 && !isVertical ) { - if( N == 2 && !( width & 0x03 ) ) - { - simdInterpolateN2_10BIT_M4( src, srcStride, dst, dstStride, cStride, width, height, shift, offset, clpRng, c ); - return; - } + simdInterpolateHorM1( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); } - else if( N == 2 ) + else if( width == 1 && isVertical ) { - THROW( "Should have already been handled!" ); + c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; c[3] = c[4]; c[4] = c[5]; c[5] = coeff[6]; + goto scalar_if; } - else if( N == 8 && width == 1 && ( height & 3 ) == 0 && !isVertical ) + + return; + } + + if( !isVertical && N != 2 ) + { + if( ( width & 7 ) == 0 ) { - simdInterpolateHor_N8_singleCol( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - return; + if( vext >= AVX2 ) +#if USE_M16_AVX2_IF + if( !( width & 15 ) ) + simdInterpolateHorM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else +#endif + simdInterpolateHorM8_AVX2 ( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else + simdInterpolateHorM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); } - else if( N == 4 && width == 1 && ( height & 3 ) == 0 && !isVertical ) + else if( ( width & 3 ) == 0 ) + simdInterpolateHorM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else if( ( width & 1 ) == 0 ) + simdInterpolateHorM2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else + simdInterpolateHorM1( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + return; + } + else if( N != 2 ) + { + if( ( width & 7 ) == 0 ) { - simdInterpolateHor_N4_singleCol( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); - return; + if( vext >= AVX2 ) +#if USE_M16_AVX2_IF + if( !( width & 15 ) ) + simdInterpolateVerM16_AVX2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else +#endif + simdInterpolateVerM8_AVX2 ( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else + simdInterpolateVerM8( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); } + else if( ( width & 3 ) == 0 ) + simdInterpolateVerM4( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else if( ( width & 1 ) == 0 ) + simdInterpolateVerM2( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c ); + else + goto scalar_if; + return; + } + else// if( N == 2 ) + { + simdInterpolateN2_10BIT_M4( src, srcStride, dst, dstStride, cStride, width, height, shift, offset, clpRng, c ); + return; } +scalar_if: for( row = 0; row < height; row++ ) { for( col = 0; col < width; col++ ) diff --git a/source/Lib/CommonLib/x86/MCTFX86.h b/source/Lib/CommonLib/x86/MCTFX86.h index 94a463a39..8342909f3 100644 --- a/source/Lib/CommonLib/x86/MCTFX86.h +++ b/source/Lib/CommonLib/x86/MCTFX86.h @@ -1325,6 +1325,8 @@ void applyBlockSIMD( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const xvar = _mm_hadd_epi32( xvar, xvar ); int64_t variance = _mm_cvtsi128_si32( xvar ); int64_t diffsum = _mm_extract_epi32( xvar, 1 ); + variance <<= 2*(10-clpRng.bd); + diffsum <<= 2*(10-clpRng.bd); const int cntV = w * h; const int cntD = 2 * cntV - w - h; diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index f8ea2214b..c41ec7283 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -1092,7 +1092,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW m_alfCovarianceFrameCcAlf[compIdx - 1] = new AlfCovariance[numFilters]; for (int k = 0; k < numFilters; k++) { - m_alfCovarianceFrameCcAlf[compIdx - 1][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, numBins); + m_alfCovarianceFrameCcAlf[compIdx - 1][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, 1); } m_alfCovarianceCcAlf[compIdx - 1] = new AlfCovariance *[numFilters]; @@ -1101,7 +1101,7 @@ void EncAdaptiveLoopFilter::init( const VVEncCfg& encCfg, const PPS& pps, CABACW m_alfCovarianceCcAlf[compIdx - 1][j] = new AlfCovariance[m_numCTUsInPic]; for (int k = 0; k < m_numCTUsInPic; k++) { - m_alfCovarianceCcAlf[compIdx - 1][j][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, numBins); + m_alfCovarianceCcAlf[compIdx - 1][j][k].create(m_filterShapesCcAlf[compIdx - 1].numCoeff, 1); } } } @@ -1965,7 +1965,6 @@ void EncAdaptiveLoopFilter::reconstructCTU( Picture& pic, CodingStructure& cs, c void EncAdaptiveLoopFilter::initEncProcess( Slice& slice ) { - m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = false; if( isSkipAlfForFrame( *slice.pic ) ) { return; @@ -1973,8 +1972,8 @@ void EncAdaptiveLoopFilter::initEncProcess( Slice& slice ) // NOTE: ALF is here enabled per default. However it can be disabled during filter derivation part. // In line synchronized FPP mode, it cannot be disabled. - slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = true; - m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = slice.alfEnabled[COMP_Y] ? true : false; + slice.alfEnabled[COMP_Y] = slice.alfEnabled[COMP_Cb] = slice.alfEnabled[COMP_Cr] = slice.sps->alfEnabled; + m_ccAlfFilterParam.ccAlfFilterEnabled[0] = m_ccAlfFilterParam.ccAlfFilterEnabled[1] = slice.sps->ccalfEnabled; if( m_encCfg->m_fppLinesSynchro ) { diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index 3b862f3da..29b558dcd 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -721,7 +721,7 @@ void EncCu::xCompressCU( CodingStructure*& tempCS, CodingStructure*& bestCS, Par bool isReuseCU = m_modeCtrl.isReusingCuValid( cs, partitioner, qp ); - bool checkIbc = m_pcEncCfg->m_IBCMode && bestCS->picture->useScIBC && (partitioner.chType == CH_L); + bool checkIbc = m_pcEncCfg->m_IBCMode && bestCS->picture->useIBC && (partitioner.chType == CH_L); if ((m_pcEncCfg->m_IBCFastMethod>3) && (cs.area.lwidth() * cs.area.lheight()) > (16 * 16)) { checkIbc = false; @@ -1954,7 +1954,7 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC uiNumMrgSATDCand = (m_pcEncCfg->m_useFastMrg >= 2) ? (unsigned)candCostList.size() : uiNumMrgSATDCand; for( uint32_t i = 1; i < uiNumMrgSATDCand; i++ ) { - if( candCostList[i] > MRG_FAST_RATIO[tempCS->picture->useScFastMrg] * candCostList[0] ) + if( candCostList[i] > MRG_FAST_RATIO[tempCS->picture->useFastMrg] * candCostList[0] ) { uiNumMrgSATDCand = i; break; diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 71d61e546..9823149aa 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -857,13 +857,40 @@ void EncGOP::xInitSPS(SPS &sps) const profileTierLevel->subProfileIdc.clear(); profileTierLevel->subProfileIdc.push_back( m_pcEncCfg->m_subProfile ); - sps.maxPicWidthInLumaSamples = m_pcEncCfg->m_PadSourceWidth; - sps.maxPicHeightInLumaSamples = m_pcEncCfg->m_PadSourceHeight; - sps.conformanceWindow.setWindow( m_pcEncCfg->m_confWinLeft, m_pcEncCfg->m_confWinRight, m_pcEncCfg->m_confWinTop, m_pcEncCfg->m_confWinBottom ); + if( m_pcEncCfg->m_maxPicWidth != 0 && m_pcEncCfg->m_maxPicHeight != 0 ) + { + const int minCuSize = std::max( 1 << ( vvenc::MIN_CU_LOG2 + 1 ), 1 << m_pcEncCfg->m_log2MinCodingBlockSize ); + int padRight = 0, padBottom = 0; + if( m_pcEncCfg->m_maxPicWidth % minCuSize ) + { + padRight = ( ( m_pcEncCfg->m_maxPicWidth / minCuSize) + 1 ) * minCuSize - m_pcEncCfg->m_maxPicWidth; + } + if( m_pcEncCfg->m_maxPicHeight % minCuSize ) + { + padBottom = ( ( m_pcEncCfg->m_maxPicHeight / minCuSize) + 1 ) * minCuSize - m_pcEncCfg->m_maxPicHeight; + } + sps.maxPicWidthInLumaSamples = m_pcEncCfg->m_maxPicWidth + padRight; + sps.maxPicHeightInLumaSamples = m_pcEncCfg->m_maxPicHeight + padBottom; + + sps.conformanceWindow.setWindow( 0, padRight, 0, padBottom ); + } + else + { + sps.maxPicWidthInLumaSamples = m_pcEncCfg->m_PadSourceWidth; + sps.maxPicHeightInLumaSamples = m_pcEncCfg->m_PadSourceHeight; + sps.conformanceWindow.setWindow( m_pcEncCfg->m_confWinLeft, m_pcEncCfg->m_confWinRight, m_pcEncCfg->m_confWinTop, m_pcEncCfg->m_confWinBottom ); + } sps.chromaFormatIdc = m_pcEncCfg->m_internChromaFormat; sps.CTUSize = m_pcEncCfg->m_CTUSize; sps.maxMTTDepth[0] = m_pcEncCfg->m_maxMTTDepthI; - sps.maxMTTDepth[1] = m_pcEncCfg->m_maxMTTDepth >= 10 ? 3 : m_pcEncCfg->m_maxMTTDepth; + int maxMTTDepthVal = m_pcEncCfg->m_maxMTTDepth; + int minMaxMttD = maxMTTDepthVal % 10; + while( maxMTTDepthVal ) + { + minMaxMttD = std::min( minMaxMttD, maxMTTDepthVal % 10 ); + maxMTTDepthVal /= 10; + } + sps.maxMTTDepth[1] = minMaxMttD; sps.maxMTTDepth[2] = m_pcEncCfg->m_maxMTTDepthIChroma; for( int i = 0; i < 3; i++) { @@ -1462,7 +1489,7 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod if( ( i == 1 ) && ( m_pcEncCfg->m_maxMTTDepth >= 10 ) ) { slice->picHeader->maxMTTDepth[i] = int( m_pcEncCfg->m_maxMTTDepth / pow( 10, sps.maxTLayers - slice->TLayer - 1 ) ) % 10; - slice->picHeader->splitConsOverride = true; + slice->picHeader->splitConsOverride = slice->picHeader->maxMTTDepth[i] != sps.maxMTTDepth[i]; } } @@ -1492,7 +1519,7 @@ void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncod } const int maxTLayer = m_pcEncCfg->m_picReordering && m_pcEncCfg->m_GOPSize > 1 ? vvenc::ceilLog2( m_pcEncCfg->m_GOPSize ) : 0; - const int numRefCode = pic.useScNumRefs ? m_pcEncCfg->m_numRefPicsSCC : m_pcEncCfg->m_numRefPics; + const int numRefCode = pic.useNumRefs ? m_pcEncCfg->m_numRefPicsSCC : m_pcEncCfg->m_numRefPics; const int tLayer = slice->TLayer; const int numRefs = numRefCode < 10 ? numRefCode : ( int( numRefCode / pow( 10, maxTLayer - tLayer ) ) % 10 ); @@ -1775,7 +1802,7 @@ void EncGOP::xInitLMCS( Picture& pic ) Slice* slice = pic.cs->slice; const SliceType sliceType = slice->sliceType; - if( ! pic.useScLMCS || (!slice->isIntra() && m_disableLMCSIP) ) + if( ! pic.useLMCS || (!slice->isIntra() && m_disableLMCSIP) ) { pic.reshapeData.copyReshapeData( m_Reshaper ); m_Reshaper.setCTUFlag ( false ); diff --git a/source/Lib/EncoderLib/EncPicture.cpp b/source/Lib/EncoderLib/EncPicture.cpp index 6ae96d16c..90e19aa71 100644 --- a/source/Lib/EncoderLib/EncPicture.cpp +++ b/source/Lib/EncoderLib/EncPicture.cpp @@ -93,7 +93,7 @@ void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder ) pic.cs->createTempBuffers( true ); pic.cs->initStructData( MAX_INT, false, nullptr ); - if( pic.useScLMCS && m_pcEncCfg->m_reshapeSignalType == RESHAPE_SIGNAL_PQ && m_pcEncCfg->m_alf ) + if( pic.useLMCS && m_pcEncCfg->m_reshapeSignalType == RESHAPE_SIGNAL_PQ && m_pcEncCfg->m_alf ) { const double *weights = gopEncoder.getReshaper().getlumaLevelToWeightPLUT(); auto& vec = m_ALF.getLumaLevelWeightTable(); diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp index aea699c8c..d3e3a4df2 100644 --- a/source/Lib/EncoderLib/EncSlice.cpp +++ b/source/Lib/EncoderLib/EncSlice.cpp @@ -313,7 +313,7 @@ void EncSlice::initPic( Picture* pic ) } m_ctuEncDelay = 1; - if( pic->useScIBC ) + if( pic->useIBC ) { // IBC needs unfiltered samples up to max IBC search range // therefore ensure that numCtuDelayLUT CTU's have been enocded first @@ -552,7 +552,7 @@ void EncSlice::compressSlice( Picture* pic ) lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList(); lnRsrc->m_CachedBvs .resetIbcBvCand(); - if( slice->sps->saoEnabled && pic->useScSAO ) + if( slice->sps->saoEnabled && pic->useSAO ) { lnRsrc->m_encSao .initSlice( slice ); } @@ -711,7 +711,7 @@ void EncSlice::finishCompressSlice( Picture* pic, Slice& slice ) CodingStructure& cs = *pic->cs; // finalize - if( slice.sps->saoEnabled && pic->useScSAO ) + if( slice.sps->saoEnabled && pic->useSAO ) { // store disabled statistics if( !m_pcEncCfg->m_numThreads ) @@ -740,7 +740,7 @@ void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() ); } - if( slice.sps->saoEnabled && pic->useScSAO ) + if( slice.sps->saoEnabled && pic->useSAO ) { // check SAO enabled or disabled EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat ); @@ -1073,7 +1073,7 @@ bool EncSlice::xProcessCtuTask( int threadIdx, CtuEncParam* ctuEncParam ) ITT_TASKSTART( itt_domain_encode, itt_handle_sao ); // SAO filter - if( slice.sps->saoEnabled && pic->useScSAO ) + if( slice.sps->saoEnabled && pic->useSAO ) { PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L ); TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; diff --git a/source/Lib/EncoderLib/InterSearch.cpp b/source/Lib/EncoderLib/InterSearch.cpp index fbc3e0654..9fa022ff3 100644 --- a/source/Lib/EncoderLib/InterSearch.cpp +++ b/source/Lib/EncoderLib/InterSearch.cpp @@ -261,10 +261,10 @@ void InterSearch::init( const VVEncCfg& encCfg, TrQuant* pTrQuant, RdCost* pRdCo } m_tmpStorageLCU.create( UnitArea( cform, Area( 0, 0, MAX_CU_SIZE, MAX_CU_SIZE ) ) ); m_pTempPel = new Pel[ encCfg.m_CTUSize * encCfg.m_CTUSize ]; - m_tmpAffiStorage.create(UnitArea(cform, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE))); + m_tmpAffiStorage.create(UnitArea(cform, Area(0, 0, MAX_CU_SIZE, MAX_CU_SIZE + 2))); // allow overread by 2 samples m_tmpAffiError = new Pel[MAX_CU_SIZE * MAX_CU_SIZE]; - m_tmpAffiDeri[0] = new int[MAX_CU_SIZE * MAX_CU_SIZE]; - m_tmpAffiDeri[1] = new int[MAX_CU_SIZE * MAX_CU_SIZE]; + m_tmpAffiDeri[0] = new Pel[MAX_CU_SIZE * MAX_CU_SIZE]; + m_tmpAffiDeri[1] = new Pel[MAX_CU_SIZE * MAX_CU_SIZE]; CompArea chromaArea( COMP_Cb, cform, Area( 0, 0, encCfg.m_CTUSize, encCfg.m_CTUSize ), true ); for( int i = 0; i < 4; i++ ) @@ -2197,7 +2197,7 @@ void InterSearch::xPatternSearchFast( const CodingUnit& cu, Mv& rcMv, Distortion& ruiSAD ) { - if( cu.cs->picture->useScME ) + if( cu.cs->picture->useME ) { switch ( m_motionEstimationSearchMethodSCC ) { @@ -3446,7 +3446,7 @@ void InterSearch::xEstimateInterResidualQT(CodingStructure &cs, Partitioner &par const uint32_t numTBlocks = getNumberValidTBlocks ( *cs.pcv ); CodingUnit& cu = *cs.getCU(partitioner.chType, partitioner.treeType); const unsigned currDepth = partitioner.currTrDepth; - const bool useTS = cs.picture->useScTS; + const bool useTS = cs.picture->useTS; bool bCheckFull = !partitioner.canSplit( TU_MAX_TR_SPLIT, cs ); if( cu.sbtInfo && partitioner.canSplit( CU::getSbtTuSplit( cu.sbtInfo ), cs ) ) @@ -5315,7 +5315,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, int64_t i64EqualCoeff[7][7]; Pel *piError = m_tmpAffiError; - int *pdDerivate[2]; + Pel *pdDerivate[2]; pdDerivate[0] = m_tmpAffiDeri[0]; pdDerivate[1] = m_tmpAffiDeri[1]; @@ -5354,7 +5354,6 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, ::memcpy(acMv, acMvTemp, sizeof(Mv) * 3); - const int bufStride = pBuf->Y().stride; const int predBufStride = predBuf.Y().stride; Mv prevIterMv[7][3]; int iIterTime; @@ -5379,23 +5378,13 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, * use gradient to update mv *********************************************************************************/ // get Error Matrix - const Pel* pOrg = pBuf->Y().buf; - Pel* pPred = predBuf.Y().buf; - for (int j = 0; j< height; j++) - { - for (int i = 0; i< width; i++) - { - piError[i + j * width] = pOrg[i] - pPred[i]; - } - pOrg += bufStride; - pPred += predBufStride; - } + PelBuf( piError, width, height ).subtract( pBuf->Y(), predBuf.Y() ); // sobel x direction // -1 0 1 // -2 0 2 // -1 0 1 - pPred = predBuf.Y().buf; + Pel* pPred = predBuf.Y().buf; m_HorizontalSobelFilter(pPred, predBufStride, pdDerivate[0], width, width, height); // sobel y direction @@ -5410,9 +5399,7 @@ void InterSearch::xAffineMotionEstimation(CodingUnit& cu, memset(&i64EqualCoeff[row][0], 0, iParaNum * sizeof(int64_t)); } - m_EqualCoeffComputer(piError, width, pdDerivate, width, i64EqualCoeff, width, height - , (cu.affineType == AFFINEMODEL_6PARAM) - ); + m_EqualCoeffComputer[cu.affineType]( piError, width, pdDerivate, width, width, height, i64EqualCoeff ); for (int row = 0; row < iParaNum; row++) { diff --git a/source/Lib/EncoderLib/InterSearch.h b/source/Lib/EncoderLib/InterSearch.h index 70012cf39..277408584 100644 --- a/source/Lib/EncoderLib/InterSearch.h +++ b/source/Lib/EncoderLib/InterSearch.h @@ -377,7 +377,7 @@ class InterSearch : public InterPrediction, AffineGradientSearch EncAffineMotion m_affineMotion; PelStorage m_tmpAffiStorage; Pel* m_tmpAffiError; - int* m_tmpAffiDeri[2]; + Pel* m_tmpAffiDeri[2]; MotionInfo m_subPuMiBuf[(MAX_CU_SIZE * MAX_CU_SIZE) >> (MIN_CU_LOG2 << 1)]; // Misc. Pel* m_pTempPel; diff --git a/source/Lib/EncoderLib/IntraSearch.cpp b/source/Lib/EncoderLib/IntraSearch.cpp index ae7a57e23..ba10c2753 100644 --- a/source/Lib/EncoderLib/IntraSearch.cpp +++ b/source/Lib/EncoderLib/IntraSearch.cpp @@ -528,7 +528,7 @@ bool IntraSearch::estIntraPredLumaQT(CodingUnit &cu, Partitioner &partitioner, d csBest->initStructData(); int bestLfnstIdx = 0; - const bool useBDPCM = cs.picture->useScBDPCM; + const bool useBDPCM = cs.picture->useBDPCM; int NumBDPCMCand = (useBDPCM && sps.BDPCM && CU::bdpcmAllowed(cu, ComponentID(partitioner.chType))) ? 2 : 0; int bestbdpcmMode = 0; int bestISP = 0; @@ -729,7 +729,7 @@ void IntraSearch::estIntraPredChromaQT( CodingUnit& cu, Partitioner& partitioner PartSplit ispType = lumaUsesISP ? CU::getISPType(cu, COMP_Y) : TU_NO_ISP; double bestCostSoFar = maxCostAllowed; const uint32_t numberValidComponents = getNumberValidComponents( cu.chromaFormat ); - const bool useBDPCM = cs.picture->useScBDPCM; + const bool useBDPCM = cs.picture->useBDPCM; uint32_t uiBestMode = 0; Distortion uiBestDist = 0; @@ -1514,7 +1514,7 @@ void IntraSearch::xIntraCodingLumaQT(CodingStructure& cs, Partitioner& partition double dSingleCost = MAX_DOUBLE; int endLfnstIdx = (partitioner.isSepTree(cs) && partitioner.chType == CH_C && (currArea.lwidth() < 8 || currArea.lheight() < 8)) || (currArea.lwidth() > sps.getMaxTbSize() || currArea.lheight() > sps.getMaxTbSize()) || !sps.LFNST || (numMode < 0) ? 0 : 2; - const bool useTS = cs.picture->useScTS; + const bool useTS = cs.picture->useTS; numMode = (numMode < 0) ? -numMode : numMode; if (cu.mipFlag && !allowLfnstWithMip(cu.lumaSize())) @@ -1994,7 +1994,7 @@ ChromaCbfs IntraSearch::xIntraChromaCodingQT(CodingStructure& cs, Partitioner& p const CodingUnit& cu = *cs.getCU( currArea.chromaPos(), CH_C, TREE_D ); ChromaCbfs cbfs(false); uint32_t currDepth = partitioner.currTrDepth; - const bool useTS = cs.picture->useScTS; + const bool useTS = cs.picture->useTS; if (currDepth == currTU.depth) { if (!currArea.Cb().valid() || !currArea.Cr().valid()) diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp index 85c82a73b..cf8454bac 100644 --- a/source/Lib/EncoderLib/PreProcess.cpp +++ b/source/Lib/EncoderLib/PreProcess.cpp @@ -87,7 +87,7 @@ void PreProcess::init( const VVEncCfg& encCfg, bool isFinalPass ) m_doTempDown = m_encCfg->m_FirstPassMode == 2 || m_encCfg->m_FirstPassMode == 4; m_doVisAct = m_encCfg->m_usePerceptQPA || (m_encCfg->m_LookAhead && m_encCfg->m_RCTargetBitrate) - || (m_encCfg->m_RCNumPasses > 1 && ((!isFinalPass) || (m_encCfg->m_FirstPassMode > 2))); + || (m_encCfg->m_RCNumPasses > 1 && (!isFinalPass)); m_doVisActQpa = m_encCfg->m_usePerceptQPA; diff --git a/source/Lib/EncoderLib/RateCtrl.cpp b/source/Lib/EncoderLib/RateCtrl.cpp index 61f992392..0f5e4afc8 100644 --- a/source/Lib/EncoderLib/RateCtrl.cpp +++ b/source/Lib/EncoderLib/RateCtrl.cpp @@ -178,7 +178,7 @@ void EncRCPic::destroy() encRCSeq = NULL; } -void EncRCPic::clipTargetQP (std::list& listPreviousPictures, const int baseQP, const int maxTL, const double resRatio, int &qp, int* qpAvg) +void EncRCPic::clipTargetQP (std::list& listPreviousPictures, const int baseQP, const int refrIncrFac, const int maxTL, const double resRatio, int &qp, int* qpAvg) { const int rShift = (resRatio < 0.03125 ? 12 : (resRatio < 0.125 ? 13 : (resRatio < 0.5 ? 14 : 15))); const int initQP = qp; @@ -222,7 +222,7 @@ void EncRCPic::clipTargetQP (std::list& listPreviousPictures, const i { const int clipRange = (refreshParams ? 5 + (encRCSeq->intraPeriod + (encRCSeq->gopSize >> 1)) / encRCSeq->gopSize : std::max (3, 6 - (frameLevel >> 1))); - qp = Clip3 (lastCurrTLQP - clipRange, std::min (MAX_QP, lastCurrTLQP + clipRange), qp); + qp = Clip3 (lastCurrTLQP - clipRange, std::min (MAX_QP, lastCurrTLQP + (refreshParams ? (refrIncrFac * clipRange) >> 1 : clipRange)), qp); } if (lastPrevTLQP >= 0) // prevent QP from being lower than QPs at lower temporal level { @@ -353,6 +353,20 @@ int RateCtrl::getBaseQP() { sumFrBits += stats.numBits; } + if (m_pcEncCfg->m_usePerceptQPA && m_pcEncCfg->m_LookAhead) // account for very low visual activity + { + const double hpEnerPic = sqrt (32.0 * double (1 << (2 * encRCSeq->bitDepth - 10)) * sqrt (d)); + uint32_t hpEner = 0; + + for (auto& stats : firstPassData) + { + hpEner += stats.visActY; + } + if (hpEner > 0 && hpEner < hpEnerPic * firstPassData.size()) // similar to applyQPAdaptationSlice + { + sumFrBits = uint64_t (0.5 + sumFrBits * sqrt (hpEner / (hpEnerPic * firstPassData.size()))); + } + } baseQP = int (24.5 - log (d) / log (2.0)); // QPstart, equivalent to round (24 + 2*log2 (resRatio)) d = (double) m_pcEncCfg->m_RCTargetBitrate * (double) firstPassData.size() / (encRCSeq->frameRate * sumFrBits); d = firstPassBaseQP - (105.0 / 128.0) * sqrt ((double) std::max (1, firstPassBaseQP)) * log (d) / log (2.0); @@ -475,6 +489,8 @@ void RateCtrl::storeStatsData( TRCPassStats statsData ) CHECK( statsData.poc - srcData.poc >= m_pcEncCfg->m_GOPSize, "miss stats data from previous frame for temporal down-sampling" ); statsData.qp = srcData.qp; statsData.lambda = srcData.lambda; + if( statsData.visActY == 0 && statsData.spVisAct == 0 ) + statsData.spVisAct = srcData.spVisAct; if( statsData.visActY == 0 ) statsData.visActY = srcData.visActY; statsData.numBits = srcData.numBits; @@ -682,6 +698,7 @@ void RateCtrl::adjustStatsDownsample() { value_gopcur += statValue; num_gopcur++; + doChangeBits = false; if (stat.gopNum != 0) { int var_cur = abs(statValue - meanValue); @@ -689,20 +706,19 @@ void RateCtrl::adjustStatsDownsample() { doChangeBits = true; } - int rate1 = (((value_gopcur / num_gopcur) * 100) / meanValue); - int rate2 = (value_gopbefore == 0) ? 100 : (((value_gopcur / num_gopcur) * 100) / value_gopbefore); - if ((rate1 > 140) || (rate1 < 60) - || (rate2 > 140) || (rate2 < 60)) - { - doChangeBits = true; - } - else if (doChangeBits) + else { - doChangeBits = false; + int rate1 = (((value_gopcur / num_gopcur) * 100) / meanValue); + int rate2 = (value_gopbefore == 0) ? 100 : (((value_gopcur / num_gopcur) * 100) / value_gopbefore); + if ((rate1 > 140) || (rate1 < 60) + || (rate2 > 140) || (rate2 < 60)) + { + doChangeBits = true; + } } } } - if ((stat.gopNum != 0) && doChangeBits) + if ((stat.gopNum != 0) && doChangeBits && (stat.tempLayer > 1)) { stat.numBits = (stat.numBits * 3) >> 1; } @@ -833,7 +849,7 @@ double RateCtrl::getAverageBitsFromFirstPass() } totalBitsFirstPass = (2 * tlBits[0] + (tlCount[0] >> 1)) / std::max (1u, tlCount[0]) + - ((gopsInIp - l) * tlBits[1] + (tlCount[1] >> 1)) / std::max (1u, tlCount[1]); + ((gopsInIp - l) * tlBits[1] + (tlCount[1] >> 1)) / std::max (1u, tlCount[1]); for (l = 2; l <= 7; l++) { totalBitsFirstPass += ((gopsInIp << (l - 2)) * tlBits[l] + (tlCount[l] >> 1)) / std::max (1u, tlCount[l]); @@ -927,6 +943,7 @@ void RateCtrl::processGops() vecIdx++; } } + vecIdx = 0; fac = 1.0 / gopBits[vecIdx]; gopTempVal[vecIdx] = 1.0f; @@ -1160,34 +1177,17 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double& { const double sqrOfResRatio = double( m_pcEncCfg->m_SourceWidth * m_pcEncCfg->m_SourceHeight ) / ( 3840.0 * 2160.0 ); const int firstPassSliceQP = it->qp; - const int secondPassBaseQP = ( m_pcEncCfg->m_LookAhead ? ( m_pcEncCfg->m_QP + getBaseQP() ) >> 1 : m_pcEncCfg->m_QP ); const int budgetRelaxScale = ( encRCSeq->maxGopRate + 0.5 < 2.0 * (double)encRCSeq->targetRate * encRCSeq->gopSize / encRCSeq->frameRate ? 2 : 3 ); // quarters const bool isRateCapperMax = ( encRCSeq->maxGopRate + 0.5 >= 3.0 * (double)encRCSeq->targetRate * encRCSeq->gopSize / encRCSeq->frameRate ); const bool isEndOfSequence = ( it->poc >= flushPOC && flushPOC >= 0 ); const double dLimit = ( isRateCapperMax ? 3.0 : 0.5 * budgetRelaxScale + 0.5 ); double d = (double)it->targetBits, tmpVal; - uint16_t visAct = it->visActY; - - if ( it->isNewScene ) // spatiotemporal visual activity is transient at camera/scene change, find next steady-state activity - { - std::list::iterator itNext = it; - itNext++; - while ( itNext != encRCSeq->firstPassData.end() && !itNext->isIntra ) - { - if ( itNext->poc == it->poc + 2 ) - { - visAct = itNext->visActY; - break; - } - itNext++; - } - } - encRcPic->visActSteady = visAct; // TODO: try removing all visAct(Y) related code except for the one in detectSceneCuts() + encRcPic->visActSteady = it->visActY; if ( it->refreshParameters ) // reset counters for budget usage in subsequent frames { - encRCSeq->qpCorrection[ frameLevel ] = ( it->poc == 0 && d < it->numBits ? std::max( -1.0 * it->visActY / double( 1 << ( encRCSeq->bitDepth - 3 ) ), 1.0 - it->numBits / d ) : 0.0 ); + encRCSeq->qpCorrection[ frameLevel ] = ( it->poc == 0 && it->isIntra && d < it->numBits ? std::max( -1.0 * it->visActY / double( 1 << ( encRCSeq->bitDepth - 3 ) ), 1.0 - it->numBits / d ) : 0.0 ); if ( !m_pcEncCfg->m_LookAhead ) { encRCSeq->actualBitCnt[ frameLevel ] = encRCSeq->targetBitCnt[ frameLevel ] = 0; @@ -1270,7 +1270,7 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double& d = firstPassSliceQP - ( 105.0 / 128.0 ) * sqrt( (double)std::max( 1, firstPassSliceQP ) ) * log( d ) / log( 2.0 ); sliceQP = int( 0.5 + d + 0.5 * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] ); - encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : secondPassBaseQP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ), + encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : m_pcEncCfg->m_QP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ), 5 - budgetRelaxScale, ( it->poc < encRCSeq->gopSize ? 0 : ( m_pcEncCfg->m_maxTLayer + 1 ) >> 1 ), sqrOfResRatio, sliceQP, &encRCSeq->lastAverageQP ); lambda = it->lambda * pow( 2.0, double( sliceQP - firstPassSliceQP ) / 3.0 ); lambda = Clip3( encRCSeq->minEstLambda, encRCSeq->maxEstLambda, lambda ); diff --git a/source/Lib/EncoderLib/RateCtrl.h b/source/Lib/EncoderLib/RateCtrl.h index 3ed7425f6..c870ebf57 100644 --- a/source/Lib/EncoderLib/RateCtrl.h +++ b/source/Lib/EncoderLib/RateCtrl.h @@ -142,7 +142,7 @@ namespace vvenc { void create( EncRCSeq* encRCSeq, int frameLevel, int framePoc ); void destroy(); - void clipTargetQP (std::list& listPreviousPictures, const int baseQP, const int maxTL, const double resRatio, int &qp, int* qpAvg); + void clipTargetQP (std::list& listPreviousPictures, const int baseQP, const int refrIncrFac, const int maxTL, const double resRatio, int &qp, int* qpAvg); void updateAfterPicture (const int picActualBits, const int averageQP); void addToPictureList( std::list& listPreviousPictures ); diff --git a/source/Lib/EncoderLib/VLCWriter.cpp b/source/Lib/EncoderLib/VLCWriter.cpp index 26e2fe3ae..0db624682 100644 --- a/source/Lib/EncoderLib/VLCWriter.cpp +++ b/source/Lib/EncoderLib/VLCWriter.cpp @@ -1863,7 +1863,7 @@ void HLSWriter::codeSliceHeader( const Slice* slice ) } //Write L1 related syntax elements - if (!slice->pps->rpl1IdxPresent && slice->pps->rpl1IdxPresent) + if (slice->sps->getNumRPL(1) > 1 && slice->pps->rpl1IdxPresent) { WRITE_FLAG(slice->rplIdx[1] != -1 ? 1 : 0, "ref_pic_list_sps_flag[1]"); } diff --git a/source/Lib/apputils/LogoRenderer.h b/source/Lib/apputils/LogoRenderer.h index f09dc47ac..45fb562f0 100644 --- a/source/Lib/apputils/LogoRenderer.h +++ b/source/Lib/apputils/LogoRenderer.h @@ -166,7 +166,7 @@ class LogoRenderer if( m_bInitialized ){ uninit(); } } - int init( const std::string &fileName, vvencChromaFormat chromaFormat, int internalBitdepth, std::ostream& rcOstr ) + int init( const std::string &fileName, vvencChromaFormat chromaFormat, int inputBitdepth, std::ostream& rcOstr ) { if( m_bInitialized ) { @@ -212,7 +212,13 @@ class LogoRenderer { rcOstr << "Logo input file error: invalid size " << m_cLogo.inputOpts.sourceWidth << "x" << m_cLogo.inputOpts.sourceHeight << std::endl; return -1; - } + } + + if( inputBitdepth == 8 && m_cLogo.inputOpts.bitdepth == 10 ) + { + m_cLogo.inputOpts.bgColorMin = ( m_cLogo.inputOpts.bgColorMin + 2) >> 2; + m_cLogo.inputOpts.bgColorMax = ( m_cLogo.inputOpts.bgColorMax + 2) >> 2; + } vvenc_YUVBuffer_default( &m_cYuvBufLogo ); vvenc_YUVBuffer_alloc_buffer( &m_cYuvBufLogo, chromaFormat, m_cLogo.inputOpts.sourceWidth, m_cLogo.inputOpts.sourceHeight ); @@ -257,8 +263,9 @@ class LogoRenderer // read the logo int yuvBuffer bool is16bit = m_cLogo.inputOpts.bitdepth > 8 ? true : false; - int bitdepthShift = internalBitdepth - m_cLogo.inputOpts.bitdepth; - const LPel maxVal = ( 1 << m_cLogo.inputOpts.bitdepth ) - 1; + int bitdepthShift = inputBitdepth - m_cLogo.inputOpts.bitdepth; + const LPel maxVal = ( 1 << inputBitdepth ) - 1; + for( int comp = 0; comp < 3; comp++ ) { vvencYUVPlane yuvPlane = m_cYuvBufLogo.planes[ comp ]; diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index 4c4f18409..0b63e095c 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -479,6 +479,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) IStreamToEnum toMsgLevel ( &c->m_verbosity, &MsgLevelToEnumMap ); IStreamToFunc toPreset ( setPresets, this, c, &PresetToEnumMap,vvencPresetMode::VVENC_MEDIUM); IStreamToRefVec toSourceSize ( { &c->m_SourceWidth, &c->m_SourceHeight }, true, 'x' ); + IStreamToRefVec toMaxPicSize ( { &c->m_maxPicWidth, &c->m_maxPicHeight }, true, 'x' ); IStreamToRefVec toFps ( { &c->m_FrameRate, &c->m_FrameScale }, false, '/' ); IStreamToEnum toProfile ( &c->m_profile, &ProfileToEnumMap ); @@ -754,6 +755,10 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("VerticalPadding", c->m_aiPad[1], "Vertical source padding for conformance window mode 2") ("InputChromaFormat", toInputFileChromaFormat, "input file chroma format (400, 420, 422, 444)") ("PackedInput", m_packedYUVInput, "Enable 10-bit packed YUV input data ( pack 4 samples( 8-byte) into 5-bytes consecutively.") + + ("MaxPicSize", toMaxPicSize, "Maximum resolution (maxWidth x maxHeight)") + ("MaxPicWidth", c->m_maxPicWidth, "Maximum picture width") + ("MaxPicHeight", c->m_maxPicHeight, "Maximum picture height") ; opts.setSubSection("Profile, Level, Tier"); @@ -1203,6 +1208,11 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) err.warn( "Bitstream file" ) << cErr; } + if ( m_FrameSkip < 0 ) + { + err.error( "number of frames to skip" ) << (m_easyMode ? "frameskip must be >= 0\n" : "FrameSkip must be >= 0\n"); + } + // check for y4m input bool isY4m = ( m_forceY4mInput || apputils::FileIOHelper::isY4mInputFilename( m_inputFileName ) ) ? true : false; if( !isY4m && apputils::FileIOHelper::isY4mHeaderAvailable( m_inputFileName ) ) diff --git a/source/Lib/apputils/YuvFileIO.h b/source/Lib/apputils/YuvFileIO.h index f8386ada5..a8a6267e2 100644 --- a/source/Lib/apputils/YuvFileIO.h +++ b/source/Lib/apputils/YuvFileIO.h @@ -102,6 +102,10 @@ class YuvFileIO m_fileBitdepth = std::min( fileBitDepth, 16 ); m_MSBExtendedBitDepth = MSBExtendedBitDepth; m_bitdepthShift = internalBitDepth - m_MSBExtendedBitDepth; + if( internalBitDepth == 8 && fileBitDepth == 10 && MSBExtendedBitDepth == fileBitDepth ) + { + m_bitdepthShift = 0; + } m_fileChrFmt = fileChrFmt; m_bufferChrFmt = bufferChrFmt; m_clipToRec709 = clipToRec709; @@ -143,7 +147,7 @@ class YuvFileIO if( !cLogoFilename.empty() ) { std::stringstream strstr; - if ( 0 != m_cLogoRenderer.init( cLogoFilename, m_bufferChrFmt, internalBitDepth, strstr ) ) + if ( 0 != m_cLogoRenderer.init( cLogoFilename, m_bufferChrFmt, fileBitDepth, strstr ) ) { if( !strstr.str().empty() ) m_lastError = strstr.str(); diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index 7291a50b1..2779c6e12 100644 --- a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -400,6 +400,9 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c ) c->m_PadSourceWidth = 0; ///< source width in pixel c->m_PadSourceHeight = 0; ///< source height in pixel (when interlaced = field height) + c->m_maxPicWidth = 0; + c->m_maxPicHeight = 0; + memset(&c->m_aiPad,0, sizeof(c->m_aiPad)); ///< number of padded pixels for width and height c->m_enablePictureHeaderInSliceHeader = true; c->m_AccessUnitDelimiter = -1; ///< add Access Unit Delimiter NAL units, default: auto (only enable if needed by dependent options) @@ -1209,6 +1212,11 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) c->m_craAPSreset = true; c->m_rprRASLtoolSwitch = true; } + + if( c->m_maxPicWidth > 0 && c->m_maxPicHeight > 0 ) + { + vvenc_confirmParameter( c, !c->m_rprEnabledFlag || !c->m_resChangeInClvsEnabled, "if a maxSize is set, both RPR and resChangeInClvsEnabled have to enabled" ); + } if( c->m_IntraPeriod == 0 && c->m_IntraPeriodSec > 0 ) { diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp index 43ee76992..2406a550e 100644 --- a/source/Lib/vvenc/vvencimpl.cpp +++ b/source/Lib/vvenc/vvencimpl.cpp @@ -338,7 +338,7 @@ int VVEncImpl::encode( vvencYUVBuffer* pcYUVBuffer, vvencAccessUnit* pcAccessUni } } - if ( ! xVerifyYUVBuffer( pcYUVBuffer ) ) + if ( ! xConvertVerifyYUVBuffer( pcYUVBuffer ) ) { m_cErrorString = "InputPicture: Source image contains values outside the specified bit range"; return VVENC_ERR_UNSPECIFIED; @@ -557,10 +557,17 @@ int VVEncImpl::printSummary() const return 0; } -bool VVEncImpl::xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer ) +bool VVEncImpl::xConvertVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer ) { if( pcYUVBuffer == nullptr ){ return false; } + bool conv8bit = false; + if ( m_cVVEncCfg.m_inputBitDepth[0] == 10 && m_cVVEncCfg.m_internalBitDepth[0] == 8 && + m_cVVEncCfg.m_inputBitDepth[0] == m_cVVEncCfg.m_MSBExtendedBitDepth[0] ) + { + conv8bit = true; + } + const int numComp = (m_cVVEncCfg.m_internChromaFormat==VVENC_CHROMA_400) ? 1 : 3; const int16_t mask = ~( ( 1 << m_cVVEncCfg.m_internalBitDepth[0] ) - 1 ); int dstSum = 0; @@ -568,11 +575,26 @@ bool VVEncImpl::xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer ) { vvencYUVPlane& plane = pcYUVBuffer->planes[ comp ]; int16_t* dst = plane.ptr; - for( int y = 0; y < plane.height; y++, dst += plane.stride ) + + if ( conv8bit ) { - for( int x = 0; x < plane.width; x++ ) + for( int y = 0; y < plane.height; y++, dst += plane.stride ) { - dstSum |= dst[ x ] & mask; + for( int x = 0; x < plane.width; x++ ) + { + dst[ x ] = (Pel)std::min( 255, ( dst[x] + 2 ) >> 2 ); + dstSum |= dst[ x ] & mask; + } + } + } + else + { + for( int y = 0; y < plane.height; y++, dst += plane.stride ) + { + for( int x = 0; x < plane.width; x++ ) + { + dstSum |= dst[ x ] & mask; + } } } } diff --git a/source/Lib/vvenc/vvencimpl.h b/source/Lib/vvenc/vvencimpl.h index a98a62eb8..fbbba7363 100644 --- a/source/Lib/vvenc/vvencimpl.h +++ b/source/Lib/vvenc/vvencimpl.h @@ -130,7 +130,7 @@ class VVEncImpl private: int xGetAccessUnitsSize( const vvenc::AccessUnitList& rcAuList ); int xCopyAu( vvencAccessUnit& rcAccessUnit, const AccessUnitList& rcAu ); - bool xVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer ); + bool xConvertVerifyYUVBuffer( vvencYUVBuffer* pcYUVBuffer ); private: VVEncInternalState m_eState = INTERNAL_STATE_UNINITIALIZED;