Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial NEON code, improved stats output, allowing maxrate setting as factor #340

Merged
merged 1 commit into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
* Christian Stoffers, , Fraunhofer HHI
* Gabriel Hege, , Fraunhofer HHI
* Jens Güther, , Fraunhofer HHI
* Florian Eisenreich, , Fraunhofer HHI
19 changes: 17 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,18 @@ endif()
set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules" )
message( STATUS "CMAKE_MODULE_PATH: updating module path to: ${CMAKE_MODULE_PATH}" )

# check for arm architecture support:
# ARM SIMD defaults to TRUE when the target processor, the C++ compiler path or the macOS
# architecture list mentions an ARM target, unless the macOS architecture list also names an x86 target
set( VVENC_ARM_SIMD_DEFAULT FALSE )
if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm"
OR "${CMAKE_CXX_COMPILER}" MATCHES "aarch64\|arm"
OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64\|armv" )
AND NOT "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86\|x64" )
set( VVENC_ARM_SIMD_DEFAULT TRUE )
endif()

# we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86
set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" )
set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" )

include( vvencCompilerSupport )

Expand All @@ -39,8 +49,13 @@ if( VVENC_ENABLE_X86_SIMD )
check_missing_intrinsics()
endif()

set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_SIMD_X86" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTARGET_SIMD_X86" )
message( STATUS "x86 SIMD intrinsics enabled (using SIMDE for non-x86 targets)" )
add_compile_definitions( TARGET_SIMD_X86 )
endif()

# define TARGET_SIMD_ARM for all sources; the library uses it to guard its ARM-specific code
if( VVENC_ENABLE_ARM_SIMD )
message( STATUS "ARM SIMD intrinsics enabled" )
add_compile_definitions( TARGET_SIMD_ARM )
endif()

if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR )
Expand Down
6 changes: 5 additions & 1 deletion include/vvenc/vvencCfg.h
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,11 @@ typedef struct vvenc_config
int8_t m_sliceTypeAdapt; // enable slice type adaptation (STA)
bool m_treatAsSubPic;

int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR))
// Converts a max-rate factor f (multiplier of the target bitrate) into the negative
// 4-bit fixed-point value understood by m_RCMaxBitrate, e.g. 1.5 -> -24.
// The argument is parenthesized so that expressions such as (1+0.5) are scaled as a whole.
#define VVENC_SET_MAXRATE_FACTOR(f) (-((int)((f)*16+0.5)))
int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR),
// if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate).
// -24, i.e. -1.1000 binary, means the maxrate is set to 1.5x the target bitrate.
// for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier
int m_reservedInt;
double m_reservedDouble[9];

Expand Down
4 changes: 3 additions & 1 deletion source/App/vvencFFapp/EncApp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ int EncApp::encode()
int64_t frameCount = apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput );
frameCount = std::max<int64_t>( 0, frameCount-appCfg.m_FrameSkip );
int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvencCfg.m_framesToBeEncoded;
cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " );
cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, vvencCfg.m_verbosity, "vvenc [info]: " );
bool statsInfoReady = false;

// loop over input YUV data
Expand Down Expand Up @@ -370,6 +370,7 @@ int EncApp::encode()
if( statsInfoReady )
{
msgApp( VVENC_INFO, cStats.getInfoString().c_str() );
fflush( stdout );
}
}

Expand All @@ -383,6 +384,7 @@ int EncApp::encode()
if( appCfg.m_printStats )
{
msgApp( VVENC_INFO, cStats.getFinalStats().c_str() );
fflush( stdout );
}
}

Expand Down
4 changes: 3 additions & 1 deletion source/App/vvencapp/vvencapp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ int main( int argc, char* argv[] )
int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded;

apputils::Stats cStats;
cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " );
cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, vvenccfg.m_verbosity, "vvenc [info]: " );
bool statsInfoReady = false;

while( !bEof || !bEncodeDone )
Expand Down Expand Up @@ -404,6 +404,7 @@ int main( int argc, char* argv[] )
if( statsInfoReady )
{
msgApp( nullptr, VVENC_INFO, cStats.getInfoString().c_str() );
fflush( stdout );
}
}

Expand All @@ -426,6 +427,7 @@ int main( int argc, char* argv[] )
if( vvencappCfg.m_printStats )
{
msgApp( nullptr, VVENC_INFO, cStats.getFinalStats().c_str() );
fflush( stdout );
}
}

Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/AdaptiveLoopFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

struct AlfClassifier
{
AlfClassifier() {}
Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/AffineGradientSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace vvenc {
//! \ingroup CommonLib
//! \{

using namespace x86_simd;

class AffineGradientSearch
{
public:
Expand Down
19 changes: 19 additions & 0 deletions source/Lib/CommonLib/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ struct vvencYUVBuffer;

namespace vvenc {

using namespace x86_simd;
using namespace arm_simd;

// ---------------------------------------------------------------------------
// AreaBuf struct
// ---------------------------------------------------------------------------
Expand All @@ -81,6 +84,22 @@ struct PelBufferOps
template<X86_VEXT vext>
void _initPelBufOpsX86();
#endif

#if ENABLE_SIMD_OPT_BUFFER && defined( TARGET_SIMD_ARM )
void initPelBufOpsARM();
template<ARM_VEXT vext>
void _initPelBufOpsARM();
#endif

// Pointer-stepping helpers.
// INCX / INCY / OFFSETX / OFFSETY / OFFSET are statements that modify ptr in place:
// x steps move by single elements, y steps move by multiples of stride.
// The GET_* variants are expressions that yield the offset pointer and leave ptr unchanged.
// NOTE(review): the statement forms expand to a braced block and use ptr without parentheses,
// so pass a plain lvalue and take care when using them in unbraced if/else bodies.
#define INCX( ptr, stride ) { ptr++; }
#define INCY( ptr, stride ) { ptr += ( stride ); }
#define OFFSETX( ptr, stride, x ) { ptr += ( x ); }
#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); }
#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); }
#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) )
#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) )
#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) // needed in LoopFilter.cpp + some ARM files

void ( *roundGeo ) ( const Pel* src, Pel* dest, const int numSamples, unsigned rshift, int offset, const ClpRng &clpRng);
void ( *addAvg ) ( const Pel* src0, const Pel* src1, Pel* dst, int numsamples, unsigned shift, int offset, const ClpRng& clpRng );
void ( *reco ) ( const Pel* src0, const Pel* src1, Pel* dst, int numSamples, const ClpRng& clpRng );
Expand Down
69 changes: 53 additions & 16 deletions source/Lib/CommonLib/CommonDef.h
Original file line number Diff line number Diff line change
Expand Up @@ -636,22 +636,6 @@ static inline T* aligned_malloc(size_t len, size_t alignement) {
# define ALWAYS_INLINE
#endif

#ifdef TARGET_SIMD_X86
typedef enum
{
UNDEFINED = -1,
SCALAR = 0,
SSE41,
SSE42,
AVX,
AVX2,
AVX512
} X86_VEXT;
#endif

template <typename ValueType> inline ValueType leftShiftU (const ValueType value, const unsigned shift) { return value << shift; }
template <typename ValueType> inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; }

#if defined( _WIN32 ) && defined( TARGET_SIMD_X86 )
static inline unsigned int bit_scan_reverse( int a )
{
Expand All @@ -672,6 +656,59 @@ static inline unsigned int bit_scan_reverse( int a )
}
#endif

#if ENABLE_SIMD_LOG2
// SIMD-enabled build: the result comes straight from bit_scan_reverse().
static inline int getLog2( int val )
{
  return bit_scan_reverse( val );
}
#else
// Table-based build: g_aucLog2 is defined elsewhere and must be initialized before first use.
extern int8_t g_aucLog2[MAX_CU_SIZE + 1];
static inline int getLog2( int val )
{
  CHECKD( g_aucLog2[2] != 1, "g_aucLog2[] has not been initialized yet." );

  const int  tableSize    = (int) sizeof( g_aucLog2 );
  const bool outsideTable = val <= 0 || val >= tableSize;
  if( outsideTable )
  {
    // not covered by the lookup table: compute the value, the double result is converted to int
    return std::log2( val );
  }

  return g_aucLog2[val];
}
#endif

// The x86_simd and arm_simd namespaces are always declared (possibly empty), because several
// headers pull them in unconditionally with "using namespace x86_simd;" / "using namespace arm_simd;".
// Only the vector-extension enums depend on SIMD being enabled for the respective target.

namespace x86_simd
{
#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 )
  // x86 vector extension levels; values above SCALAR denote an available SIMD extension
  typedef enum
  {
    UNDEFINED = -1,
    SCALAR = 0,
    SSE41,
    SSE42,
    AVX,
    AVX2,
    AVX512
  } X86_VEXT;
#endif // ENABLE_SIMD_OPT && TARGET_SIMD_X86
} // namespace x86_simd

namespace arm_simd
{
#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_ARM )
  // ARM vector extension levels; enumerator names overlap with x86_simd, so qualify them
  // (e.g. x86_simd::SCALAR) wherever both namespaces are in scope
  typedef enum
  {
    UNDEFINED = -1,
    SCALAR = 0,
    NEON,
  } ARM_VEXT;
#endif // ENABLE_SIMD_OPT && TARGET_SIMD_ARM
} // namespace arm_simd

// Shifts value to the left by an unsigned number of bit positions.
template <typename ValueType>
inline ValueType leftShiftU( const ValueType value, const unsigned shift )
{
  return value << shift;
}

// Shifts value to the right by an unsigned number of bit positions.
template <typename ValueType>
inline ValueType rightShiftU( const ValueType value, const unsigned shift )
{
  return value >> shift;
}

#if ENABLE_SIMD_LOG2 && defined( TARGET_SIMD_X86 )
static inline int floorLog2( int val )
{
Expand Down
2 changes: 1 addition & 1 deletion source/Lib/CommonLib/DepQuant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1518,7 +1518,7 @@ namespace DQIntern

#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
// if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold
if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > SCALAR )
if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR )
{
const int sbbSize = tuPars.m_sbbSize;
// move the pointer to the beginning of the current subblock
Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/InterPrediction.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

// forward declaration
class Mv;

Expand Down
8 changes: 6 additions & 2 deletions source/Lib/CommonLib/InterpolationFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1071,12 +1071,16 @@ void InterpolationFilter::xWeightedGeoBlk(const ClpRngs &clpRngs, const CodingUn
void InterpolationFilter::initInterpolationFilter( bool enable )
{
#if ENABLE_SIMD_OPT_MCIF
#ifdef TARGET_SIMD_X86
if ( enable )
{
#ifdef TARGET_SIMD_X86
initInterpolationFilterX86();
}
#endif

#ifdef TARGET_SIMD_ARM
initInterpolationFilterARM();
#endif
}
#endif
}

Expand Down
9 changes: 9 additions & 0 deletions source/Lib/CommonLib/InterpolationFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;
using namespace arm_simd;

#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
#define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps
#define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally
Expand Down Expand Up @@ -117,6 +120,12 @@ class InterpolationFilter
template <X86_VEXT vext>
void _initInterpolationFilterX86();
#endif

#ifdef TARGET_SIMD_ARM
void initInterpolationFilterARM();
template <ARM_VEXT vext>
void _initInterpolationFilterARM();
#endif

void filterN2_2D(const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, const ClpRng& clpRng);
void filter4x4 (const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, bool useAltHpelIf = false, int nFilterIdx = 0);
Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/IntraPrediction.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

// ====================================================================================================================
// Class definition
// ====================================================================================================================
Expand Down
9 changes: 0 additions & 9 deletions source/Lib/CommonLib/LoopFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,6 @@ const uint8_t LoopFilter::sm_betaTable[MAX_QP + 1] =
// utility functions
// ====================================================================================================================

#define INCX( ptr, stride ) { ptr++; }
#define INCY( ptr, stride ) { ptr += ( stride ); }
#define OFFSETX( ptr, stride, x ) { ptr += ( x ); }
#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); }
#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); }
#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) )
#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) )
#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) )

#define BsSet( val, compIdx ) ( ( val ) << ( ( compIdx ) << 1 ) )
#define BsGet( val, compIdx ) ( ( ( val ) >> ( ( compIdx ) << 1 ) ) & 3 )

Expand Down
1 change: 1 addition & 0 deletions source/Lib/CommonLib/LoopFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

#define DEBLOCK_SMALLEST_BLOCK 8

Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/MCTF.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

class NoMallocThreadPool;

//! \ingroup EncoderLib
Expand Down
2 changes: 2 additions & 0 deletions source/Lib/CommonLib/Quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE.

namespace vvenc {

using namespace x86_simd;

// ====================================================================================================================
// Constants
// ====================================================================================================================
Expand Down
2 changes: 1 addition & 1 deletion source/Lib/CommonLib/QuantRDOQ2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ int QuantRDOQ2::xRateDistOptQuantFast( TransformUnit &tu, const ComponentID &com

const bool scanFirstBlk = !bUseScalingList && log2CGSize == 4 && cctx.log2CGWidth() == 2;
#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
const bool isSimd = read_x86_extension_flags() > SCALAR;
const bool isSimd = read_x86_extension_flags() > x86_simd::SCALAR;
#endif

int subSetId = iScanPos >> log2CGSize;
Expand Down
3 changes: 3 additions & 0 deletions source/Lib/CommonLib/RdCost.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ void RdCost::create()
#ifdef TARGET_SIMD_X86
initRdCostX86();
#endif
#ifdef TARGET_SIMD_ARM
initRdCostARM();
#endif
#endif

m_costMode = VVENC_COST_STANDARD_LOSSY;
Expand Down
Loading