fraunhoferhhi · jbrdbg · Dec 19, 2023 · Dec 19, 2023
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -12,3 +12,4 @@
 * Christian Stoffers, , Fraunhofer HHI
 * Gabriel Hege, , Fraunhofer HHI
 * Jens Güther, , Fraunhofer HHI
+* Florian Eisenreich, , Fraunhofer HHI
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -21,8 +21,18 @@ endif()
 set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules" )
 message( STATUS "CMAKE_MODULE_PATH: updating module path to: ${CMAKE_MODULE_PATH}" )
 
+# default ARM SIMD to TRUE when the target processor, the C++ compiler path or CMAKE_OSX_ARCHITECTURES matches an ARM name and no x86/x64 OSX architecture is requested (NOTE(review): any compiler path containing "arm" matches - confirm this is intended)
+set( VVENC_ARM_SIMD_DEFAULT FALSE )
+if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm"
+    OR "${CMAKE_CXX_COMPILER}" MATCHES "aarch64\|arm"
+    OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64\|armv" )
+    AND NOT "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86\|x64" )
+  set( VVENC_ARM_SIMD_DEFAULT TRUE )
+endif()
+
 # we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86
 set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" )
+set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" )
 
 include( vvencCompilerSupport )
 
@@ -39,8 +49,13 @@ if( VVENC_ENABLE_X86_SIMD )
     check_missing_intrinsics()
   endif()
 
-  set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_SIMD_X86" )
-  set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTARGET_SIMD_X86" )
+  message( STATUS "x86 SIMD intrinsics enabled (using SIMDE for non-x86 targets)" )
+  add_compile_definitions( TARGET_SIMD_X86 )
+endif()
+
+if( VVENC_ENABLE_ARM_SIMD )
+  message( STATUS "ARM SIMD intrinsics enabled" )
+  add_compile_definitions( TARGET_SIMD_ARM )
 endif()
 
 if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR )

diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h
@@ -769,7 +769,11 @@ typedef struct vvenc_config
   int8_t              m_sliceTypeAdapt;                                                  // enable slice type adaptation (STA)
   bool                m_treatAsSubPic;
 
-  int                 m_RCMaxBitrate;                                                    // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR))
+#define VVENC_SET_MAXRATE_FACTOR(f) (-((int)((f)*16+0.5)))
+  int                 m_RCMaxBitrate;                                                    // maximum bitrate in bps (default: 0, i.e. RC disabled or least constrained VBR);
+                                                                                         // if negative, the absolute value is interpreted as a fixed-point multiplier of the target bitrate with 4 fractional bits,
+                                                                                         // e.g. -24 (-1.1000 binary) sets the maximum rate to 1.5x the target bitrate.
+                                                                                         // For convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier.
   int                 m_reservedInt;
   double              m_reservedDouble[9];
 

diff --git a/source/App/vvencFFapp/EncApp.cpp b/source/App/vvencFFapp/EncApp.cpp
@@ -307,7 +307,7 @@ int EncApp::encode()
     int64_t frameCount =  apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput );
     frameCount = std::max<int64_t>( 0, frameCount-appCfg.m_FrameSkip );
     int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvencCfg.m_framesToBeEncoded;
-    cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " );
+    cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, vvencCfg.m_verbosity, "vvenc [info]: " );
     bool statsInfoReady = false;
 
     // loop over input YUV data
@@ -370,6 +370,7 @@ int EncApp::encode()
           if( statsInfoReady )
           {
             msgApp( VVENC_INFO, cStats.getInfoString().c_str() );
+            fflush( stdout );
           }
         }
 
@@ -383,6 +384,7 @@ int EncApp::encode()
     if( appCfg.m_printStats )
     {
       msgApp( VVENC_INFO, cStats.getFinalStats().c_str() );
+      fflush( stdout );
     }
   }
 

diff --git a/source/App/vvencapp/vvencapp.cpp b/source/App/vvencapp/vvencapp.cpp
@@ -357,7 +357,7 @@ int main( int argc, char* argv[] )
     int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded;
 
     apputils::Stats cStats;
-    cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " );
+    cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, vvenccfg.m_verbosity, "vvenc [info]: " );
     bool statsInfoReady = false;
 
     while( !bEof || !bEncodeDone )
@@ -404,6 +404,7 @@ int main( int argc, char* argv[] )
           if( statsInfoReady )
           {
             msgApp( nullptr, VVENC_INFO, cStats.getInfoString().c_str() );
+            fflush( stdout );
           }
         }
 
@@ -426,6 +427,7 @@ int main( int argc, char* argv[] )
     if( vvencappCfg.m_printStats )
     {
       msgApp( nullptr, VVENC_INFO, cStats.getFinalStats().c_str() );
+      fflush( stdout );
     }
   }
 

diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h
@@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+
 struct AlfClassifier
 {
   AlfClassifier() {}

diff --git a/source/Lib/CommonLib/AffineGradientSearch.h b/source/Lib/CommonLib/AffineGradientSearch.h
@@ -53,6 +53,8 @@ namespace vvenc {
   //! \ingroup CommonLib
   //! \{
 
+using namespace x86_simd;
+
   class AffineGradientSearch
   {
   public:

diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
@@ -66,6 +66,9 @@ struct vvencYUVBuffer;
 
 namespace vvenc {
 
+using namespace x86_simd;
+using namespace arm_simd;
+
 // ---------------------------------------------------------------------------
 // AreaBuf struct
 // ---------------------------------------------------------------------------
@@ -81,6 +84,22 @@ struct PelBufferOps
   template<X86_VEXT vext>
   void _initPelBufOpsX86();
 #endif
+
+#if ENABLE_SIMD_OPT_BUFFER && defined( TARGET_SIMD_ARM )
+  void initPelBufOpsARM();
+  template<ARM_VEXT vext>
+  void _initPelBufOpsARM();
+#endif
+
+#define INCX( ptr, stride ) { ptr++; } // pointer stepping helpers: the brace-wrapped forms modify ptr in place, the GET_* forms yield the offset pointer; the X variants ignore stride
+#define INCY( ptr, stride ) { ptr += ( stride ); }
+#define OFFSETX( ptr, stride, x ) { ptr += ( x ); }
+#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); }
+#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); }
+#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) )
+#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) )
+#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) // needed in LoopFilter.cpp and some ARM files
+
   void ( *roundGeo )      ( const Pel* src, Pel* dest, const int numSamples, unsigned rshift, int offset, const ClpRng &clpRng);
   void ( *addAvg )        ( const Pel* src0, const Pel* src1, Pel* dst, int numsamples, unsigned shift, int offset, const ClpRng& clpRng );
   void ( *reco  )         ( const Pel* src0, const Pel* src1, Pel* dst, int numSamples, const ClpRng& clpRng );

diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
@@ -636,22 +636,6 @@ static inline T* aligned_malloc(size_t len, size_t alignement) {
 #    define ALWAYS_INLINE
 #endif
 
-#ifdef TARGET_SIMD_X86
-typedef enum
-{
-  UNDEFINED = -1,
-  SCALAR = 0,
-  SSE41,
-  SSE42,
-  AVX,
-  AVX2,
-  AVX512
-} X86_VEXT;
-#endif
-
-template <typename ValueType> inline ValueType leftShiftU  (const ValueType value, const unsigned shift) { return value << shift; }
-template <typename ValueType> inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; }
-
 #if defined( _WIN32 ) && defined( TARGET_SIMD_X86 )
 static inline unsigned int bit_scan_reverse( int a )
 {
@@ -672,6 +656,59 @@ static inline unsigned int bit_scan_reverse( int a )
 }
 #endif
 
+#if ENABLE_SIMD_LOG2
+static inline int getLog2( int val ) // log2 of val, delegated to bit_scan_reverse when ENABLE_SIMD_LOG2 is set
+{
+  return bit_scan_reverse( val );
+}
+#else
+extern int8_t g_aucLog2[MAX_CU_SIZE + 1]; // log2 lookup table, defined in another translation unit
+static inline int getLog2( int val ) // log2 of val: table lookup for indices inside g_aucLog2, std::log2 otherwise
+{
+  CHECKD( g_aucLog2[2] != 1, "g_aucLog2[] has not been initialized yet." );
+  if( val > 0 && val < (int) sizeof( g_aucLog2 ) ) // sizeof equals the entry count because the entries are int8_t
+  {
+    return g_aucLog2[val];
+  }
+  return std::log2( val ); // floating-point result is implicitly converted to int; NOTE(review): val <= 0 also ends up here - confirm callers never pass it
+}
+#endif
+
+#if ENABLE_SIMD_OPT
+
+namespace x86_simd
+{
+#ifdef TARGET_SIMD_X86
+  typedef enum   // x86 vector extension levels in ascending order, so callers can compare them (e.g. "> x86_simd::SCALAR")
+  {
+    UNDEFINED = -1,
+    SCALAR = 0,
+    SSE41,
+    SSE42,
+    AVX,
+    AVX2,
+    AVX512
+  } X86_VEXT;
+#endif   // TARGET_SIMD_X86
+}   // namespace x86_simd
+
+namespace arm_simd
+{
+#ifdef TARGET_SIMD_ARM
+  typedef enum   // ARM vector extension levels; UNDEFINED and SCALAR also exist in x86_simd::X86_VEXT, so qualify them where both namespaces are imported
+  {
+    UNDEFINED = -1,
+    SCALAR    = 0,
+    NEON,
+  } ARM_VEXT;
+#endif   // TARGET_SIMD_ARM
+}   // namespace arm_simd
+
+#endif //ENABLE_SIMD_OPT
+
+template <typename ValueType> inline ValueType leftShiftU  (const ValueType value, const unsigned shift) { return value << shift; }
+template <typename ValueType> inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; }
+
 #if ENABLE_SIMD_LOG2 && defined( TARGET_SIMD_X86 )
 static inline int floorLog2( int val )
 {

diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp
@@ -1518,7 +1518,7 @@ namespace DQIntern
 
 #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
       // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold
-      if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > SCALAR )
+      if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR )
       {
         const int sbbSize = tuPars.m_sbbSize;
         // move the pointer to the beginning of the current subblock

diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
@@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+
 // forward declaration
 class Mv;
 

diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp
@@ -1071,12 +1071,16 @@ void InterpolationFilter::xWeightedGeoBlk(const ClpRngs &clpRngs, const CodingUn
 void InterpolationFilter::initInterpolationFilter( bool enable )
 {
 #if ENABLE_SIMD_OPT_MCIF
-#ifdef TARGET_SIMD_X86
   if ( enable )
   {
+#ifdef TARGET_SIMD_X86
     initInterpolationFilterX86();
-  }
 #endif
+
+#ifdef TARGET_SIMD_ARM
+    initInterpolationFilterARM();
+#endif
+  }
 #endif
 }
 

diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h
@@ -55,6 +55,9 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+using namespace arm_simd;
+
 #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
 #define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
 #define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally
@@ -117,6 +120,12 @@ class InterpolationFilter
   template <X86_VEXT vext>
   void _initInterpolationFilterX86();
 #endif
+
+#ifdef TARGET_SIMD_ARM
+  void initInterpolationFilterARM();
+  template <ARM_VEXT vext>
+  void _initInterpolationFilterARM();
+#endif
 
   void filterN2_2D(const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY,                                        const ClpRng& clpRng);
   void filter4x4  (const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY,   bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, bool useAltHpelIf = false, int nFilterIdx = 0);

diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h
@@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+
 // ====================================================================================================================
 // Class definition
 // ====================================================================================================================

diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp
@@ -90,15 +90,6 @@ const uint8_t LoopFilter::sm_betaTable[MAX_QP + 1] =
 // utility functions
 // ====================================================================================================================
 
-#define INCX( ptr, stride ) { ptr++; }
-#define INCY( ptr, stride ) { ptr += ( stride ); }
-#define OFFSETX( ptr, stride, x ) { ptr += ( x ); }
-#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); }
-#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); }
-#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) )
-#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) )
-#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) )
-
 #define BsSet( val, compIdx ) (   ( val ) << ( ( compIdx ) << 1 ) )     
 #define BsGet( val, compIdx ) ( ( ( val ) >> ( ( compIdx ) << 1 ) ) & 3 )
 

diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h
@@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
 
 #define DEBLOCK_SMALLEST_BLOCK  8
 

diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h
@@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+
 class NoMallocThreadPool;
 
 //! \ingroup EncoderLib

diff --git a/source/Lib/CommonLib/Quant.h b/source/Lib/CommonLib/Quant.h
@@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+using namespace x86_simd;
+
 // ====================================================================================================================
 // Constants
 // ====================================================================================================================

diff --git a/source/Lib/CommonLib/QuantRDOQ2.cpp b/source/Lib/CommonLib/QuantRDOQ2.cpp
@@ -584,7 +584,7 @@ int QuantRDOQ2::xRateDistOptQuantFast( TransformUnit &tu, const ComponentID &com
 
   const bool scanFirstBlk = !bUseScalingList && log2CGSize == 4 && cctx.log2CGWidth() == 2;
 #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
-  const bool isSimd       = read_x86_extension_flags() > SCALAR;
+  const bool isSimd       = read_x86_extension_flags() > x86_simd::SCALAR;
 #endif
 
   int subSetId = iScanPos >> log2CGSize;

diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp
@@ -141,6 +141,9 @@ void RdCost::create()
 #ifdef TARGET_SIMD_X86
   initRdCostX86();
 #endif
+#ifdef TARGET_SIMD_ARM
+  initRdCostARM();
+#endif
 #endif
 
   m_costMode      = VVENC_COST_STANDARD_LOSSY;