Add support for SSSE3, tune for AVX-512BW

andrewevstyukhin · May 12, 2020 · 9a32800 · 9a32800
1 parent 4db23df
commit 9a32800
Show file tree

Hide file tree

Showing 16 changed files with 345 additions and 64 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ Nebc7 always preserves opaque alpha for opaque blocks.
 
 ## Usage
 
-The solution was tested on AVX-capable CPU for Win64 API only.
+The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX512BW - capable CPUs for Win64 API only.
 
 `Bc7Compress /nomask /noflip source.png destination.ktx [/debug result.png]`
 

diff --git a/src/Bc7Compress.vcxproj b/src/Bc7Compress.vcxproj
@@ -130,6 +130,7 @@
     <ClInclude Include="SnippetInsertRemoveZeroBit.h" />
     <ClInclude Include="SnippetLevelsBuffer.h" />
     <ClInclude Include="SnippetLevelsMinimum.h" />
+    <ClInclude Include="SnippetTargetSSSE3.h" />
     <ClInclude Include="Worker.h" />
   </ItemGroup>
   <ItemGroup>

diff --git a/src/Bc7Compress.vcxproj.filters b/src/Bc7Compress.vcxproj.filters
@@ -56,6 +56,9 @@
     <ClInclude Include="Bc7Pca.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="SnippetTargetSSSE3.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="pch.cpp">

diff --git a/src/Bc7CoreMode4.cpp b/src/Bc7CoreMode4.cpp
@@ -191,10 +191,10 @@ namespace Mode4 {
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
-		const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(mweights), vrot);
+		const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastq_epi64(mweights), vrot);
 
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		const __m256i vmask3 = _mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0);
 		const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);
@@ -365,10 +365,10 @@ namespace Mode4 {
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
-		const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(mweights), vrot);
+		const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastq_epi64(mweights), vrot);
 
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		const __m256i vmask3 = _mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0);
 		const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);

diff --git a/src/Bc7CoreMode5.cpp b/src/Bc7CoreMode5.cpp
@@ -175,10 +175,10 @@ namespace Mode5 {
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
-		const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		const __m256i vmask3 = _mm256_shuffle_epi8(_mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0), vrot);
 		const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);

diff --git a/src/Bc7CoreMode6.cpp b/src/Bc7CoreMode6.cpp
@@ -121,13 +121,13 @@ namespace Mode6 {
 		__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-		const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		__m256i vtx = *(const __m256i*)&gTableInterpolate4_U8[0];
 		__m256i vty = *(const __m256i*)&gTableInterpolate4_U8[2];
@@ -314,15 +314,15 @@ namespace Mode6 {
 		__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-		const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
 
 		mc = _mm_shuffle_epi32(mc, shuffle);
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		__m256i vt0 = *(const __m256i*)&gTableInterpolate4GR_U8[0];
 		__m256i vt1 = *(const __m256i*)&gTableInterpolate4GR_U8[2];

diff --git a/src/Bc7CoreMode7.cpp b/src/Bc7CoreMode7.cpp
@@ -142,14 +142,14 @@ namespace Mode7 {
 		__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-		const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix4 = _mm_slli_epi32(mfix, 2);
 
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		__m256i vt = *(const __m256i*)gTableInterpolate2_U8;
 
@@ -327,15 +327,15 @@ namespace Mode7 {
 		__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-		const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		const __m256i vhalf = _mm256_set1_epi16(32);
 		const __m256i vsign = _mm256_set1_epi16(-0x8000);
 		const __m128i mfix4 = _mm_slli_epi32(mfix, 2);
 
 		mc = _mm_shuffle_epi32(mc, shuffle);
 		mc = _mm_packus_epi16(mc, mc);
-		__m256i vc = _mm256_broadcastsi128_si256(mc);
+		__m256i vc = _mm256_broadcastq_epi64(mc);
 
 		__m256i vt = *(const __m256i*)gTableInterpolate2GR_U8;
 

diff --git a/src/Bc7Mode.h b/src/Bc7Mode.h
@@ -5,6 +5,7 @@
 //#define OPTION_PCA
 //#define OPTION_COUNTERS
 //#define OPTION_LINEAR
+//#define OPTION_SLOWPOKE
 #define OPTION_SELFCHECK
 
 #if defined(OPTION_LINEAR)
@@ -21,18 +22,18 @@ enum { kAlpha = 1000, kGreen = 715, kRed = 213, kBlue = 72 };
 
 enum { kColor = kGreen + kRed + kBlue };
 
-#if defined(OPTION_AVX512) && !defined(OPTION_AVX2)
-#define OPTION_AVX2
+#if defined(OPTION_AVX512) && (!defined(__AVX512F__) || !defined(__AVX512BW__) || !defined(__AVX512VL__) || defined(OPTION_SLOWPOKE))
+#error AVX-512 is required
 #endif
 
-#if defined(OPTION_AVX2) && !defined(OPTION_FMA) // Except Via Cores
-#define OPTION_FMA
+#if defined(OPTION_AVX512) && !defined(OPTION_AVX2)
+#define OPTION_AVX2
 #endif
 
-#if defined(OPTION_AVX2) && !defined(__AVX2__)
+#if defined(OPTION_AVX2) && (!defined(__AVX2__) || defined(OPTION_SLOWPOKE))
 #error AVX2 is required
 #endif
 
-#if !defined(__AVX__)
-#error AVX is required
+#if defined(OPTION_AVX2) && !defined(OPTION_FMA) // Except Via Cores
+#define OPTION_FMA
 #endif
diff --git a/src/Bc7Tables.cpp b/src/Bc7Tables.cpp
@@ -48,7 +48,7 @@ alignas(16) static constexpr short gTableInterpolate4[16][2] =
 };
 
 alignas(32) __m128i gTableInterpolate2_U8[4 >> 1];
-alignas(32) __m128i gTableInterpolate3_U8[8 >> 1];
+alignas(64) __m128i gTableInterpolate3_U8[8 >> 1];
 alignas(32) __m128i gTableInterpolate4_U8[16 >> 1];
 
 alignas(32) __m128i gTableInterpolate2GR_U8[4 >> (2 - 1)];

diff --git a/src/Bc7Tables.h b/src/Bc7Tables.h
@@ -3,7 +3,7 @@
 #include "pch.h"
 
 alignas(32) extern __m128i gTableInterpolate2_U8[4 >> 1];
-alignas(32) extern __m128i gTableInterpolate3_U8[8 >> 1];
+alignas(64) extern __m128i gTableInterpolate3_U8[8 >> 1];
 alignas(32) extern __m128i gTableInterpolate4_U8[16 >> 1];
 
 alignas(32) extern __m128i gTableInterpolate2GR_U8[4 >> (2 - 1)];

diff --git a/src/SnippetComputeOpaqueSubset2.h b/src/SnippetComputeOpaqueSubset2.h
@@ -12,14 +12,14 @@ static INLINED int ComputeOpaqueSubsetError2(const Area& area, __m128i mc, const
 	const __m128i mfix = gFixWeightsGRB;
 
 #if defined(OPTION_AVX2)
-	const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+	const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 	const __m256i vhalf = _mm256_set1_epi16(32);
 	const __m256i vsign = _mm256_set1_epi16(-0x8000);
 	const __m128i mfix4 = _mm_slli_epi32(mfix, 2);
 
 	mc = _mm_packus_epi16(mc, mc);
-	__m256i vc = _mm256_broadcastsi128_si256(mc);
+	__m256i vc = _mm256_broadcastq_epi64(mc);
 
 	__m256i vt = *(const __m256i*)gTableInterpolate2_U8;
 
@@ -197,15 +197,15 @@ static INLINED int ComputeOpaqueSubsetError2Pair(const Area& area, __m128i mc, c
 	__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-	const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+	const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 	const __m256i vhalf = _mm256_set1_epi16(32);
 	const __m256i vsign = _mm256_set1_epi16(-0x8000);
 	const __m128i mfix4 = _mm_slli_epi32(mfix, 2);
 
 	mc = _mm_shuffle_epi32(mc, shuffle);
 	mc = _mm_packus_epi16(mc, mc);
-	__m256i vc = _mm256_broadcastsi128_si256(mc);
+	__m256i vc = _mm256_broadcastq_epi64(mc);
 
 	__m256i vt = *(const __m256i*)gTableInterpolate2GR_U8;
 

diff --git a/src/SnippetComputeOpaqueSubset3.h b/src/SnippetComputeOpaqueSubset3.h
@@ -11,15 +11,97 @@ static INLINED int ComputeOpaqueSubsetError3(const Area& area, __m128i mc, const
 	const __m128i mweights = gWeightsGRB;
 	const __m128i mfix = gFixWeightsGRB;
 
-#if defined(OPTION_AVX2)
-	const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+#if defined(OPTION_AVX512)
+	const __m512i wweights = _mm512_broadcastq_epi64(mweights);
+
+	const __m512i whalf = _mm512_set1_epi16(32);
+	const __m512i wsign = _mm512_set1_epi16(-0x8000);
+	const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
+
+	mc = _mm_packus_epi16(mc, mc);
+	__m512i wc = _mm512_broadcastq_epi64(mc);
+
+	__m512i wt = *(const __m512i*)gTableInterpolate3_U8;
+
+	wt = _mm512_maddubs_epi16(wc, wt);
+
+	wt = _mm512_add_epi16(wt, whalf);
+
+	wt = _mm512_srli_epi16(wt, 6);
+
+	__m512i wtx = _mm512_permutex_epi64(wt, 0x44);
+	__m512i wty = _mm512_permutex_epi64(wt, 0xEE);
+
+	int k = static_cast<int>(area.Count);
+	const __m256i* p = (const __m256i*)area.DataMask_I16;
+
+	while ((k -= 2) >= 0)
+	{
+		__m256i vpacked = _mm256_load_si256(p);
+		__m256i vpixel = _mm256_unpacklo_epi64(vpacked, vpacked);
+		__m512i wpixel = _mm512_broadcast_i64x4(vpixel);
+
+		merrorBlock = _mm_add_epi32(merrorBlock, mfix2);
+
+		__m512i wx = _mm512_sub_epi16(wpixel, wtx);
+		__m512i wy = _mm512_sub_epi16(wpixel, wty);
+
+		wx = _mm512_mullo_epi16(wx, wx);
+		wy = _mm512_mullo_epi16(wy, wy);
+
+		wx = _mm512_xor_epi32(wx, wsign);
+		wy = _mm512_xor_epi32(wy, wsign);
+
+		wx = _mm512_madd_epi16(wx, wweights);
+		wy = _mm512_madd_epi16(wy, wweights);
+
+		wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
+		wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));
+
+		wx = _mm512_min_epi32(wx, wy);
+		__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
+		vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));
+
+		merrorBlock = _mm_add_epi32(merrorBlock, _mm256_castsi256_si128(vx));
+		merrorBlock = _mm_add_epi32(merrorBlock, _mm256_extracti128_si256(vx, 1));
+
+		p++;
+
+		if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
+			break;
+	}
+
+	if (k & 1)
+	{
+		__m128i mpacked = _mm_load_si128((const __m128i*)p);
+		__m512i wpixel = _mm512_broadcastq_epi64(mpacked);
+
+		merrorBlock = _mm_add_epi32(merrorBlock, mfix);
+
+		__m512i wx = _mm512_sub_epi16(wpixel, wt);
+
+		wx = _mm512_mullo_epi16(wx, wx);
+
+		wx = _mm512_xor_epi32(wx, wsign);
+
+		wx = _mm512_madd_epi16(wx, wweights);
+
+		wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
+
+		__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
+		vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));
+
+		merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
+	}
+#elif defined(OPTION_AVX2)
+	const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 	const __m256i vhalf = _mm256_set1_epi16(32);
 	const __m256i vsign = _mm256_set1_epi16(-0x8000);
 	const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
 
 	mc = _mm_packus_epi16(mc, mc);
-	__m256i vc = _mm256_broadcastsi128_si256(mc);
+	__m256i vc = _mm256_broadcastq_epi64(mc);
 
 	__m256i vt0 = *(const __m256i*)&gTableInterpolate3_U8[0];
 	__m256i vt1 = *(const __m256i*)&gTableInterpolate3_U8[2];
@@ -189,15 +271,15 @@ static INLINED int ComputeOpaqueSubsetError3Pair(const Area& area, __m128i mc, c
 	__m128i merrorBlock = _mm_setzero_si128();
 
 #if defined(OPTION_AVX2)
-	const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
+	const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 	const __m256i vhalf = _mm256_set1_epi16(32);
 	const __m256i vsign = _mm256_set1_epi16(-0x8000);
 	const __m128i mfix4 = _mm_slli_epi32(mfix, 2);
 
 	mc = _mm_shuffle_epi32(mc, shuffle);
 	mc = _mm_packus_epi16(mc, mc);
-	__m256i vc = _mm256_broadcastsi128_si256(mc);
+	__m256i vc = _mm256_broadcastq_epi64(mc);
 
 	__m256i vt = *(const __m256i*)gTableInterpolate3GR_U8;
 

diff --git a/src/SnippetLevelsBuffer.h b/src/SnippetLevelsBuffer.h
@@ -478,7 +478,7 @@ class LevelsBuffer final
 
 	static INLINED void Estimate32Short(NodeShort*& nodesPtr, const uint8_t* values[16], const size_t count, const int c, const __m128i mtop) noexcept
 	{
-		const __m512i wwater = _mm512_maskz_broadcastw_epi16(kFullMask32, mtop);
+		const __m512i wwater = _mm512_broadcastw_epi16(mtop);
 
 		__m512i wsum = _mm512_setzero_si512();
 		uint32_t flags = ~0ui32;
@@ -491,21 +491,21 @@ class LevelsBuffer final
 
 			__m256i vdelta = _mm256_load_si256(p);
 
-			__m512i wadd = _mm512_maskz_cvtepu8_epi16(kFullMask32, vdelta);
+			__m512i wadd = _mm512_cvtepu8_epi16(vdelta);
 
-			wadd = _mm512_maskz_mullo_epi16(kFullMask32, wadd, wadd);
+			wadd = _mm512_mullo_epi16(wadd, wadd);
 
-			wsum = _mm512_maskz_adds_epu16(kFullMask32, wsum, wadd);
+			wsum = _mm512_adds_epu16(wsum, wadd);
 
-			flags = _mm512_mask_cmp_epu16_mask(kFullMask32, wwater, wsum, _MM_CMPINT_GT);
+			flags = _mm512_cmp_epu16_mask(wwater, wsum, _MM_CMPINT_GT);
 			if (!flags)
 				return;
 		}
 
 		Store8N(nodesPtr, _mm512_castsi512_si128(wsum), flags, c);
-		Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 1), flags >> 8, c + 8);
-		Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 2), flags >> 16, c + 16);
-		Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 3), flags >> 24, c + 24);
+		Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 1), flags >> 8, c + 8);
+		Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 2), flags >> 16, c + 16);
+		Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 3), flags >> 24, c + 24);
 	}
 
 	static INLINED void Estimate16Short(NodeShort*& nodesPtr, const uint8_t* values[16], const size_t count, const int c, const __m128i mtop) noexcept