Refactoring & optimizations

andrewevstyukhin · Jul 8, 2020 · 926cb25 · 926cb25
1 parent 462db37
commit 926cb25
Show file tree

Hide file tree

Showing 14 changed files with 774 additions and 1,073 deletions.
diff --git a/README.md b/README.md
@@ -20,9 +20,11 @@ Modes 7, 1, 3 are memory-bound because of large tables, they partially limited i
 
 For premultiplied alpha it is necessary to specify the "/nomask" command-line option, while extruded RGBA images can benefit greatly from masking. The "/retina" switch allows future artifact-free scaling by 0.5. Masking gives smaller compressed images and better borders, because masked pixels can have any value.
 
+Many encoders use a single metric (RMSE / MSE / PSNR) that is insensitive to direction. While SSIM is unwieldy for direct compression, it enhances correlation when encoding produces equal deltas, and so SSIM overcomes dithering.
+
 ## Usage
 
-The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX512BW - capable CPUs for Win64 API only.
+The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX-512BW - capable CPUs for Win64 API only.
 
 `Bc7Compress /nomask /noflip source.png destination.ktx [/debug result.png]`
 

diff --git a/src/Bc7Compress.vcxproj b/src/Bc7Compress.vcxproj
@@ -129,6 +129,7 @@
     <ClInclude Include="SnippetHorizontalSum4.h" />
     <ClInclude Include="SnippetInsertRemoveZeroBit.h" />
     <ClInclude Include="SnippetLevelsBuffer.h" />
+    <ClInclude Include="SnippetLevelsBufferHalf.h" />
     <ClInclude Include="SnippetLevelsMinimum.h" />
     <ClInclude Include="SnippetTargetSSSE3.h" />
     <ClInclude Include="Worker.h" />

diff --git a/src/Bc7Compress.vcxproj.filters b/src/Bc7Compress.vcxproj.filters
@@ -59,6 +59,9 @@
     <ClInclude Include="SnippetTargetSSSE3.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="SnippetLevelsBufferHalf.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="pch.cpp">

diff --git a/src/Bc7Core.cpp b/src/Bc7Core.cpp
@@ -9,6 +9,7 @@
 #if defined(OPTION_COUNTERS)
 #include "SnippetLevelsMinimum.h"
 #include "SnippetLevelsBuffer.h"
+#include "SnippetLevelsBufferHalf.h"
 #endif
 
 #if defined(OPTION_COUNTERS)
@@ -1316,8 +1317,8 @@ void CompressStatistics()
 
 	PRINTF("[Minimum]\tFull = %i, Short = %i",
 		gMinimumFull.load(), gMinimumShort.load());
-	PRINTF("[Estimate]\tFull = %i, Short = %i",
-		gEstimateFull.load(), gEstimateShort.load());
+	PRINTF("[Estimate]\tFull = %i, Short = %i, Half = %i",
+		gEstimateFull.load(), gEstimateShort.load(), gEstimateHalf.load());
 
 	PRINTF("\t\t[1] = %i, [2] = %i, [3] = %i, [4] = %i",
 		gLevels[1].load(), gLevels[2].load(), gLevels[3].load(), gLevels[4].load());

diff --git a/src/Bc7CoreMode6.cpp b/src/Bc7CoreMode6.cpp
@@ -5,7 +5,7 @@
 #include "Bc7Pca.h"
 
 #include "SnippetInsertRemoveZeroBit.h"
-#include "SnippetLevelsBuffer.h"
+#include "SnippetLevelsBufferHalf.h"
 
 // https://docs.microsoft.com/en-us/windows/desktop/direct3d11/bc7-format-mode-reference#mode-6
 
@@ -109,7 +109,7 @@ namespace Mode6 {
 		if (error)
 		{
 			error *= kAlpha;
-			int v = gTableDeltas4_Value8[0][alpha];
+			int v = (gTableDeltas4Half_Value8[0][alpha >> 1] >> ((alpha & 1) << 2)) & 0xF;
 			error *= v * v;
 		}
 
@@ -120,7 +120,114 @@ namespace Mode6 {
 	{
 		__m128i merrorBlock = _mm_setzero_si128();
 
-#if defined(OPTION_AVX2)
+#if defined(OPTION_AVX512)
+		const __m512i wweights = _mm512_broadcastq_epi64(mweights);
+
+		const __m512i whalf = _mm512_set1_epi16(32);
+
+		mc = _mm_packus_epi16(mc, mc);
+		__m512i wc = _mm512_broadcastq_epi64(mc);
+
+		__m512i wt0 = *(const __m512i*)&gTableInterpolate4_U8[0];
+		__m512i wt1 = *(const __m512i*)&gTableInterpolate4_U8[4];
+
+		wt0 = _mm512_maddubs_epi16(wc, wt0);
+		wt1 = _mm512_maddubs_epi16(wc, wt1);
+
+		wt0 = _mm512_add_epi16(wt0, whalf);
+		wt1 = _mm512_add_epi16(wt1, whalf);
+
+		wt0 = _mm512_srli_epi16(wt0, 6);
+		wt1 = _mm512_srli_epi16(wt1, 6);
+
+		__m512i wtx = _mm512_permutex_epi64(wt0, 0x44);
+		__m512i wty = _mm512_permutex_epi64(wt0, 0xEE);
+		__m512i wtz = _mm512_permutex_epi64(wt1, 0x44);
+		__m512i wtw = _mm512_permutex_epi64(wt1, 0xEE);
+
+		int k = static_cast<int>(area.Active);
+		const __m256i* p = (const __m256i*)area.DataMask_I16;
+
+		while ((k -= 2) >= 0)
+		{
+			__m256i vpacked = _mm256_load_si256(p);
+			__m256i vpixel = _mm256_unpacklo_epi64(vpacked, vpacked);
+			__m512i wpixel = _mm512_broadcast_i64x4(vpixel);
+
+			__m512i wx = _mm512_sub_epi16(wpixel, wtx);
+			__m512i wy = _mm512_sub_epi16(wpixel, wty);
+			__m512i wz = _mm512_sub_epi16(wpixel, wtz);
+			__m512i ww = _mm512_sub_epi16(wpixel, wtw);
+
+			wx = _mm512_abs_epi16(wx);
+			wy = _mm512_abs_epi16(wy);
+			wz = _mm512_abs_epi16(wz);
+			ww = _mm512_abs_epi16(ww);
+
+			wx = _mm512_srli_epi16(wx, kDenoise);
+			wy = _mm512_srli_epi16(wy, kDenoise);
+			wz = _mm512_srli_epi16(wz, kDenoise);
+			ww = _mm512_srli_epi16(ww, kDenoise);
+
+			wx = _mm512_mullo_epi16(wx, wx);
+			wy = _mm512_mullo_epi16(wy, wy);
+			wz = _mm512_mullo_epi16(wz, wz);
+			ww = _mm512_mullo_epi16(ww, ww);
+
+			wx = _mm512_madd_epi16(wx, wweights);
+			wy = _mm512_madd_epi16(wy, wweights);
+			wz = _mm512_madd_epi16(wz, wweights);
+			ww = _mm512_madd_epi16(ww, wweights);
+
+			wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
+			wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));
+			wz = _mm512_add_epi32(wz, _mm512_shuffle_epi32(wz, _MM_SHUFFLE(2, 3, 0, 1)));
+			ww = _mm512_add_epi32(ww, _mm512_shuffle_epi32(ww, _MM_SHUFFLE(2, 3, 0, 1)));
+
+			wx = _mm512_min_epi32(_mm512_min_epi32(wx, wy), _mm512_min_epi32(wz, ww));
+			__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
+			vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));
+
+			merrorBlock = _mm_add_epi32(merrorBlock, _mm256_castsi256_si128(vx));
+			merrorBlock = _mm_add_epi32(merrorBlock, _mm256_extracti128_si256(vx, 1));
+
+			p++;
+
+			if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
+				goto done;
+		}
+
+		if (k & 1)
+		{
+			__m128i mpacked = _mm_load_si128((const __m128i*)p);
+			__m512i wpixel = _mm512_broadcastq_epi64(mpacked);
+
+			__m512i wx = _mm512_sub_epi16(wpixel, wt0);
+			__m512i wy = _mm512_sub_epi16(wpixel, wt1);
+
+			wx = _mm512_abs_epi16(wx);
+			wy = _mm512_abs_epi16(wy);
+
+			wx = _mm512_srli_epi16(wx, kDenoise);
+			wy = _mm512_srli_epi16(wy, kDenoise);
+
+			wx = _mm512_mullo_epi16(wx, wx);
+			wy = _mm512_mullo_epi16(wy, wy);
+
+			wx = _mm512_madd_epi16(wx, wweights);
+			wy = _mm512_madd_epi16(wy, wweights);
+
+			wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
+			wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));
+
+			wx = _mm512_min_epi32(wx, wy);
+			__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
+			vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));
+
+			merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
+		}
+	done:
+#elif defined(OPTION_AVX2)
 		const __m256i vweights = _mm256_broadcastq_epi64(mweights);
 
 		const __m256i vhalf = _mm256_set1_epi16(32);
@@ -392,7 +499,7 @@ namespace Mode6 {
 			p++;
 
 			if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
-				break;
+				goto done;
 		}
 
 		if (k & 1)
@@ -422,6 +529,7 @@ namespace Mode6 {
 
 			merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
 		}
+	done:
 #else
 		const __m128i mhalf = _mm_set1_epi16(32);
 
@@ -630,7 +738,7 @@ namespace Mode6 {
 	class Subset final
 	{
 	public:
-		LevelsBuffer<LevelsCapacity> ch0, ch1, ch2, ch3;
+		LevelsBufferHalf<LevelsCapacity> ch0, ch1, ch2, ch3;
 
 		ALWAYS_INLINED Subset() noexcept = default;
 
@@ -643,23 +751,23 @@ namespace Mode6 {
 			}
 			else
 			{
-				ch0.ComputeChannelLevelsReduced<7, pbits, false, gTableDeltas4_Value8>(area, 0, kAlpha, water);
+				ch0.ComputeChannelLevelsReduced<7, pbits, false, gTableDeltas4Half_Value8>(area, 0, kAlpha, water);
 			}
 			int min0 = ch0.MinErr;
 			if (min0 >= water)
 				return false;
 
-			ch1.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 1, kGreen, water - min0);
+			ch1.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 1, kGreen, water - min0);
 			int min1 = ch1.MinErr;
 			if (min0 + min1 >= water)
 				return false;
 
-			ch2.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 2, kRed, water - min0 - min1);
+			ch2.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 2, kRed, water - min0 - min1);
 			int min2 = ch2.MinErr;
 			if (min0 + min1 + min2 >= water)
 				return false;
 
-			ch3.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 3, kBlue, water - min0 - min1 - min2);
+			ch3.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 3, kBlue, water - min0 - min1 - min2);
 			int min3 = ch3.MinErr;
 			if (min0 + min1 + min2 + min3 >= water)
 				return false;

diff --git a/src/Bc7CoreMode7.cpp b/src/Bc7CoreMode7.cpp
@@ -213,7 +213,7 @@ namespace Mode7 {
 			p += 2;
 
 			if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
-				break;
+				goto done;
 		}
 
 		if (k & 2)
@@ -269,6 +269,7 @@ namespace Mode7 {
 
 			merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
 		}
+	done:
 #else
 		const __m128i mhalf = _mm_set1_epi16(32);
 
@@ -384,7 +385,7 @@ namespace Mode7 {
 			p += 2;
 
 			if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
-				break;
+				goto done;
 		}
 
 		if (k & 2)
@@ -433,6 +434,7 @@ namespace Mode7 {
 
 			merrorBlock = _mm_add_epi32(merrorBlock, mx);
 		}
+	done:
 #else
 		const __m128i mhalf = _mm_set1_epi16(32);
 

diff --git a/src/Bc7Tables.cpp b/src/Bc7Tables.cpp
@@ -492,7 +492,7 @@ alignas(64) uint8_t gTableDeltas3_Value5[0x100][0x20 * 0x20];
 alignas(64) uint16_t gTableCuts3_Value7Shared[0x100][0x80];
 alignas(64) uint16_t gTableCuts3_Value5[0x100][0x20];
 
-alignas(64) uint8_t gTableDeltas4_Value8[0x100][0x100 * 0x100];
+alignas(64) uint8_t gTableDeltas4Half_Value8[0x100][0x100 * 0x80];
 
 template<int bits>
 static INLINED void ReduceLevels(const uint8_t table[0x100][0x100 * 0x100], uint8_t* p)
@@ -584,15 +584,21 @@ void InitLevels() noexcept
 {
 	const __m128i mhalf = _mm_set1_epi16(32);
 
-	// 2-bit index
+	// 3-bit index
 	{
+		const auto gTableDeltas3_Value8 = gTableDeltas2_Value8;
+
 		__m128i mratio = _mm_setzero_si128();
 		{
-			__m128i m0 = gTableInterpolate2_U8[0];
-			__m128i m1 = gTableInterpolate2_U8[1];
+			__m128i m0 = gTableInterpolate3_U8[0];
+			__m128i m1 = gTableInterpolate3_U8[1];
+			__m128i m2 = gTableInterpolate3_U8[2];
+			__m128i m3 = gTableInterpolate3_U8[3];
 
-			mratio = _mm_blend_epi16(mratio, m0, 0x11 + 0x44);
-			mratio = _mm_blend_epi16(mratio, m1, 0x22 + 0x88);
+			mratio = _mm_blend_epi16(mratio, m0, 0x11);
+			mratio = _mm_blend_epi16(mratio, m1, 0x22);
+			mratio = _mm_blend_epi16(mratio, m2, 0x44);
+			mratio = _mm_blend_epi16(mratio, m3, 0x88);
 		}
 
 		for (int x = 0; x < 0x100; x++)
@@ -616,35 +622,28 @@ void InitLevels() noexcept
 
 					mv = _mm_srli_epi16(mv, kDenoise);
 
-					gTableDeltas2_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
+					gTableDeltas3_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
 				}
 			}
 		}
 
-		ReduceLevels<7>(gTableDeltas2_Value8, &gTableDeltas2_Value7[0][0]);
-		ReduceLevels<6>(gTableDeltas2_Value8, &gTableDeltas2_Value6[0][0]);
-		ReduceLevels<5>(gTableDeltas2_Value8, &gTableDeltas2_Value5[0][0]);
+		ReduceLevels<7>(gTableDeltas3_Value8, &gTableDeltas3_Value7Shared[0][0]); FilterSharedLevels<7>(&gTableDeltas3_Value7Shared[0][0]);
+		ReduceLevels<6>(gTableDeltas3_Value8, &gTableDeltas3_Value6[0][0]);
+		ReduceLevels<5>(gTableDeltas3_Value8, &gTableDeltas3_Value5[0][0]);
 
-		CutLevels<8>(gTableDeltas2_Value8, gTableCuts2_Value8);
-		CutLevels<6>(gTableDeltas2_Value6, gTableCuts2_Value6);
-		CutLevels<5>(gTableDeltas2_Value5, gTableCuts2_Value5);
+		CutLevels<7>(gTableDeltas3_Value7Shared, gTableCuts3_Value7Shared);
+		CutLevels<5>(gTableDeltas3_Value5, gTableCuts3_Value5);
 	}
 
-	// 3-bit index
+	// 2-bit index
 	{
-		const auto gTableDeltas3_Value8 = gTableDeltas4_Value8;
-
 		__m128i mratio = _mm_setzero_si128();
 		{
-			__m128i m0 = gTableInterpolate3_U8[0];
-			__m128i m1 = gTableInterpolate3_U8[1];
-			__m128i m2 = gTableInterpolate3_U8[2];
-			__m128i m3 = gTableInterpolate3_U8[3];
+			__m128i m0 = gTableInterpolate2_U8[0];
+			__m128i m1 = gTableInterpolate2_U8[1];
 
-			mratio = _mm_blend_epi16(mratio, m0, 0x11);
-			mratio = _mm_blend_epi16(mratio, m1, 0x22);
-			mratio = _mm_blend_epi16(mratio, m2, 0x44);
-			mratio = _mm_blend_epi16(mratio, m3, 0x88);
+			mratio = _mm_blend_epi16(mratio, m0, 0x11 + 0x44);
+			mratio = _mm_blend_epi16(mratio, m1, 0x22 + 0x88);
 		}
 
 		for (int x = 0; x < 0x100; x++)
@@ -668,17 +667,18 @@ void InitLevels() noexcept
 
 					mv = _mm_srli_epi16(mv, kDenoise);
 
-					gTableDeltas3_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
+					gTableDeltas2_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
 				}
 			}
 		}
 
-		ReduceLevels<7>(gTableDeltas3_Value8, &gTableDeltas3_Value7Shared[0][0]); FilterSharedLevels<7>(&gTableDeltas3_Value7Shared[0][0]);
-		ReduceLevels<6>(gTableDeltas3_Value8, &gTableDeltas3_Value6[0][0]);
-		ReduceLevels<5>(gTableDeltas3_Value8, &gTableDeltas3_Value5[0][0]);
+		ReduceLevels<7>(gTableDeltas2_Value8, &gTableDeltas2_Value7[0][0]);
+		ReduceLevels<6>(gTableDeltas2_Value8, &gTableDeltas2_Value6[0][0]);
+		ReduceLevels<5>(gTableDeltas2_Value8, &gTableDeltas2_Value5[0][0]);
 
-		CutLevels<7>(gTableDeltas3_Value7Shared, gTableCuts3_Value7Shared);
-		CutLevels<5>(gTableDeltas3_Value5, gTableCuts3_Value5);
+		CutLevels<8>(gTableDeltas2_Value8, gTableCuts2_Value8);
+		CutLevels<6>(gTableDeltas2_Value6, gTableCuts2_Value6);
+		CutLevels<5>(gTableDeltas2_Value5, gTableCuts2_Value5);
 	}
 
 	// 4-bit index
@@ -733,11 +733,13 @@ void InitLevels() noexcept
 					mv0 = _mm_abs_epi16(mv0);
 					mv1 = _mm_abs_epi16(mv1);
 
-					__m128i mv = _mm_min_epu16(mv0, mv1);
+					__m128i mv = _mm_min_epi16(mv0, mv1);
 
 					mv = _mm_srli_epi16(mv, kDenoise);
 
-					gTableDeltas4_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
+					mv = _mm_min_epi16(mv, _mm_set1_epi16(0xF));
+
+					gTableDeltas4Half_Value8[x][c >> 1] |= static_cast<uint8_t>(_mm_extract_epi16(_mm_minpos_epu16(mv), 0) << ((c & 1) << 2));
 				}
 			}
 		}