Skip to content

Commit

Permalink
Refactoring & optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Evstyukhin committed Jul 8, 2020
1 parent 462db37 commit 926cb25
Show file tree
Hide file tree
Showing 14 changed files with 774 additions and 1,073 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ Modes 7, 1, 3 are memory-bound because of large tables, they partially limited i

For premultiplied alpha it is necessary to specify the "/nomask" command-line option, whereas extruded RGBA images can benefit greatly from masking. The "/retina" switch allows future artifact-free scaling by 0.5. Masking gives smaller compressed images and better borders, because masked pixels can have any value.

Many encoders use a single metric (RMSE / MSE / PSNR) that is insensitive to direction. While SSIM is unwieldy for direct compression, it enhances correlation when encoding produces equal deltas, and so SSIM overcomes dithering.

## Usage

The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX512BW - capable CPUs for Win64 API only.
The solution was tested on SSSE3-, SSE4.1-, AVX-, AVX2- and AVX-512BW-capable CPUs, for the Win64 API only.

`Bc7Compress /nomask /noflip source.png destination.ktx [/debug result.png]`

Expand Down
1 change: 1 addition & 0 deletions src/Bc7Compress.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@
<ClInclude Include="SnippetHorizontalSum4.h" />
<ClInclude Include="SnippetInsertRemoveZeroBit.h" />
<ClInclude Include="SnippetLevelsBuffer.h" />
<ClInclude Include="SnippetLevelsBufferHalf.h" />
<ClInclude Include="SnippetLevelsMinimum.h" />
<ClInclude Include="SnippetTargetSSSE3.h" />
<ClInclude Include="Worker.h" />
Expand Down
3 changes: 3 additions & 0 deletions src/Bc7Compress.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
<ClInclude Include="SnippetTargetSSSE3.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="SnippetLevelsBufferHalf.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="pch.cpp">
Expand Down
5 changes: 3 additions & 2 deletions src/Bc7Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#if defined(OPTION_COUNTERS)
#include "SnippetLevelsMinimum.h"
#include "SnippetLevelsBuffer.h"
#include "SnippetLevelsBufferHalf.h"
#endif

#if defined(OPTION_COUNTERS)
Expand Down Expand Up @@ -1316,8 +1317,8 @@ void CompressStatistics()

PRINTF("[Minimum]\tFull = %i, Short = %i",
gMinimumFull.load(), gMinimumShort.load());
PRINTF("[Estimate]\tFull = %i, Short = %i",
gEstimateFull.load(), gEstimateShort.load());
PRINTF("[Estimate]\tFull = %i, Short = %i, Half = %i",
gEstimateFull.load(), gEstimateShort.load(), gEstimateHalf.load());

PRINTF("\t\t[1] = %i, [2] = %i, [3] = %i, [4] = %i",
gLevels[1].load(), gLevels[2].load(), gLevels[3].load(), gLevels[4].load());
Expand Down
126 changes: 117 additions & 9 deletions src/Bc7CoreMode6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include "Bc7Pca.h"

#include "SnippetInsertRemoveZeroBit.h"
#include "SnippetLevelsBuffer.h"
#include "SnippetLevelsBufferHalf.h"

// https://docs.microsoft.com/en-us/windows/desktop/direct3d11/bc7-format-mode-reference#mode-6

Expand Down Expand Up @@ -109,7 +109,7 @@ namespace Mode6 {
if (error)
{
error *= kAlpha;
int v = gTableDeltas4_Value8[0][alpha];
int v = (gTableDeltas4Half_Value8[0][alpha >> 1] >> ((alpha & 1) << 2)) & 0xF;
error *= v * v;
}

Expand All @@ -120,7 +120,114 @@ namespace Mode6 {
{
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
#if defined(OPTION_AVX512)
const __m512i wweights = _mm512_broadcastq_epi64(mweights);

const __m512i whalf = _mm512_set1_epi16(32);

mc = _mm_packus_epi16(mc, mc);
__m512i wc = _mm512_broadcastq_epi64(mc);

__m512i wt0 = *(const __m512i*)&gTableInterpolate4_U8[0];
__m512i wt1 = *(const __m512i*)&gTableInterpolate4_U8[4];

wt0 = _mm512_maddubs_epi16(wc, wt0);
wt1 = _mm512_maddubs_epi16(wc, wt1);

wt0 = _mm512_add_epi16(wt0, whalf);
wt1 = _mm512_add_epi16(wt1, whalf);

wt0 = _mm512_srli_epi16(wt0, 6);
wt1 = _mm512_srli_epi16(wt1, 6);

__m512i wtx = _mm512_permutex_epi64(wt0, 0x44);
__m512i wty = _mm512_permutex_epi64(wt0, 0xEE);
__m512i wtz = _mm512_permutex_epi64(wt1, 0x44);
__m512i wtw = _mm512_permutex_epi64(wt1, 0xEE);

int k = static_cast<int>(area.Active);
const __m256i* p = (const __m256i*)area.DataMask_I16;

while ((k -= 2) >= 0)
{
__m256i vpacked = _mm256_load_si256(p);
__m256i vpixel = _mm256_unpacklo_epi64(vpacked, vpacked);
__m512i wpixel = _mm512_broadcast_i64x4(vpixel);

__m512i wx = _mm512_sub_epi16(wpixel, wtx);
__m512i wy = _mm512_sub_epi16(wpixel, wty);
__m512i wz = _mm512_sub_epi16(wpixel, wtz);
__m512i ww = _mm512_sub_epi16(wpixel, wtw);

wx = _mm512_abs_epi16(wx);
wy = _mm512_abs_epi16(wy);
wz = _mm512_abs_epi16(wz);
ww = _mm512_abs_epi16(ww);

wx = _mm512_srli_epi16(wx, kDenoise);
wy = _mm512_srli_epi16(wy, kDenoise);
wz = _mm512_srli_epi16(wz, kDenoise);
ww = _mm512_srli_epi16(ww, kDenoise);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);
wz = _mm512_mullo_epi16(wz, wz);
ww = _mm512_mullo_epi16(ww, ww);

wx = _mm512_madd_epi16(wx, wweights);
wy = _mm512_madd_epi16(wy, wweights);
wz = _mm512_madd_epi16(wz, wweights);
ww = _mm512_madd_epi16(ww, wweights);

wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));
wz = _mm512_add_epi32(wz, _mm512_shuffle_epi32(wz, _MM_SHUFFLE(2, 3, 0, 1)));
ww = _mm512_add_epi32(ww, _mm512_shuffle_epi32(ww, _MM_SHUFFLE(2, 3, 0, 1)));

wx = _mm512_min_epi32(_mm512_min_epi32(wx, wy), _mm512_min_epi32(wz, ww));
__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));

merrorBlock = _mm_add_epi32(merrorBlock, _mm256_castsi256_si128(vx));
merrorBlock = _mm_add_epi32(merrorBlock, _mm256_extracti128_si256(vx, 1));

p++;

if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
goto done;
}

if (k & 1)
{
__m128i mpacked = _mm_load_si128((const __m128i*)p);
__m512i wpixel = _mm512_broadcastq_epi64(mpacked);

__m512i wx = _mm512_sub_epi16(wpixel, wt0);
__m512i wy = _mm512_sub_epi16(wpixel, wt1);

wx = _mm512_abs_epi16(wx);
wy = _mm512_abs_epi16(wy);

wx = _mm512_srli_epi16(wx, kDenoise);
wy = _mm512_srli_epi16(wy, kDenoise);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);

wx = _mm512_madd_epi16(wx, wweights);
wy = _mm512_madd_epi16(wy, wweights);

wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));

wx = _mm512_min_epi32(wx, wy);
__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));

merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
}
done:
#elif defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
Expand Down Expand Up @@ -392,7 +499,7 @@ namespace Mode6 {
p++;

if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
break;
goto done;
}

if (k & 1)
Expand Down Expand Up @@ -422,6 +529,7 @@ namespace Mode6 {

merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
}
done:
#else
const __m128i mhalf = _mm_set1_epi16(32);

Expand Down Expand Up @@ -630,7 +738,7 @@ namespace Mode6 {
class Subset final
{
public:
LevelsBuffer<LevelsCapacity> ch0, ch1, ch2, ch3;
LevelsBufferHalf<LevelsCapacity> ch0, ch1, ch2, ch3;

ALWAYS_INLINED Subset() noexcept = default;

Expand All @@ -643,23 +751,23 @@ namespace Mode6 {
}
else
{
ch0.ComputeChannelLevelsReduced<7, pbits, false, gTableDeltas4_Value8>(area, 0, kAlpha, water);
ch0.ComputeChannelLevelsReduced<7, pbits, false, gTableDeltas4Half_Value8>(area, 0, kAlpha, water);
}
int min0 = ch0.MinErr;
if (min0 >= water)
return false;

ch1.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 1, kGreen, water - min0);
ch1.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 1, kGreen, water - min0);
int min1 = ch1.MinErr;
if (min0 + min1 >= water)
return false;

ch2.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 2, kRed, water - min0 - min1);
ch2.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 2, kRed, water - min0 - min1);
int min2 = ch2.MinErr;
if (min0 + min1 + min2 >= water)
return false;

ch3.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4_Value8>(area, 3, kBlue, water - min0 - min1 - min2);
ch3.ComputeChannelLevelsReduced<7, pbits, true, gTableDeltas4Half_Value8>(area, 3, kBlue, water - min0 - min1 - min2);
int min3 = ch3.MinErr;
if (min0 + min1 + min2 + min3 >= water)
return false;
Expand Down
6 changes: 4 additions & 2 deletions src/Bc7CoreMode7.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ namespace Mode7 {
p += 2;

if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
break;
goto done;
}

if (k & 2)
Expand Down Expand Up @@ -269,6 +269,7 @@ namespace Mode7 {

merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
}
done:
#else
const __m128i mhalf = _mm_set1_epi16(32);

Expand Down Expand Up @@ -384,7 +385,7 @@ namespace Mode7 {
p += 2;

if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
break;
goto done;
}

if (k & 2)
Expand Down Expand Up @@ -433,6 +434,7 @@ namespace Mode7 {

merrorBlock = _mm_add_epi32(merrorBlock, mx);
}
done:
#else
const __m128i mhalf = _mm_set1_epi16(32);

Expand Down
66 changes: 34 additions & 32 deletions src/Bc7Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ alignas(64) uint8_t gTableDeltas3_Value5[0x100][0x20 * 0x20];
alignas(64) uint16_t gTableCuts3_Value7Shared[0x100][0x80];
alignas(64) uint16_t gTableCuts3_Value5[0x100][0x20];

alignas(64) uint8_t gTableDeltas4_Value8[0x100][0x100 * 0x100];
alignas(64) uint8_t gTableDeltas4Half_Value8[0x100][0x100 * 0x80];

template<int bits>
static INLINED void ReduceLevels(const uint8_t table[0x100][0x100 * 0x100], uint8_t* p)
Expand Down Expand Up @@ -584,15 +584,21 @@ void InitLevels() noexcept
{
const __m128i mhalf = _mm_set1_epi16(32);

// 2-bit index
// 3-bit index
{
const auto gTableDeltas3_Value8 = gTableDeltas2_Value8;

__m128i mratio = _mm_setzero_si128();
{
__m128i m0 = gTableInterpolate2_U8[0];
__m128i m1 = gTableInterpolate2_U8[1];
__m128i m0 = gTableInterpolate3_U8[0];
__m128i m1 = gTableInterpolate3_U8[1];
__m128i m2 = gTableInterpolate3_U8[2];
__m128i m3 = gTableInterpolate3_U8[3];

mratio = _mm_blend_epi16(mratio, m0, 0x11 + 0x44);
mratio = _mm_blend_epi16(mratio, m1, 0x22 + 0x88);
mratio = _mm_blend_epi16(mratio, m0, 0x11);
mratio = _mm_blend_epi16(mratio, m1, 0x22);
mratio = _mm_blend_epi16(mratio, m2, 0x44);
mratio = _mm_blend_epi16(mratio, m3, 0x88);
}

for (int x = 0; x < 0x100; x++)
Expand All @@ -616,35 +622,28 @@ void InitLevels() noexcept

mv = _mm_srli_epi16(mv, kDenoise);

gTableDeltas2_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
gTableDeltas3_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
}
}
}

ReduceLevels<7>(gTableDeltas2_Value8, &gTableDeltas2_Value7[0][0]);
ReduceLevels<6>(gTableDeltas2_Value8, &gTableDeltas2_Value6[0][0]);
ReduceLevels<5>(gTableDeltas2_Value8, &gTableDeltas2_Value5[0][0]);
ReduceLevels<7>(gTableDeltas3_Value8, &gTableDeltas3_Value7Shared[0][0]); FilterSharedLevels<7>(&gTableDeltas3_Value7Shared[0][0]);
ReduceLevels<6>(gTableDeltas3_Value8, &gTableDeltas3_Value6[0][0]);
ReduceLevels<5>(gTableDeltas3_Value8, &gTableDeltas3_Value5[0][0]);

CutLevels<8>(gTableDeltas2_Value8, gTableCuts2_Value8);
CutLevels<6>(gTableDeltas2_Value6, gTableCuts2_Value6);
CutLevels<5>(gTableDeltas2_Value5, gTableCuts2_Value5);
CutLevels<7>(gTableDeltas3_Value7Shared, gTableCuts3_Value7Shared);
CutLevels<5>(gTableDeltas3_Value5, gTableCuts3_Value5);
}

// 3-bit index
// 2-bit index
{
const auto gTableDeltas3_Value8 = gTableDeltas4_Value8;

__m128i mratio = _mm_setzero_si128();
{
__m128i m0 = gTableInterpolate3_U8[0];
__m128i m1 = gTableInterpolate3_U8[1];
__m128i m2 = gTableInterpolate3_U8[2];
__m128i m3 = gTableInterpolate3_U8[3];
__m128i m0 = gTableInterpolate2_U8[0];
__m128i m1 = gTableInterpolate2_U8[1];

mratio = _mm_blend_epi16(mratio, m0, 0x11);
mratio = _mm_blend_epi16(mratio, m1, 0x22);
mratio = _mm_blend_epi16(mratio, m2, 0x44);
mratio = _mm_blend_epi16(mratio, m3, 0x88);
mratio = _mm_blend_epi16(mratio, m0, 0x11 + 0x44);
mratio = _mm_blend_epi16(mratio, m1, 0x22 + 0x88);
}

for (int x = 0; x < 0x100; x++)
Expand All @@ -668,17 +667,18 @@ void InitLevels() noexcept

mv = _mm_srli_epi16(mv, kDenoise);

gTableDeltas3_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
gTableDeltas2_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
}
}
}

ReduceLevels<7>(gTableDeltas3_Value8, &gTableDeltas3_Value7Shared[0][0]); FilterSharedLevels<7>(&gTableDeltas3_Value7Shared[0][0]);
ReduceLevels<6>(gTableDeltas3_Value8, &gTableDeltas3_Value6[0][0]);
ReduceLevels<5>(gTableDeltas3_Value8, &gTableDeltas3_Value5[0][0]);
ReduceLevels<7>(gTableDeltas2_Value8, &gTableDeltas2_Value7[0][0]);
ReduceLevels<6>(gTableDeltas2_Value8, &gTableDeltas2_Value6[0][0]);
ReduceLevels<5>(gTableDeltas2_Value8, &gTableDeltas2_Value5[0][0]);

CutLevels<7>(gTableDeltas3_Value7Shared, gTableCuts3_Value7Shared);
CutLevels<5>(gTableDeltas3_Value5, gTableCuts3_Value5);
CutLevels<8>(gTableDeltas2_Value8, gTableCuts2_Value8);
CutLevels<6>(gTableDeltas2_Value6, gTableCuts2_Value6);
CutLevels<5>(gTableDeltas2_Value5, gTableCuts2_Value5);
}

// 4-bit index
Expand Down Expand Up @@ -733,11 +733,13 @@ void InitLevels() noexcept
mv0 = _mm_abs_epi16(mv0);
mv1 = _mm_abs_epi16(mv1);

__m128i mv = _mm_min_epu16(mv0, mv1);
__m128i mv = _mm_min_epi16(mv0, mv1);

mv = _mm_srli_epi16(mv, kDenoise);

gTableDeltas4_Value8[x][c] = (uint8_t)_mm_extract_epi16(_mm_minpos_epu16(mv), 0);
mv = _mm_min_epi16(mv, _mm_set1_epi16(0xF));

gTableDeltas4Half_Value8[x][c >> 1] |= static_cast<uint8_t>(_mm_extract_epi16(_mm_minpos_epu16(mv), 0) << ((c & 1) << 2));
}
}
}
Expand Down
Loading

0 comments on commit 926cb25

Please sign in to comment.