Skip to content

Commit

Permalink
Add support for SSSE3, tune for AVX-512BW
Browse files Browse the repository at this point in the history
  • Loading branch information
Андрей Евстюхин committed May 12, 2020
1 parent 4db23df commit 9a32800
Show file tree
Hide file tree
Showing 16 changed files with 345 additions and 64 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Nebc7 always preserves opaque alpha for opaque blocks.

## Usage

The solution was tested on AVX-capable CPU for Win64 API only.
The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX512BW - capable CPUs for Win64 API only.

`Bc7Compress /nomask /noflip source.png destination.ktx [/debug result.png]`

Expand Down
1 change: 1 addition & 0 deletions src/Bc7Compress.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
<ClInclude Include="SnippetInsertRemoveZeroBit.h" />
<ClInclude Include="SnippetLevelsBuffer.h" />
<ClInclude Include="SnippetLevelsMinimum.h" />
<ClInclude Include="SnippetTargetSSSE3.h" />
<ClInclude Include="Worker.h" />
</ItemGroup>
<ItemGroup>
Expand Down
3 changes: 3 additions & 0 deletions src/Bc7Compress.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@
<ClInclude Include="Bc7Pca.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="SnippetTargetSSSE3.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="pch.cpp">
Expand Down
8 changes: 4 additions & 4 deletions src/Bc7CoreMode4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ namespace Mode4 {
const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(mweights), vrot);
const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastq_epi64(mweights), vrot);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0);
const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);
Expand Down Expand Up @@ -365,10 +365,10 @@ namespace Mode4 {
const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(mweights), vrot);
const __m256i vweights = _mm256_shuffle_epi8(_mm256_broadcastq_epi64(mweights), vrot);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0);
const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);
Expand Down
4 changes: 2 additions & 2 deletions src/Bc7CoreMode5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@ namespace Mode5 {
const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

const __m256i vmask3 = _mm256_shuffle_epi8(_mm256_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0), vrot);
const __m256i vweights3 = _mm256_and_si256(vweights, vmask3);
Expand Down
8 changes: 4 additions & 4 deletions src/Bc7CoreMode6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ namespace Mode6 {
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vtx = *(const __m256i*)&gTableInterpolate4_U8[0];
__m256i vty = *(const __m256i*)&gTableInterpolate4_U8[2];
Expand Down Expand Up @@ -314,15 +314,15 @@ namespace Mode6 {
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);

mc = _mm_shuffle_epi32(mc, shuffle);
mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt0 = *(const __m256i*)&gTableInterpolate4GR_U8[0];
__m256i vt1 = *(const __m256i*)&gTableInterpolate4GR_U8[2];
Expand Down
8 changes: 4 additions & 4 deletions src/Bc7CoreMode7.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,14 @@ namespace Mode7 {
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix4 = _mm_slli_epi32(mfix, 2);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

Expand Down Expand Up @@ -327,15 +327,15 @@ namespace Mode7 {
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix4 = _mm_slli_epi32(mfix, 2);

mc = _mm_shuffle_epi32(mc, shuffle);
mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate2GR_U8;

Expand Down
15 changes: 8 additions & 7 deletions src/Bc7Mode.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
//#define OPTION_PCA
//#define OPTION_COUNTERS
//#define OPTION_LINEAR
//#define OPTION_SLOWPOKE
#define OPTION_SELFCHECK

#if defined(OPTION_LINEAR)
Expand All @@ -21,18 +22,18 @@ enum { kAlpha = 1000, kGreen = 715, kRed = 213, kBlue = 72 };

enum { kColor = kGreen + kRed + kBlue };

#if defined(OPTION_AVX512) && !defined(OPTION_AVX2)
#define OPTION_AVX2
#if defined(OPTION_AVX512) && (!defined(__AVX512F__) || !defined(__AVX512BW__) || !defined(__AVX512VL__) || defined(OPTION_SLOWPOKE))
#error AVX-512 is required
#endif

#if defined(OPTION_AVX2) && !defined(OPTION_FMA) // Except Via Cores
#define OPTION_FMA
#if defined(OPTION_AVX512) && !defined(OPTION_AVX2)
#define OPTION_AVX2
#endif

#if defined(OPTION_AVX2) && !defined(__AVX2__)
#if defined(OPTION_AVX2) && (!defined(__AVX2__) || defined(OPTION_SLOWPOKE))
#error AVX2 is required
#endif

#if !defined(__AVX__)
#error AVX is required
#if defined(OPTION_AVX2) && !defined(OPTION_FMA) // Except Via Cores
#define OPTION_FMA
#endif
2 changes: 1 addition & 1 deletion src/Bc7Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ alignas(16) static constexpr short gTableInterpolate4[16][2] =
};

alignas(32) __m128i gTableInterpolate2_U8[4 >> 1];
alignas(32) __m128i gTableInterpolate3_U8[8 >> 1];
alignas(64) __m128i gTableInterpolate3_U8[8 >> 1];
alignas(32) __m128i gTableInterpolate4_U8[16 >> 1];

alignas(32) __m128i gTableInterpolate2GR_U8[4 >> (2 - 1)];
Expand Down
2 changes: 1 addition & 1 deletion src/Bc7Tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include "pch.h"

alignas(32) extern __m128i gTableInterpolate2_U8[4 >> 1];
alignas(32) extern __m128i gTableInterpolate3_U8[8 >> 1];
alignas(64) extern __m128i gTableInterpolate3_U8[8 >> 1];
alignas(32) extern __m128i gTableInterpolate4_U8[16 >> 1];

alignas(32) extern __m128i gTableInterpolate2GR_U8[4 >> (2 - 1)];
Expand Down
8 changes: 4 additions & 4 deletions src/SnippetComputeOpaqueSubset2.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ static INLINED int ComputeOpaqueSubsetError2(const Area& area, __m128i mc, const
const __m128i mfix = gFixWeightsGRB;

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix4 = _mm_slli_epi32(mfix, 2);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate2_U8;

Expand Down Expand Up @@ -197,15 +197,15 @@ static INLINED int ComputeOpaqueSubsetError2Pair(const Area& area, __m128i mc, c
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix4 = _mm_slli_epi32(mfix, 2);

mc = _mm_shuffle_epi32(mc, shuffle);
mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate2GR_U8;

Expand Down
92 changes: 87 additions & 5 deletions src/SnippetComputeOpaqueSubset3.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,97 @@ static INLINED int ComputeOpaqueSubsetError3(const Area& area, __m128i mc, const
const __m128i mweights = gWeightsGRB;
const __m128i mfix = gFixWeightsGRB;

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
#if defined(OPTION_AVX512)
const __m512i wweights = _mm512_broadcastq_epi64(mweights);

const __m512i whalf = _mm512_set1_epi16(32);
const __m512i wsign = _mm512_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);

mc = _mm_packus_epi16(mc, mc);
__m512i wc = _mm512_broadcastq_epi64(mc);

__m512i wt = *(const __m512i*)gTableInterpolate3_U8;

wt = _mm512_maddubs_epi16(wc, wt);

wt = _mm512_add_epi16(wt, whalf);

wt = _mm512_srli_epi16(wt, 6);

__m512i wtx = _mm512_permutex_epi64(wt, 0x44);
__m512i wty = _mm512_permutex_epi64(wt, 0xEE);

int k = static_cast<int>(area.Count);
const __m256i* p = (const __m256i*)area.DataMask_I16;

while ((k -= 2) >= 0)
{
__m256i vpacked = _mm256_load_si256(p);
__m256i vpixel = _mm256_unpacklo_epi64(vpacked, vpacked);
__m512i wpixel = _mm512_broadcast_i64x4(vpixel);

merrorBlock = _mm_add_epi32(merrorBlock, mfix2);

__m512i wx = _mm512_sub_epi16(wpixel, wtx);
__m512i wy = _mm512_sub_epi16(wpixel, wty);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);

wx = _mm512_xor_epi32(wx, wsign);
wy = _mm512_xor_epi32(wy, wsign);

wx = _mm512_madd_epi16(wx, wweights);
wy = _mm512_madd_epi16(wy, wweights);

wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));
wy = _mm512_add_epi32(wy, _mm512_shuffle_epi32(wy, _MM_SHUFFLE(2, 3, 0, 1)));

wx = _mm512_min_epi32(wx, wy);
__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));

merrorBlock = _mm_add_epi32(merrorBlock, _mm256_castsi256_si128(vx));
merrorBlock = _mm_add_epi32(merrorBlock, _mm256_extracti128_si256(vx, 1));

p++;

if (!(_mm_movemask_epi8(_mm_cmpgt_epi32(mwater, merrorBlock)) & 0xF))
break;
}

if (k & 1)
{
__m128i mpacked = _mm_load_si128((const __m128i*)p);
__m512i wpixel = _mm512_broadcastq_epi64(mpacked);

merrorBlock = _mm_add_epi32(merrorBlock, mfix);

__m512i wx = _mm512_sub_epi16(wpixel, wt);

wx = _mm512_mullo_epi16(wx, wx);

wx = _mm512_xor_epi32(wx, wsign);

wx = _mm512_madd_epi16(wx, wweights);

wx = _mm512_add_epi32(wx, _mm512_shuffle_epi32(wx, _MM_SHUFFLE(2, 3, 0, 1)));

__m256i vx = _mm256_min_epi32(_mm512_extracti64x4_epi64(wx, 1), _mm512_castsi512_si256(wx));
vx = _mm256_min_epi32(vx, _mm256_shuffle_epi32(vx, _MM_SHUFFLE(1, 0, 3, 2)));

merrorBlock = _mm_add_epi32(merrorBlock, _mm_min_epi32(_mm256_extracti128_si256(vx, 1), _mm256_castsi256_si128(vx)));
}
#elif defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix2 = _mm_add_epi32(mfix, mfix);

mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt0 = *(const __m256i*)&gTableInterpolate3_U8[0];
__m256i vt1 = *(const __m256i*)&gTableInterpolate3_U8[2];
Expand Down Expand Up @@ -189,15 +271,15 @@ static INLINED int ComputeOpaqueSubsetError3Pair(const Area& area, __m128i mc, c
__m128i merrorBlock = _mm_setzero_si128();

#if defined(OPTION_AVX2)
const __m256i vweights = _mm256_broadcastsi128_si256(mweights);
const __m256i vweights = _mm256_broadcastq_epi64(mweights);

const __m256i vhalf = _mm256_set1_epi16(32);
const __m256i vsign = _mm256_set1_epi16(-0x8000);
const __m128i mfix4 = _mm_slli_epi32(mfix, 2);

mc = _mm_shuffle_epi32(mc, shuffle);
mc = _mm_packus_epi16(mc, mc);
__m256i vc = _mm256_broadcastsi128_si256(mc);
__m256i vc = _mm256_broadcastq_epi64(mc);

__m256i vt = *(const __m256i*)gTableInterpolate3GR_U8;

Expand Down
16 changes: 8 additions & 8 deletions src/SnippetLevelsBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ class LevelsBuffer final

static INLINED void Estimate32Short(NodeShort*& nodesPtr, const uint8_t* values[16], const size_t count, const int c, const __m128i mtop) noexcept
{
const __m512i wwater = _mm512_maskz_broadcastw_epi16(kFullMask32, mtop);
const __m512i wwater = _mm512_broadcastw_epi16(mtop);

__m512i wsum = _mm512_setzero_si512();
uint32_t flags = ~0ui32;
Expand All @@ -491,21 +491,21 @@ class LevelsBuffer final

__m256i vdelta = _mm256_load_si256(p);

__m512i wadd = _mm512_maskz_cvtepu8_epi16(kFullMask32, vdelta);
__m512i wadd = _mm512_cvtepu8_epi16(vdelta);

wadd = _mm512_maskz_mullo_epi16(kFullMask32, wadd, wadd);
wadd = _mm512_mullo_epi16(wadd, wadd);

wsum = _mm512_maskz_adds_epu16(kFullMask32, wsum, wadd);
wsum = _mm512_adds_epu16(wsum, wadd);

flags = _mm512_mask_cmp_epu16_mask(kFullMask32, wwater, wsum, _MM_CMPINT_GT);
flags = _mm512_cmp_epu16_mask(wwater, wsum, _MM_CMPINT_GT);
if (!flags)
return;
}

Store8N(nodesPtr, _mm512_castsi512_si128(wsum), flags, c);
Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 1), flags >> 8, c + 8);
Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 2), flags >> 16, c + 16);
Store8N(nodesPtr, _mm512_maskz_extracti32x4_epi32(kFullMask8, wsum, 3), flags >> 24, c + 24);
Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 1), flags >> 8, c + 8);
Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 2), flags >> 16, c + 16);
Store8N(nodesPtr, _mm512_extracti32x4_epi32(wsum, 3), flags >> 24, c + 24);
}

static INLINED void Estimate16Short(NodeShort*& nodesPtr, const uint8_t* values[16], const size_t count, const int c, const __m128i mtop) noexcept
Expand Down
Loading

0 comments on commit 9a32800

Please sign in to comment.