Skip to content

Commit

Permalink
Revive precise compression in an unusual way
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Evstyukhin committed Jan 9, 2021
1 parent de61627 commit 6b46835
Show file tree
Hide file tree
Showing 14 changed files with 661 additions and 164 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ The solution was tested on SSSE3, SSE4.1, AVX, AVX2, AVX-512BW - capable CPUs fo

I would recommend using AVX2 or AVX-512 for the best performance. See Bc7Mode.h about settings.

## Customization

The kDenoise and kDenoiseStep constants in Bc7Mode.h define RDO capabilities:

1. The most performant and compact mode is defined by kDenoise = 1, kDenoiseStep = 3 \* 3.
2. A generally precise but much slower mode is defined by kDenoise = 0, kDenoiseStep = 0.
3. For Moon shots in particular (black background with low light amplitude), I recommend kDenoise = 0, kDenoiseStep = 3 \* 3.

The constants kAlpha, kGreen, kRed, kBlue set weights for channels and depend on the nature of the data.

## Example

Recompressing "BC7Ltest.png" (obtained from https://code.google.com/archive/p/nvidia-texture-tools/downloads bc7_export.zip) on i7-6700 CPU:
Expand Down
82 changes: 68 additions & 14 deletions src/Bc7Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@ static INLINED int ComputeOpaqueAlphaError(const Area& area) noexcept
{
int da = *(const short*)&area.DataMask_I16[i] ^ 255;

da >>= kDenoise;
if constexpr (!kDenoise)
{
da = (da > 0x7F) ? 0x7F : da;
}
else
{
da >>= kDenoise;
}

error += da * da;
}
Expand Down Expand Up @@ -729,8 +736,14 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
__m512i wy = _mm512_abs_epi16(wx);
__m512i ww = _mm512_abs_epi16(wz);

wy = _mm512_srli_epi16(wy, kDenoise);
ww = _mm512_srli_epi16(ww, kDenoise);
if constexpr (!kDenoise)
{
wy = _mm512_adds_epu8(wy, wy);
ww = _mm512_adds_epu8(ww, ww);
}

wy = _mm512_srli_epi16(wy, kDenoiseShift);
ww = _mm512_srli_epi16(ww, kDenoiseShift);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);
Expand Down Expand Up @@ -767,7 +780,12 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

__m512i wy = _mm512_abs_epi16(wx);

wy = _mm512_srli_epi16(wy, kDenoise);
if constexpr (!kDenoise)
{
wy = _mm512_adds_epu8(wy, wy);
}

wy = _mm512_srli_epi16(wy, kDenoiseShift);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);
Expand All @@ -794,7 +812,12 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

__m256i vy = _mm256_abs_epi16(vx);

vy = _mm256_srli_epi16(vy, kDenoise);
if constexpr (!kDenoise)
{
vy = _mm256_adds_epu8(vy, vy);
}

vy = _mm256_srli_epi16(vy, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -883,10 +906,18 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
__m256i vyy = _mm256_abs_epi16(vxx);
__m256i vww = _mm256_abs_epi16(vzz);

vy = _mm256_srli_epi16(vy, kDenoise);
vw = _mm256_srli_epi16(vw, kDenoise);
vyy = _mm256_srli_epi16(vyy, kDenoise);
vww = _mm256_srli_epi16(vww, kDenoise);
if constexpr (!kDenoise)
{
vy = _mm256_adds_epu8(vy, vy);
vw = _mm256_adds_epu8(vw, vw);
vyy = _mm256_adds_epu8(vyy, vyy);
vww = _mm256_adds_epu8(vww, vww);
}

vy = _mm256_srli_epi16(vy, kDenoiseShift);
vw = _mm256_srli_epi16(vw, kDenoiseShift);
vyy = _mm256_srli_epi16(vyy, kDenoiseShift);
vww = _mm256_srli_epi16(vww, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -946,8 +977,14 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
__m256i vy = _mm256_abs_epi16(vx);
__m256i vw = _mm256_abs_epi16(vz);

vy = _mm256_srli_epi16(vy, kDenoise);
vw = _mm256_srli_epi16(vw, kDenoise);
if constexpr (!kDenoise)
{
vy = _mm256_adds_epu8(vy, vy);
vw = _mm256_adds_epu8(vw, vw);
}

vy = _mm256_srli_epi16(vy, kDenoiseShift);
vw = _mm256_srli_epi16(vw, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -984,7 +1021,12 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

__m256i vy = _mm256_abs_epi16(vx);

vy = _mm256_srli_epi16(vy, kDenoise);
if constexpr (!kDenoise)
{
vy = _mm256_adds_epu8(vy, vy);
}

vy = _mm256_srli_epi16(vy, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -1075,7 +1117,12 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

__m128i my = _mm_abs_epi16(mx);

my = _mm_srli_epi16(my, kDenoise);
if constexpr (!kDenoise)
{
my = _mm_adds_epu8(my, my);
}

my = _mm_srli_epi16(my, kDenoiseShift);

mx = _mm_mullo_epi16(mx, mx);
my = _mm_mullo_epi16(my, my);
Expand Down Expand Up @@ -1192,7 +1239,14 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

const int index = (bottom != alpha0) ? M - 1 : 0;

bottom >>= kDenoise;
if constexpr (!kDenoise)
{
bottom = (bottom > 0x7F) ? 0x7F : bottom;
}
else
{
bottom >>= kDenoise;
}

errorBlock += bottom * bottom * _mm_extract_epi16(mweights, 0) * int(area.Count - area.Active);

Expand Down
150 changes: 120 additions & 30 deletions src/Bc7CoreMode4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,18 @@ namespace Mode4 {
wz = _mm512_abs_epi16(wz);
ww = _mm512_abs_epi16(ww);

wx = _mm512_srli_epi16(wx, kDenoise);
wy = _mm512_srli_epi16(wy, kDenoise);
wz = _mm512_srli_epi16(wz, kDenoise);
ww = _mm512_srli_epi16(ww, kDenoise);
if constexpr (!kDenoise)
{
wx = _mm512_adds_epu8(wx, wx);
wy = _mm512_adds_epu8(wy, wy);
wz = _mm512_adds_epu8(wz, wz);
ww = _mm512_adds_epu8(ww, ww);
}

wx = _mm512_srli_epi16(wx, kDenoiseShift);
wy = _mm512_srli_epi16(wy, kDenoiseShift);
wz = _mm512_srli_epi16(wz, kDenoiseShift);
ww = _mm512_srli_epi16(ww, kDenoiseShift);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);
Expand Down Expand Up @@ -323,10 +331,18 @@ namespace Mode4 {
vz = _mm256_abs_epi16(vz);
vw = _mm256_abs_epi16(vw);

vx = _mm256_srli_epi16(vx, kDenoise);
vy = _mm256_srli_epi16(vy, kDenoise);
vz = _mm256_srli_epi16(vz, kDenoise);
vw = _mm256_srli_epi16(vw, kDenoise);
if constexpr (!kDenoise)
{
vx = _mm256_adds_epu8(vx, vx);
vy = _mm256_adds_epu8(vy, vy);
vz = _mm256_adds_epu8(vz, vz);
vw = _mm256_adds_epu8(vw, vw);
}

vx = _mm256_srli_epi16(vx, kDenoiseShift);
vy = _mm256_srli_epi16(vy, kDenoiseShift);
vz = _mm256_srli_epi16(vz, kDenoiseShift);
vw = _mm256_srli_epi16(vw, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -410,10 +426,18 @@ namespace Mode4 {
mz = _mm_abs_epi16(mz);
mw = _mm_abs_epi16(mw);

mx = _mm_srli_epi16(mx, kDenoise);
my = _mm_srli_epi16(my, kDenoise);
mz = _mm_srli_epi16(mz, kDenoise);
mw = _mm_srli_epi16(mw, kDenoise);
if constexpr (!kDenoise)
{
mx = _mm_adds_epu8(mx, mx);
my = _mm_adds_epu8(my, my);
mz = _mm_adds_epu8(mz, mz);
mw = _mm_adds_epu8(mw, mw);
}

mx = _mm_srli_epi16(mx, kDenoiseShift);
my = _mm_srli_epi16(my, kDenoiseShift);
mz = _mm_srli_epi16(mz, kDenoiseShift);
mw = _mm_srli_epi16(mw, kDenoiseShift);

mx = _mm_mullo_epi16(mx, mx);
my = _mm_mullo_epi16(my, my);
Expand Down Expand Up @@ -478,7 +502,12 @@ namespace Mode4 {

mweight = _mm_mullo_epi16(mweight, _mm_cvtsi32_si128(static_cast<int>(16 - i)));

mx = _mm_srli_epi16(mx, kDenoise);
if constexpr (!kDenoise)
{
mx = _mm_adds_epu8(mx, mx);
}

mx = _mm_srli_epi16(mx, kDenoiseShift);

mx = _mm_mullo_epi16(mx, mx);

Expand Down Expand Up @@ -544,10 +573,18 @@ namespace Mode4 {
wz = _mm512_abs_epi16(wz);
ww = _mm512_abs_epi16(ww);

wx = _mm512_srli_epi16(wx, kDenoise);
wy = _mm512_srli_epi16(wy, kDenoise);
wz = _mm512_srli_epi16(wz, kDenoise);
ww = _mm512_srli_epi16(ww, kDenoise);
if constexpr (!kDenoise)
{
wx = _mm512_adds_epu8(wx, wx);
wy = _mm512_adds_epu8(wy, wy);
wz = _mm512_adds_epu8(wz, wz);
ww = _mm512_adds_epu8(ww, ww);
}

wx = _mm512_srli_epi16(wx, kDenoiseShift);
wy = _mm512_srli_epi16(wy, kDenoiseShift);
wz = _mm512_srli_epi16(wz, kDenoiseShift);
ww = _mm512_srli_epi16(ww, kDenoiseShift);

wx = _mm512_mullo_epi16(wx, wx);
wy = _mm512_mullo_epi16(wy, wy);
Expand Down Expand Up @@ -636,10 +673,18 @@ namespace Mode4 {
vz = _mm256_abs_epi16(vz);
vw = _mm256_abs_epi16(vw);

vx = _mm256_srli_epi16(vx, kDenoise);
vy = _mm256_srli_epi16(vy, kDenoise);
vz = _mm256_srli_epi16(vz, kDenoise);
vw = _mm256_srli_epi16(vw, kDenoise);
if constexpr (!kDenoise)
{
vx = _mm256_adds_epu8(vx, vx);
vy = _mm256_adds_epu8(vy, vy);
vz = _mm256_adds_epu8(vz, vz);
vw = _mm256_adds_epu8(vw, vw);
}

vx = _mm256_srli_epi16(vx, kDenoiseShift);
vy = _mm256_srli_epi16(vy, kDenoiseShift);
vz = _mm256_srli_epi16(vz, kDenoiseShift);
vw = _mm256_srli_epi16(vw, kDenoiseShift);

vx = _mm256_mullo_epi16(vx, vx);
vy = _mm256_mullo_epi16(vy, vy);
Expand Down Expand Up @@ -727,10 +772,18 @@ namespace Mode4 {
mz = _mm_abs_epi16(mz);
mw = _mm_abs_epi16(mw);

mx = _mm_srli_epi16(mx, kDenoise);
my = _mm_srli_epi16(my, kDenoise);
mz = _mm_srli_epi16(mz, kDenoise);
mw = _mm_srli_epi16(mw, kDenoise);
if constexpr (!kDenoise)
{
mx = _mm_adds_epu8(mx, mx);
my = _mm_adds_epu8(my, my);
mz = _mm_adds_epu8(mz, mz);
mw = _mm_adds_epu8(mw, mw);
}

mx = _mm_srli_epi16(mx, kDenoiseShift);
my = _mm_srli_epi16(my, kDenoiseShift);
mz = _mm_srli_epi16(mz, kDenoiseShift);
mw = _mm_srli_epi16(mw, kDenoiseShift);

mx = _mm_mullo_epi16(mx, mx);
my = _mm_mullo_epi16(my, my);
Expand Down Expand Up @@ -799,7 +852,12 @@ namespace Mode4 {

mweight = _mm_mullo_epi16(mweight, _mm_cvtsi32_si128(static_cast<int>(16 - i)));

mx = _mm_srli_epi16(mx, kDenoise);
if constexpr (!kDenoise)
{
mx = _mm_adds_epu8(mx, mx);
}

mx = _mm_srli_epi16(mx, kDenoiseShift);

mx = _mm_mullo_epi16(mx, mx);

Expand Down Expand Up @@ -858,7 +916,15 @@ namespace Mode4 {
{
int da = *(const uint16_t*)&state3.Values_I16[state3.Best[i]] - *(const uint16_t*)&area.DataMask_I16[i];

da = (da < 0 ? -da : da) >> kDenoise;
da = (da < 0) ? -da : da;
if constexpr (!kDenoise)
{
da = (da > 0x7F) ? 0x7F : da;
}
else
{
da >>= kDenoise;
}

errorAlpha += da * da;
}
Expand Down Expand Up @@ -909,7 +975,15 @@ namespace Mode4 {
{
int da = *(const uint16_t*)&state1.Values_I16[state1.Best[i]] - *(const uint16_t*)&area.DataMask_I16[i];

da = (da < 0 ? -da : da) >> kDenoise;
da = (da < 0) ? -da : da;
if constexpr (!kDenoise)
{
da = (da > 0x7F) ? 0x7F : da;
}
else
{
da >>= kDenoise;
}

errorAlpha += da * da;
}
Expand Down Expand Up @@ -975,7 +1049,15 @@ namespace Mode4 {
{
int da = *(const uint16_t*)&state3.Values_I16[state3.Best[i]] - *(const uint16_t*)&area.DataMask_I16[i];

da = (da < 0 ? -da : da) >> kDenoise;
da = (da < 0) ? -da : da;
if constexpr (!kDenoise)
{
da = (da > 0x7F) ? 0x7F : da;
}
else
{
da >>= kDenoise;
}

errorAlpha += da * da;
}
Expand Down Expand Up @@ -1016,7 +1098,15 @@ namespace Mode4 {
{
int da = *(const uint16_t*)&state1.Values_I16[state1.Best[i]] - *(const uint16_t*)&area.DataMask_I16[i];

da = (da < 0 ? -da : da) >> kDenoise;
da = (da < 0) ? -da : da;
if constexpr (!kDenoise)
{
da = (da > 0x7F) ? 0x7F : da;
}
else
{
da >>= kDenoise;
}

errorAlpha += da * da;
}
Expand Down
Loading

0 comments on commit 6b46835

Please sign in to comment.