Skip to content

Commit

Permalink
Unroll hot loops
Browse files Browse the repository at this point in the history
  • Loading branch information
Андрей Евстюхин committed May 10, 2020
1 parent 5d8f084 commit 4a00ab5
Show file tree
Hide file tree
Showing 21 changed files with 1,135 additions and 247 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Recompressing "BC7Ltest.png" (gained from https://code.google.com/archive/p/nvid
Bc7Compress.exe /slow /nomask /noflip BC7Ltest.png output.ktx /debug output.png
Loaded BC7Ltest.png
Image 152x152, Texture 152x152
Compressed 1444 blocks, elapsed 36 ms, throughput 0.641 Mpx/s
Compressed 1444 blocks, elapsed 26 ms, throughput 0.888 Mpx/s
SubTexture A MSE = 0.0, PSNR = 73.986163, SSIM_4x4 = 0.99999923
SubTexture RGB wMSE = 0.0, wPSNR = 62.172358, wSSIM_4x4 = 0.99999238
Saved output.ktx
Expand All @@ -32,7 +32,7 @@ Compressing "frymire.png" (gained from https://github.com/castano/nvidia-texture
Bc7Compress.exe /nomask /noflip frymire.png frymire.ktx
Loaded frymire.png
Image 1118x1105, Texture 1120x1108
Compressed 77560 blocks, elapsed 498 ms, throughput 2.491 Mpx/s
Compressed 77560 blocks, elapsed 449 ms, throughput 2.763 Mpx/s
Exactly A
SubTexture RGB wMSE = 0.2, wPSNR = 55.181449, wSSIM_4x4 = 0.99980677
Saved frymire.ktx
Expand All @@ -42,7 +42,7 @@ Compressing "frymire.png" in development mode:
Bc7Compress.exe /draft /nomask /noflip frymire.png frymire.ktx
Loaded frymire.png
Image 1118x1105, Texture 1120x1108
Compressed 77560 blocks, elapsed 188 ms, throughput 6.600 Mpx/s
Compressed 77560 blocks, elapsed 141 ms, throughput 8.801 Mpx/s
Exactly A
SubTexture RGB wMSE = 0.4, wPSNR = 52.056761, wSSIM_4x4 = 0.99952034
Saved frymire.ktx
Expand All @@ -52,7 +52,7 @@ Compressing "8192.png" (gained from https://bitbucket.org/wolfpld/etcpak/downloa
Bc7Compress.exe /draft /nomask /noflip 8192.png 8192.ktx
Loaded 8192.png
Image 8192x8192, Texture 8192x8192
Compressed 4194304 blocks, elapsed 16770 ms, throughput 4.001 Mpx/s
Compressed 4194304 blocks, elapsed 12377 ms, throughput 5.422 Mpx/s
Exactly A
SubTexture RGB wMSE = 0.4, wPSNR = 52.364416, wSSIM_4x4 = 0.99625929
Saved 8192.ktx
Expand Down
2 changes: 1 addition & 1 deletion src/Bc7Compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <string>
#include <vector>

static INLINED int Max(int x, int y) noexcept
// Returns the larger of the two arguments; when they compare equal, y is returned.
static ALWAYS_INLINED int Max(int x, int y) noexcept
{
    if (x > y)
    {
        return x;
    }

    return y;
}
Expand Down
258 changes: 165 additions & 93 deletions src/Bc7Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,65 +38,6 @@ static INLINED int ComputeOpaqueAlphaError(const Area& area) noexcept
return error;
}

// Computes three pairwise cross terms over the first area.Count entries of
// area.DataMask_I16. Each entry's low four 16-bit values are widened to 32-bit
// lanes p0..p3, and with N = area.Count the outputs are
//   covGR = N * sum(p1 * p2) - sum(p1) * sum(p2)
//   covRB = N * sum(p2 * p3) - sum(p2) * sum(p3)
//   covBG = N * sum(p3 * p1) - sum(p3) * sum(p1)
// i.e. N^2 times the respective covariance. Lane 0 is accumulated but not reported.
// NOTE(review): going by the output names, lanes 1/2/3 look like G/R/B - confirm
// against the Area data layout.
static INLINED void ComputeColorCovariances(const Area& area,
    int64_t& covGR, int64_t& covRB, int64_t& covBG) noexcept
{
    const size_t pixelCount = area.Count;

    __m128i channelSums = _mm_setzero_si128();
    __m128i pairSums = _mm_setzero_si128();

    for (size_t n = 0; n != pixelCount; ++n)
    {
        const __m128i pixel = _mm_cvtepu16_epi32(_mm_load_si128(&area.DataMask_I16[n]));

        // Rotate lanes 1..3 so that every lane meets its partner: (p0, p2, p3, p1).
        const __m128i partner = _mm_shuffle_epi32(pixel, _MM_SHUFFLE(1, 3, 2, 0));

        // The 16-bit multiply keeps only the low 16 bits of each product, which is
        // exact only while both factors are at most 255.
        // Assumes 8-bit channel values - TODO confirm.
        pairSums = _mm_add_epi32(pairSums, _mm_mullo_epi16(pixel, partner));

        // The upper half of every 32-bit lane is zero, so the 16-bit add matches a
        // 32-bit add as long as a channel sum stays below 65536.
        channelSums = _mm_add_epi16(channelSums, pixel);
    }

    // Broadcast the low 32 bits of the count to all four lanes.
    const __m128i countLanes = _mm_shuffle_epi32(_mm_cvtsi64_si128(static_cast<int64_t>(pixelCount)), 0);
    const __m128i partnerSums = _mm_shuffle_epi32(channelSums, _MM_SHUFFLE(1, 3, 2, 0));

    const __m128i scaledPairs = _mm_mullo_epi32(pairSums, countLanes);
    const __m128i sumProducts = _mm_mullo_epi32(channelSums, partnerSums);
    const __m128i covariances = _mm_sub_epi32(scaledPairs, sumProducts);

    covGR = _mm_extract_epi32(covariances, 1);
    covRB = _mm_extract_epi32(covariances, 2);
    covBG = _mm_extract_epi32(covariances, 3);
}

// Computes six pairwise cross terms over the first area.Active entries of
// area.DataMask_I16. Each entry's low four 16-bit values are widened to 32-bit
// lanes p0..p3, and with N = area.Active every output has the form
//   N * sum(a * b) - sum(a) * sum(b)
// (N^2 times the covariance) for these lane pairs:
//   covAG: p0,p1   covAR: p0,p2   covAB: p0,p3
//   covGR: p1,p2   covRB: p2,p3   covBG: p3,p1
// NOTE(review): going by the output names, lanes 0/1/2/3 look like A/G/R/B -
// confirm against the Area data layout.
static INLINED void ComputeAlphaColorCovariances(const Area& area,
    int64_t& covAG, int64_t& covAR, int64_t& covAB, int64_t& covGR, int64_t& covRB, int64_t& covBG) noexcept
{
    const size_t pixelCount = area.Active;

    __m128i channelSums = _mm_setzero_si128();
    __m128i lane0PairSums = _mm_setzero_si128();
    __m128i colorPairSums = _mm_setzero_si128();

    for (size_t n = 0; n != pixelCount; ++n)
    {
        const __m128i pixel = _mm_cvtepu16_epi32(_mm_load_si128(&area.DataMask_I16[n]));

        // Lane 0 broadcast to every lane, and lanes 1..3 rotated to (p0, p2, p3, p1).
        const __m128i lane0 = _mm_shuffle_epi32(pixel, 0);
        const __m128i partner = _mm_shuffle_epi32(pixel, _MM_SHUFFLE(1, 3, 2, 0));

        // The 16-bit multiplies keep only the low 16 bits of each product, which is
        // exact only while both factors are at most 255.
        // Assumes 8-bit channel values - TODO confirm.
        colorPairSums = _mm_add_epi32(colorPairSums, _mm_mullo_epi16(pixel, partner));
        lane0PairSums = _mm_add_epi32(lane0PairSums, _mm_mullo_epi16(pixel, lane0));

        // The upper half of every 32-bit lane is zero, so the 16-bit add matches a
        // 32-bit add as long as a channel sum stays below 65536.
        channelSums = _mm_add_epi16(channelSums, pixel);
    }

    // Broadcast the low 32 bits of the count to all four lanes.
    const __m128i countLanes = _mm_shuffle_epi32(_mm_cvtsi64_si128(static_cast<int64_t>(pixelCount)), 0);

    const __m128i lane0Sum = _mm_shuffle_epi32(channelSums, 0);
    const __m128i partnerSums = _mm_shuffle_epi32(channelSums, _MM_SHUFFLE(1, 3, 2, 0));

    const __m128i lane0Cov = _mm_sub_epi32(
        _mm_mullo_epi32(lane0PairSums, countLanes),
        _mm_mullo_epi32(channelSums, lane0Sum));

    const __m128i colorCov = _mm_sub_epi32(
        _mm_mullo_epi32(colorPairSums, countLanes),
        _mm_mullo_epi32(channelSums, partnerSums));

    covAG = _mm_extract_epi32(lane0Cov, 1);
    covAR = _mm_extract_epi32(lane0Cov, 2);
    covAB = _mm_extract_epi32(lane0Cov, 3);

    covGR = _mm_extract_epi32(colorCov, 1);
    covRB = _mm_extract_epi32(colorCov, 2);
    covBG = _mm_extract_epi32(colorCov, 3);
}

static INLINED void MakeCell(Cell& input) noexcept
{
input.BestColor0 = _mm_setzero_si128();
Expand All @@ -105,16 +46,30 @@ static INLINED void MakeCell(Cell& input) noexcept
//input.BestParameter = 0;
//input.BestMode = 8;

int flags = 0;
int flags_mask = 1;

__m128i m0 = _mm_set1_epi16(255);

for (size_t i = 0; i < 16; i++)
{
__m128i mc = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(((int*)input.ImageRows_U8)[i]));
__m128i mmask = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(((int*)input.MaskRows_S8)[i]));

mc = _mm_and_si128(mc, mmask);

flags |= _mm_extract_epi16(mmask, 1) & flags_mask;
flags_mask <<= 1;

m0 = _mm_min_epi16(m0, mc);

input.DataMask_I16[i] = _mm_unpacklo_epi64(mc, mmask);
}

input.VisibleFlags = flags;

input.IsOpaque = (_mm_extract_epi16(m0, 0) == 255);

for (size_t partitionIndex = 0; partitionIndex < 64; partitionIndex++)
{
input.LazyArea12[partitionIndex] = true;
Expand All @@ -138,14 +93,114 @@ static INLINED void MakeCell(Cell& input) noexcept
}
}

static NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size_t count, uint64_t indices) noexcept
NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size_t count, uint64_t indices) noexcept
{
// Initialize Indices

area.Count = static_cast<uint32_t>(count);

area.ZeroIndex = static_cast<uint8_t>(indices & 0xF);

size_t active;

__m128i m0 = _mm_set1_epi16(255);
__m128i m1 = _mm_setzero_si128();

const size_t flags = (uint32_t)cell.VisibleFlags;
if (flags == 0xFFFF)
{
__m128i mi = _mm_cvtsi64_si128((int64_t)indices);

mi = _mm_unpacklo_epi8(mi, _mm_srli_epi16(mi, 4));

mi = _mm_and_si128(mi, _mm_set1_epi8(0xF));

_mm_store_si128((__m128i*)area.Indices, mi);

active = count;

if (cell.IsOpaque)
{
area.Active = static_cast<uint32_t>(active);

// Min & Max + ComputeColorCovariances

__m128i msum = _mm_setzero_si128();
__m128i msum2 = _mm_setzero_si128();

for (size_t i = 0; i < active; i++)
{
size_t index = area.Indices[i];

__m128i mpacked = _mm_load_si128(&cell.DataMask_I16[index]);

m0 = _mm_min_epi16(m0, mpacked);
m1 = _mm_max_epi16(m1, mpacked);

__m128i mpixel = _mm_cvtepu16_epi32(mpacked);

msum = _mm_add_epi16(msum, mpixel);
msum2 = _mm_add_epi32(msum2, _mm_mullo_epi16(mpixel, _mm_shuffle_epi32(mpixel, _MM_SHUFFLE(1, 3, 2, 0))));

area.DataMask_I16[i] = mpacked;
}

__m128i mbounds = _mm_unpacklo_epi16(m0, m1);

_mm_store_si128(&area.MinMax_U16, mbounds);

// Flags

area.IsOpaque = true;

area.BestPca3 = -1;

// Orient channels

__m128i mactive = _mm_shuffle_epi32(_mm_cvtsi64_si128(static_cast<int64_t>(active)), 0);

msum2 = _mm_sub_epi32(_mm_mullo_epi32(msum2, mactive), _mm_mullo_epi32(msum, _mm_shuffle_epi32(msum, _MM_SHUFFLE(1, 3, 2, 0))));

int64_t covGR = _mm_extract_epi32(msum2, 1);
int64_t covRB = _mm_extract_epi32(msum2, 2);
int64_t covBG = _mm_extract_epi32(msum2, 3);

for (;;)
{
bool changes = false;

int64_t b = covBG * kGreen + covRB * kRed;
if (b < 0)
{
mbounds = _mm_shufflehi_epi16(mbounds, _MM_SHUFFLE(2, 3, 1, 0));

covBG = -covBG;
covRB = -covRB;

changes = true;
}

int64_t r = covGR * kGreen + covRB * kBlue;
if (r < 0)
{
mbounds = _mm_shufflehi_epi16(mbounds, _MM_SHUFFLE(3, 2, 0, 1));

covGR = -covGR;
covRB = -covRB;

changes = true;
}

if (!changes)
break;
}

_mm_store_si128(&area.Bounds_U16, mbounds);

return;
}
}
else
{
uint8_t TransparentIndices[16];

Expand All @@ -157,39 +212,55 @@ static NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size
const size_t index = indices & 0xF;
indices >>= 4;

size_t opaque = ((const uint16_t*)&cell.DataMask_I16[index])[4 + 1] & 1;
*pOpaque = static_cast<uint8_t>(index);
*pTransparent = static_cast<uint8_t>(index);

*(opaque ? pOpaque : pTransparent) = static_cast<uint8_t>(index);
size_t opaque = (flags >> index) & 1;

pTransparent++;
pOpaque += opaque;
pTransparent -= opaque;
pTransparent += opaque ^ 1;
}

area.Active = static_cast<uint32_t>(pOpaque - area.Indices);
active = pOpaque - area.Indices;

uint8_t* p = TransparentIndices;
while (p != pTransparent)
if (active < count)
{
*pOpaque++ = *p++;
m0 = _mm_insert_epi16(m0, 0, 0);

const __m128i mempty = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, 0);

__m128i* pData = &area.DataMask_I16[active];
uint8_t* pIndex = TransparentIndices;
do
{
*pOpaque++ = *pIndex++;
*pData++ = mempty;
} while (pIndex != pTransparent);
}
}

// Min & Max
area.Active = static_cast<uint32_t>(active);

__m128i m0 = _mm_set1_epi16(255);
__m128i m1 = _mm_setzero_si128();
// Min & Max + ComputeAlphaColorCovariances

for (size_t i = 0; i < count; i++)
__m128i msum = _mm_setzero_si128();
__m128i msum2 = _mm_setzero_si128();
__m128i msumA = _mm_setzero_si128();

for (size_t i = 0; i < active; i++)
{
size_t index = area.Indices[i];

__m128i mpacked = _mm_load_si128(&cell.DataMask_I16[index]);
__m128i mc = _mm_unpacklo_epi64(mpacked, mpacked);
__m128i mmask = _mm_unpackhi_epi64(mpacked, mpacked);

m0 = _mm_blendv_epi8(m0, _mm_min_epi16(mc, m0), mmask);
m1 = _mm_max_epi16(m1, mc);
m0 = _mm_min_epi16(m0, mpacked);
m1 = _mm_max_epi16(m1, mpacked);

__m128i mpixel = _mm_cvtepu16_epi32(mpacked);

msum = _mm_add_epi16(msum, mpixel);
msum2 = _mm_add_epi32(msum2, _mm_mullo_epi16(mpixel, _mm_shuffle_epi32(mpixel, _MM_SHUFFLE(1, 3, 2, 0))));
msumA = _mm_add_epi32(msumA, _mm_mullo_epi16(mpixel, _mm_shuffle_epi32(mpixel, 0)));

area.DataMask_I16[i] = mpacked;
}
Expand All @@ -202,16 +273,21 @@ static NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size

// Flags

area.IsOpaque = ((_mm_extract_epi16(mbounds, 0) & _mm_extract_epi16(mbounds, 1)) == 255);
area.IsOpaque = (_mm_extract_epi16(mbounds, 0) == 255);

area.BestPca3 = -1;

// Orient channels

__m128i mactive = _mm_shuffle_epi32(_mm_cvtsi64_si128(static_cast<int64_t>(active)), 0);

if (area.IsOpaque)
{
int64_t covGR = 0, covRB = 0, covBG = 0;
ComputeColorCovariances(area, covGR, covRB, covBG);
msum2 = _mm_sub_epi32(_mm_mullo_epi32(msum2, mactive), _mm_mullo_epi32(msum, _mm_shuffle_epi32(msum, _MM_SHUFFLE(1, 3, 2, 0))));

int64_t covGR = _mm_extract_epi32(msum2, 1);
int64_t covRB = _mm_extract_epi32(msum2, 2);
int64_t covBG = _mm_extract_epi32(msum2, 3);

for (;;)
{
Expand Down Expand Up @@ -245,8 +321,16 @@ static NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size
}
else
{
int64_t covAG = 0, covAR = 0, covAB = 0, covGR = 0, covRB = 0, covBG = 0;
ComputeAlphaColorCovariances(area, covAG, covAR, covAB, covGR, covRB, covBG);
msum2 = _mm_sub_epi32(_mm_mullo_epi32(msum2, mactive), _mm_mullo_epi32(msum, _mm_shuffle_epi32(msum, _MM_SHUFFLE(1, 3, 2, 0))));
msumA = _mm_sub_epi32(_mm_mullo_epi32(msumA, mactive), _mm_mullo_epi32(msum, _mm_shuffle_epi32(msum, 0)));

int64_t covGR = _mm_extract_epi32(msum2, 1);
int64_t covRB = _mm_extract_epi32(msum2, 2);
int64_t covBG = _mm_extract_epi32(msum2, 3);

int64_t covAG = _mm_extract_epi32(msumA, 1);
int64_t covAR = _mm_extract_epi32(msumA, 2);
int64_t covAB = _mm_extract_epi32(msumA, 3);

for (;;)
{
Expand Down Expand Up @@ -296,18 +380,6 @@ static NOTINLINED void MakeAreaFromCell(Area& area, const Cell& cell, const size
_mm_store_si128(&area.Bounds_U16, mbounds);
}

// Returns `area`, building it from `cell` on first use.
//
// While `lazy` is true the area has not been built yet: the flag is cleared and
// MakeAreaFromCell is called with the low 4 bits of `indices` as the count and
// the remaining bits (shifted down by 4) as the index payload. Once `lazy` is
// false, `area` is returned untouched.
Area& GetArea(Area& area, bool& lazy, const Cell& cell, const uint64_t indices) noexcept
{
    if (!lazy)
    {
        return area;
    }

    // Clear the flag before building, matching the original statement order.
    lazy = false;

    const size_t count = static_cast<size_t>(indices & 0xF);
    const uint64_t payload = indices >> 4;

    MakeAreaFromCell(area, cell, count, payload);

    return area;
}

int AreaGetBestPca3(Area& area) noexcept
{
#if defined(OPTION_PCA)
Expand Down
Loading

0 comments on commit 4a00ab5

Please sign in to comment.