Skip to content

Commit

Permalink
Simplify and update timings
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Evstyukhin committed Nov 18, 2022
1 parent 282b2da commit 6633207
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 14 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Let's try the soup from https://cbloomrants.blogspot.com/2020/06/oodle-texture-s

Bc7Compress.exe /nomask /noflip mysoup1024.png mysoup1024-stepping.ktx /debug mysoup1024-stepping.png
Image 1024x1024, Texture 1024x1024
Compressed 65536 blocks, elapsed 380 ms, throughput 2.759 Mpx/s
Compressed 65536 blocks, elapsed 345 ms, throughput 3.039 Mpx/s
Whole A
SubTexture RGB qMSE = 1.6, qPSNR = 46.097026, wSSIM_4x4 = 0.99034363

Expand All @@ -78,7 +78,7 @@ Let's try the soup from https://cbloomrants.blogspot.com/2020/06/oodle-texture-s

Bc7Compress.exe /nomask /noflip mysoup1024.png mysoup1024-mode6index2.ktx /debug mysoup1024-mode6index2.png
Image 1024x1024, Texture 1024x1024
Compressed 65536 blocks, elapsed 386 ms, throughput 2.716 Mpx/s
Compressed 65536 blocks, elapsed 350 ms, throughput 2.995 Mpx/s
Whole A
SubTexture RGB qMSE = 2.1, qPSNR = 44.949510, wSSIM_4x4 = 0.98476649

Expand All @@ -91,7 +91,7 @@ Then check all modes with BC7Ltest.png from https://code.google.com/archive/p/nv

Bc7Compress.exe /nomask /noflip BC7Ltest.png BC7Ltest.ktx /debug output.png
Image 152x152, Texture 152x152
Compressed 1444 blocks, elapsed 24 ms, throughput 0.962 Mpx/s
Compressed 1444 blocks, elapsed 17 ms, throughput 1.359 Mpx/s
SubTexture A qMSE = 0.0, qPSNR = 69.026097, SSIM_4x4 = 0.99997865
SubTexture RGB qMSE = 0.6, qPSNR = 50.687018, wSSIM_4x4 = 0.99964438

Expand All @@ -102,7 +102,7 @@ And finally compress an interesting image https://github.com/castano/image-datas

Bc7Compress.exe /nomask /noflip frymire.png frymire.ktx /debug output.png
Image 1118x1105, Texture 1120x1108
Compressed 77560 blocks, elapsed 73 ms, throughput 16.999 Mpx/s
Compressed 77560 blocks, elapsed 69 ms, throughput 17.984 Mpx/s
Whole A
SubTexture RGB qMSE = 0.6, qPSNR = 50.594453, wSSIM_4x4 = 0.97119160

Expand All @@ -112,7 +112,7 @@ For non-production purposes we can apply a "draft" mode on previously saved https:/

Bc7Compress.exe /draft /nomask /noflip 8192.png 8192.ktx /debug output.png
Image 8192x8192, Texture 8192x8192
Compressed 4194304 blocks, elapsed 295 ms, throughput 227.487 Mpx/s
Compressed 4194304 blocks, elapsed 282 ms, throughput 237.974 Mpx/s
Whole A
SubTexture RGB qMSE = 0.9, qPSNR = 48.734921, wSSIM_4x4 = 0.98443459

Expand Down
9 changes: 4 additions & 5 deletions src/Bc7Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -944,8 +944,7 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
__m128i mpacked = _mm_load_si128(&area.DataMask_I16[i]);
__m256i vpixel = _mm256_broadcastq_epi64(mpacked);

__m256i vbottom = _mm256_set1_epi32(kBlockMaximalColorAlphaError);

__m256i vbottom;
if constexpr (M == 16)
{
__m256i vx = _mm256_load_si256((const __m256i*)state.Values_I16);
Expand Down Expand Up @@ -1013,7 +1012,7 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
vxx = _mm256_blend_epi32(vxx, vyy, 0xAA);
vzz = _mm256_blend_epi32(vzz, vww, 0xAA);

vbottom = _mm256_blendv_epi8(vbottom, vx, _mm256_cmpgt_epi64(vbottom, vx));
vbottom = vx;
vbottom = _mm256_blendv_epi8(vbottom, vz, _mm256_cmpgt_epi64(vbottom, vz));
vbottom = _mm256_blendv_epi8(vbottom, vxx, _mm256_cmpgt_epi64(vbottom, vxx));
vbottom = _mm256_blendv_epi8(vbottom, vzz, _mm256_cmpgt_epi64(vbottom, vzz));
Expand Down Expand Up @@ -1064,7 +1063,7 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat
vx = _mm256_blend_epi32(vx, vy, 0xAA);
vz = _mm256_blend_epi32(vz, vw, 0xAA);

vbottom = _mm256_blendv_epi8(vbottom, vx, _mm256_cmpgt_epi64(vbottom, vx));
vbottom = vx;
vbottom = _mm256_blendv_epi8(vbottom, vz, _mm256_cmpgt_epi64(vbottom, vz));

_mm256_store_si256((__m256i*)errors, vx);
Expand Down Expand Up @@ -1098,7 +1097,7 @@ INLINED int ComputeSubsetTable(const Area& area, const __m128i mweights, Modulat

vx = _mm256_blend_epi32(vx, vy, 0xAA);

vbottom = _mm256_blendv_epi8(vbottom, vx, _mm256_cmpgt_epi64(vbottom, vx));
vbottom = vx;

_mm256_store_si256((__m256i*)errors, vx);
}
Expand Down
8 changes: 4 additions & 4 deletions src/Bc7Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -814,18 +814,18 @@ void InitWeights(bool linearData) noexcept
mweights16 = _mm_insert_epi16(mweights16, gWeightBlue, 3);
mweights16 = _mm_unpacklo_epi64(mweights16, mweights16);

_mm_store_si128(&gWeights32, _mm_unpacklo_epi16(mweights16, _mm_setzero_si128()));
_mm_store_si128(&gWeights32, _mm_cvtepu16_epi32(mweights16));
_mm_store_si128(&gWeightsAGRB, mweights16);
_mm_store_si128(&gWeightsAGR, _mm_and_si128(_mm_set_epi16(0, -1, -1, -1, 0, -1, -1, -1), mweights16));
_mm_store_si128(&gWeightsAGB, _mm_and_si128(_mm_set_epi16(-1, 0, -1, -1, -1, 0, -1, -1), mweights16));
_mm_store_si128(&gWeightsAG, _mm_and_si128(_mm_set_epi16(0, 0, -1, -1, 0, 0, -1, -1), mweights16));
_mm_store_si128(&gWeightsAR, _mm_and_si128(_mm_set_epi16(0, -1, 0, -1, 0, -1, 0, -1), mweights16));
_mm_store_si128(&gWeightsAGAG, _mm_shuffle_epi32(mweights16, 0));
_mm_store_si128(&gWeightsARAR, _mm_shufflehi_epi16(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(2, 0, 2, 0)), _MM_SHUFFLE(2, 0, 2, 0)));
_mm_store_si128(&gWeightsARAR, _mm_shuffle_epi32(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(2, 0, 2, 0)), 0));

_mm_store_si128(&gWeightsGRB, _mm_and_si128(_mm_set_epi16(-1, -1, -1, 0, -1, -1, -1, 0), mweights16));
_mm_store_si128(&gWeightsGRGR, _mm_shufflehi_epi16(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(2, 1, 2, 1)), _MM_SHUFFLE(2, 1, 2, 1)));
_mm_store_si128(&gWeightsGBGB, _mm_shufflehi_epi16(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)));
_mm_store_si128(&gWeightsGRGR, _mm_shuffle_epi32(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(2, 1, 2, 1)), 0));
_mm_store_si128(&gWeightsGBGB, _mm_shuffle_epi32(_mm_shufflelo_epi16(mweights16, _MM_SHUFFLE(3, 1, 3, 1)), 0));

#if !defined(OPTION_LIBRARY)
if (linearData)
Expand Down

0 comments on commit 6633207

Please sign in to comment.