From 671bd63f05fe462dba45e8cde9ee71cf9f74a57b Mon Sep 17 00:00:00 2001 From: elasota Date: Tue, 21 Aug 2018 03:04:17 -0400 Subject: [PATCH] Rename InputBlock to PixelBlock, add decompression functions, fix GCC/Clang --- ConvectionKernels/ConvectionKernels.cpp | 637 ++++++++++++++++++++++-- ConvectionKernels/ConvectionKernels.h | 30 +- 2 files changed, 601 insertions(+), 66 deletions(-) diff --git a/ConvectionKernels/ConvectionKernels.cpp b/ConvectionKernels/ConvectionKernels.cpp index 80340653..b807d88f 100644 --- a/ConvectionKernels/ConvectionKernels.cpp +++ b/ConvectionKernels/ConvectionKernels.cpp @@ -43,6 +43,7 @@ Licensed under the MIT License. #include #include #include +#include #define UNREFERENCED_PARAMETER(n) ((void)n) @@ -495,7 +496,7 @@ namespace cvtt return result; } - static void ConvertLDRInputs(const InputBlockU8* inputBlocks, int pxOffset, int channel, UInt15& chOut) + static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15& chOut) { int16_t values[8]; for (int i = 0; i < 8; i++) @@ -504,7 +505,7 @@ namespace cvtt chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); } - static void ConvertHDRInputs(const InputBlockF16* inputBlocks, int pxOffset, int channel, SInt16& chOut) + static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16& chOut) { int16_t values[8]; for (int i = 0; i < 8; i++) @@ -1276,12 +1277,12 @@ namespace cvtt return 1.0f / v; } - static void ConvertLDRInputs(const InputBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut) + static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut) { chOut = inputBlocks[0].m_pixels[pxOffset][channel]; } - static void ConvertHDRInputs(const InputBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut) + static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut) { chOut = inputBlocks[0].m_pixels[pxOffset][channel]; } @@ -1623,6 +1624,19 @@ namespace cvtt { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false } // 7 }; + const int g_weight2[] = { 0, 21, 43, 64 }; + const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + + const int *g_weightTables[] = + { + NULL, + NULL, + g_weight2, + g_weight3, + g_weight4 + }; + struct BC6HModeInfo { uint16_t m_modeID; @@ -2515,6 +2529,37 @@ namespace cvtt } }; + + struct UnpackingVector + { + uint32_t m_vector[4]; + + void Init(const uint8_t *bytes) + { + for (int i = 0; i < 4; i++) + m_vector[i] = 0; + + for (int b = 0; b < 16; b++) + m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8)); + } + + inline ParallelMath::ScalarUInt16 Unpack(int bits) + { + uint32_t bitMask = (1 << bits) - 1; + + ParallelMath::ScalarUInt16 result = static_cast(m_vector[0] & bitMask); + + for (int i = 0; i < 4; i++) + { + m_vector[i] >>= bits; + if (i != 3) + m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits); + } + + return result; + } + }; + void ComputeTweakFactors2(int tweak, int range, float* outFactors) { int totalUnits = range - 1; @@ -2597,8 +2642,11 @@ namespace cvtt } UnfinishedEndpoints(const UnfinishedEndpoints& other) - : UnfinishedEndpoints(other.m_base, other.m_offset) { + for (int ch = 0; ch < TVectorSize; ch++) + m_base[ch] = other.m_base[ch]; + for (int ch = 0; ch < TVectorSize; ch++) + m_offset[ch] = other.m_offset[ch]; } void FinishHDRUnsigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode) @@ -3017,25 +3065,6 @@ namespace cvtt return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn); } - void SelectIndexRangeLDR(const MFloat* pixel, MUInt15 &outLowIndex, MUInt15 &outHighIndex) const - { - MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0]; - for (int ch = 1; ch < TVectorSize; ch++) - dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch]; - - dist = ParallelMath::Clamp(dist, 0.0f, m_maxValue); - - { - ParallelMath::RoundTowardZeroForScope rtz; - outLowIndex = ParallelMath::RoundAndConvertToU15(dist, &rtz); - } - - { - ParallelMath::RoundUpForScope ru; - outHighIndex = ParallelMath::RoundAndConvertToU15(dist, &rtz); - } - } - protected: MAInt16 m_endPoint[2][TVectorSize]; @@ -3516,7 +3545,7 @@ namespace cvtt MUInt15 m_indexSelector; MUInt15 m_rotation; } m_isr; - }; + } m_u; }; static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2]) @@ -4337,7 +4366,7 @@ namespace cvtt work.m_error = ParallelMath::Min(totalError, work.m_error); ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode)); - ParallelMath::ConditionalSet(work.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition)); + ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition)); } } } @@ -4617,8 +4646,8 @@ namespace cvtt work.m_error = ParallelMath::Min(combinedError, work.m_error); ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode)); - ParallelMath::ConditionalSet(work.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation)); - ParallelMath::ConditionalSet(work.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector)); + ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation)); + ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector)); for (int px = 0; px < 16; px++) { @@ -4642,7 +4671,7 @@ namespace cvtt b = temp; } - static void Pack(uint32_t flags, const InputBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds) + static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds) { MUInt15 pixels[16][4]; MFloat floatPixels[16][4]; @@ -4676,8 +4705,8 @@ namespace cvtt pv.Init(); ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block); - ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_partition, block); - ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_isr.m_indexSelector, block); + ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block); + ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block); const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode]; @@ -4780,7 +4809,7 @@ namespace cvtt if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) { - ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_isr.m_rotation, block); + ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block); pv.Pack(rotation, 2); } @@ -4872,6 +4901,225 @@ namespace cvtt packedBlocks += 16; } } + + static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock) + { + UnpackingVector pv; + pv.Init(packedBlock); + + int mode = 8; + for (int i = 0; i < 8; i++) + { + if (pv.Unpack(1) == 1) + { + mode = i; + break; + } + } + + if (mode > 7) + { + for (int px = 0; px < 16; px++) + for (int ch = 0; ch < 4; ch++) + output.m_pixels[px][ch] = 0; + + return; + } + + const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode]; + + int partition = 0; + if (modeInfo.m_partitionBits) + partition = pv.Unpack(modeInfo.m_partitionBits); + + int rotation = 0; + if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) + rotation = pv.Unpack(2); + + int indexSelector = 0; + if (modeInfo.m_hasIndexSelector) + indexSelector = pv.Unpack(1); + + // Resolve fixups + int fixups[3] = { 0, 0, 0 }; + + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate) + { + if (modeInfo.m_numSubsets == 2) + fixups[1] = BC7Data::g_fixupIndexes2[partition]; + else if (modeInfo.m_numSubsets == 3) + { + fixups[1] = BC7Data::g_fixupIndexes3[partition][0]; + fixups[2] = BC7Data::g_fixupIndexes3[partition][1]; + } + } + + int endPoints[3][2][4]; + + // Decode RGB + for (int ch = 0; ch < 3; ch++) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits)); + } + } + + // Decode alpha + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits)); + } + } + else + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + endPoints[subset][ep][3] = 255; + } + } + + int parityBits = 0; + + // Decode parity bits + if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + int p = pv.Unpack(1); + + for (int ep = 0; ep < 2; ep++) + { + for (int ch = 0; ch < 3; ch++) + endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits); + + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) + endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits); + } + } + + parityBits = 1; + } + else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + int p = pv.Unpack(1); + + for (int ch = 0; ch < 3; ch++) + endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits); + + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) + endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits); + } + } + + parityBits = 1; + } + + // Fill endpoint bits + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + for (int ch = 0; ch < 3; ch++) + endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits)); + + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) + endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits)); + } + } + + int indexes[16]; + int indexes2[16]; + + // Decode indexes + for (int px = 0; px < 16; px++) + { + int bits = modeInfo.m_indexBits; + if ((px == 0) || (px == fixups[1]) || (px == fixups[2])) + bits--; + + indexes[px] = pv.Unpack(bits); + } + + // Decode secondary indexes + if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) + { + for (int px = 0; px < 16; px++) + { + int bits = modeInfo.m_alphaIndexBits; + if (px == 0) + bits--; + + indexes2[px] = pv.Unpack(bits); + } + } + else + { + for (int px = 0; px < 16; px++) + indexes2[px] = 0; + } + + const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits]; + const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits]; + + // Decode each pixel + for (int px = 0; px < 16; px++) + { + int rgbWeight = 0; + int alphaWeight = 0; + + int rgbIndex = indexes[px]; + + rgbWeight = rgbWeights[indexes[px]]; + + if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) + alphaWeight = rgbWeight; + else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) + alphaWeight = alphaWeights[indexes2[px]]; + + if (indexSelector == 1) + { + int temp = rgbWeight; + rgbWeight = alphaWeight; + alphaWeight = temp; + } + + int pixel[4] = { 0, 0, 0, 255 }; + + int subset = 0; + + if (modeInfo.m_numSubsets == 2) + subset = (BC7Data::g_partitionMap[partition] >> px) & 1; + else if (modeInfo.m_numSubsets == 3) + subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3; + + for (int ch = 0; ch < 3; ch++) + pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6; + + if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) + pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6; + + if (rotation != 0) + { + int ch = rotation - 1; + int temp = pixel[ch]; + pixel[ch] = pixel[3]; + pixel[3] = temp; + } + + for (int ch = 0; ch < 4; ch++) + output.m_pixels[px][ch] = static_cast(pixel[ch]); + } + } }; class BC6HComputer @@ -5128,7 +5376,7 @@ namespace cvtt outIsLegal = allLegal; } - static void Pack(uint32_t flags, const InputBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds) + static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds) { if (numTweakRounds < 1) numTweakRounds = 1; @@ -5530,6 +5778,253 @@ namespace cvtt pv.Flush(packedBlocks + 16 * block); } } + + static void SignExtendSingle(int &v, int bits) + { + if (v & (1 << (bits - 1))) + v |= -(1 << bits); + } + + static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned) + { + UnpackingVector pv; + pv.Init(pBC); + + int numModeBits = 2; + int modeBits = pv.Unpack(2); + if (modeBits != 0 && modeBits != 1) + { + modeBits |= pv.Unpack(3) << 2; + numModeBits += 3; + } + + int mode = -1; + for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++) + { + if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits) + { + mode = possibleMode; + break; + } + } + + if (mode < 0) + { + for (int px = 0; px < 16; px++) + { + for (int ch = 0; ch < 3; ch++) + output.m_pixels[px][ch] = 0; + output.m_pixels[px][3] = 0x3c00; // 1.0 + } + return; + } + + const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode]; + const size_t headerBits = modeInfo.m_partitioned ? 82 : 65; + const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode]; + + int32_t partition = 0; + int32_t eps[2][2][3]; + + for (int subset = 0; subset < 2; subset++) + for (int epi = 0; epi < 2; epi++) + for (int ch = 0; ch < 3; ch++) + eps[subset][epi][ch] = 0; + + for (size_t i = numModeBits; i < headerBits; i++) + { + int32_t *pCodedValue = NULL; + + switch (desc[i].m_eField) + { + case BC6HData::D: pCodedValue = &partition; break; + case BC6HData::RW: pCodedValue = &eps[0][0][0]; break; + case BC6HData::RX: pCodedValue = &eps[0][1][0]; break; + case BC6HData::RY: pCodedValue = &eps[1][0][0]; break; + case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break; + case BC6HData::GW: pCodedValue = &eps[0][0][1]; break; + case BC6HData::GX: pCodedValue = &eps[0][1][1]; break; + case BC6HData::GY: pCodedValue = &eps[1][0][1]; break; + case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break; + case BC6HData::BW: pCodedValue = &eps[0][0][2]; break; + case BC6HData::BX: pCodedValue = &eps[0][1][2]; break; + case BC6HData::BY: pCodedValue = &eps[1][0][2]; break; + case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break; + default: assert(false); break; + } + + (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit; + } + + + uint16_t modeID = modeInfo.m_modeID; + + int fixupIndex1 = 0; + int indexBits = 4; + int numSubsets = 1; + if (modeInfo.m_partitioned) + { + fixupIndex1 = BC7Data::g_fixupIndexes2[partition]; + indexBits = 3; + numSubsets = 2; + } + + int indexes[16]; + for (int px = 0; px < 16; px++) + { + if (px == 0 || px == fixupIndex1) + indexes[px] = pv.Unpack(indexBits - 1); + else + indexes[px] = pv.Unpack(indexBits); + } + + if (modeInfo.m_partitioned) + { + for (int ch = 0; ch < 3; ch++) + { + if (isSigned) + SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec); + if (modeInfo.m_transformed || isSigned) + { + SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]); + SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]); + SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]); + } + } + } + else + { + for (int ch = 0; ch < 3; ch++) + { + if (isSigned) + SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec); + if (modeInfo.m_transformed || isSigned) + SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]); + } + } + + int aPrec = modeInfo.m_aPrec; + + if (modeInfo.m_transformed) + { + for (int ch = 0; ch < 3; ch++) + { + int wrapMask = (1 << aPrec) - 1; + + eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask); + if (isSigned) + SignExtendSingle(eps[0][1][ch], aPrec); + + if (modeInfo.m_partitioned) + { + eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask); + eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask); + + if (isSigned) + { + SignExtendSingle(eps[1][0][ch], aPrec); + SignExtendSingle(eps[1][1][ch], aPrec); + } + } + } + } + + // Unquantize endpoints + for (int subset = 0; subset < numSubsets; subset++) + { + for (int epi = 0; epi < 2; epi++) + { + for (int ch = 0; ch < 3; ch++) + { + int &v = eps[subset][epi][ch]; + + if (isSigned) + { + if (aPrec >= 16) + { + // Nothing + } + else + { + bool s = false; + int comp = v; + if (v < 0) + { + s = true; + comp = -comp; + } + + int unq = 0; + if (comp == 0) + unq = 0; + else if (comp >= ((1 << (aPrec - 1)) - 1)) + unq = 0x7fff; + else + unq = ((comp << 15) + 0x4000) >> (aPrec - 1); + + if (s) + unq = -unq; + + v = unq; + } + } + else + { + if (aPrec >= 15) + { + // Nothing + } + else if (v == 0) + { + // Nothing + } + else if (v == ((1 << aPrec) - 1)) + v = 0xffff; + else + v = ((v << 16) + 0x8000) >> aPrec; + } + } + } + } + + const int *weights = BC7Data::g_weightTables[indexBits]; + + for (int px = 0; px < 16; px++) + { + int subset = 0; + if (modeInfo.m_partitioned) + subset = (BC7Data::g_partitionMap[partition] >> px) & 1; + + int w = weights[indexes[px]]; + for (int ch = 0; ch < 3; ch++) + { + int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6; + + if (isSigned) + { + if (comp < 0) + comp = -(((-comp) * 31) >> 5); + else + comp = (comp * 31) >> 5; + + int s = 0; + if (comp < 0) + { + s = 0x8000; + comp = -comp; + } + + output.m_pixels[px][ch] = static_cast(s | comp); + } + else + { + comp = (comp * 31) >> 6; + output.m_pixels[px][ch] = static_cast(comp); + } + } + output.m_pixels[px][3] = 0x3c00; // 1.0 + } + } }; namespace S3TCSingleColorTables @@ -6093,10 +6588,10 @@ namespace cvtt MUInt15 endPoints[2][3]; refiner.GetRefinedEndpointsLDR(endPoints, rtn); - TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, nullptr, rtn); + TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn); } - static void PackExplicitAlpha(uint32_t flags, const InputBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) + static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) { UNREFERENCED_PARAMETER(flags); ParallelMath::RoundTowardNearestForScope rtn; @@ -6136,7 +6631,7 @@ namespace cvtt } } - static void PackInterpolatedAlpha(uint32_t flags, const InputBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) + static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) { if (maxTweakRounds < 1) maxTweakRounds = 1; @@ -6510,7 +7005,7 @@ namespace cvtt } } - static void PackRGB(uint32_t flags, const InputBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) + static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) { ParallelMath::RoundTowardNearestForScope rtn; @@ -6541,7 +7036,7 @@ namespace cvtt if (alphaTest) { - MUInt15 threshold = ParallelMath::MakeUInt15(static_cast(floorf(alphaThreshold * 255.0f + 0.5f))); + MUInt15 threshold = ParallelMath::MakeUInt15(static_cast(floor(alphaThreshold * 255.0f + 0.5f))); for (int px = 0; px < 16; px++) { @@ -6849,12 +7344,12 @@ namespace cvtt }; // Signed input blocks are converted into unsigned space, with the maximum value being 254 - void BiasSignedInput(InputBlockU8 inputNormalized[ParallelMath::ParallelSize], const InputBlockS8 inputSigned[ParallelMath::ParallelSize]) + void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize]) { for (size_t block = 0; block < ParallelMath::ParallelSize; block++) { - const InputBlockS8& inputSignedBlock = inputSigned[block]; - InputBlockU8& inputNormalizedBlock = inputNormalized[block]; + const PixelBlockS8& inputSignedBlock = inputSigned[block]; + PixelBlockU8& inputNormalizedBlock = inputNormalized[block]; for (size_t px = 0; px < 16; px++) { @@ -6880,7 +7375,7 @@ namespace cvtt namespace Kernels { - void EncodeBC7(uint8_t *pBC, const InputBlockU8 *pBlocks, const cvtt::Options &options) + void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options) { assert(pBlocks); assert(pBC); @@ -6895,7 +7390,7 @@ namespace cvtt } } - void EncodeBC6HU(uint8_t *pBC, const InputBlockF16 *pBlocks, const cvtt::Options &options) + void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options) { assert(pBlocks); assert(pBC); @@ -6910,7 +7405,7 @@ namespace cvtt } } - void EncodeBC6HS(uint8_t *pBC, const InputBlockF16 *pBlocks, const cvtt::Options &options) + void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options) { assert(pBlocks); assert(pBC); @@ -6925,7 +7420,7 @@ namespace cvtt } } - void EncodeBC1(uint8_t *pBC, const InputBlockU8 *pBlocks, const cvtt::Options &options) + void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options) { assert(pBlocks); assert(pBC); @@ -6940,7 +7435,7 @@ namespace cvtt } } - void EncodeBC2(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options) + void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -6956,7 +7451,7 @@ namespace cvtt } } - void EncodeBC3(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options) + void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -6972,7 +7467,7 @@ namespace cvtt } } - void EncodeBC4U(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options) + void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -6987,7 +7482,7 @@ namespace cvtt } } - void EncodeBC4S(uint8_t *pBC, const InputBlockS8 *pBlocks, const Options &options) + void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -6997,7 +7492,7 @@ namespace cvtt for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) { - InputBlockU8 inputBlocks[ParallelMath::ParallelSize]; + PixelBlockU8 inputBlocks[ParallelMath::ParallelSize]; Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase); Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC); @@ -7005,7 +7500,7 @@ namespace cvtt } } - void EncodeBC5U(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options) + void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -7021,7 +7516,7 @@ namespace cvtt } } - void EncodeBC5S(uint8_t *pBC, const InputBlockS8 *pBlocks, const Options &options) + void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options) { assert(pBlocks); assert(pBC); @@ -7031,7 +7526,7 @@ namespace cvtt for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) { - InputBlockU8 inputBlocks[ParallelMath::ParallelSize]; + PixelBlockU8 inputBlocks[ParallelMath::ParallelSize]; Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase); Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC); @@ -7039,5 +7534,41 @@ namespace cvtt pBC += ParallelMath::ParallelSize * 16; } } + + void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC) + { + assert(pBlocks); + assert(pBC); + + for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) + { + Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC); + pBC += 16; + } + } + + void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC) + { + assert(pBlocks); + assert(pBC); + + for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) + { + Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false); + pBC += 16; + } + } + + void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC) + { + assert(pBlocks); + assert(pBC); + + for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) + { + Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true); + pBC += 16; + } + } } } diff --git a/ConvectionKernels/ConvectionKernels.h b/ConvectionKernels/ConvectionKernels.h index 495e5417..9703dc91 100644 --- a/ConvectionKernels/ConvectionKernels.h +++ b/ConvectionKernels/ConvectionKernels.h @@ -82,19 +82,19 @@ namespace cvtt }; // RGBA input block for unsigned 8-bit formats - struct InputBlockU8 + struct PixelBlockU8 { uint8_t m_pixels[16][4]; }; // RGBA input block for signed 8-bit formats - struct InputBlockS8 + struct PixelBlockS8 { int8_t m_pixels[16][4]; }; // RGBA input block for half-precision float formats (bit-cast to int16_t) - struct InputBlockF16 + struct PixelBlockF16 { int16_t m_pixels[16][4]; }; @@ -102,16 +102,20 @@ namespace cvtt namespace Kernels { // NOTE: All functions accept and output NumParallelBlocks blocks at once - void EncodeBC1(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); - void EncodeBC2(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); - void EncodeBC3(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); - void EncodeBC4U(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); - void EncodeBC4S(uint8_t *pBC, const InputBlockS8 *pBlocks, const Options &options); - void EncodeBC5U(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); - void EncodeBC5S(uint8_t *pBC, const InputBlockS8 *pBlocks, const Options &options); - void EncodeBC6HU(uint8_t *pBC, const InputBlockF16 *pBlocks, const Options &options); - void EncodeBC6HS(uint8_t *pBC, const InputBlockF16 *pBlocks, const Options &options); - void EncodeBC7(uint8_t *pBC, const InputBlockU8 *pBlocks, const Options &options); + void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options); + void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options); + void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options); + void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options); + void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options); + + void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC); + void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC); + void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC); } }