From 049b99bed605e80944dd4a7b0a45020c89da6726 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Mon, 3 Jul 2023 22:58:36 +0200 Subject: [PATCH 01/34] Investigating block sizes --- src/block_rANS32x32_16w.cpp | 299 ++++++++++++++++++++++++++++++++++++ src/block_rANS32x32_16w.h | 22 +++ src/hist.cpp | 35 +++-- src/hist.h | 4 + src/main.cpp | 113 +++++++++----- 5 files changed, 420 insertions(+), 53 deletions(-) create mode 100644 src/block_rANS32x32_16w.cpp create mode 100644 src/block_rANS32x32_16w.h diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp new file mode 100644 index 0000000..76d80fb --- /dev/null +++ b/src/block_rANS32x32_16w.cpp @@ -0,0 +1,299 @@ +#include "block_rANS32x32_16w.h" + +#include "hist.h" +#include "simd_platform.h" + +#include +#include + +#include +#include + +constexpr size_t StateCount = 32; // Needs to be a power of two. +constexpr bool EncodeNoBranch = false; +//constexpr bool DecodeNoBranch = false; +constexpr size_t SafeHistBitMax = 0; +constexpr size_t MinBlockSize = 1 << 15; + +template +struct HistReplaceMul +{ + constexpr static size_t GetValue(); +}; + +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 200; } }; + +size_t block_rANS32x32_16w_capacity(const size_t inputSize) +{ + const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); + const size_t blockCount = (inputSize + MinBlockSize) / MinBlockSize + 1; + const size_t perBlockExtraSize = 
sizeof(uint64_t) + 256 * sizeof(uint16_t); + + return baseSize + blockCount * perBlockExtraSize; // i hope this covers all of our bases. +} + +////////////////////////////////////////////////////////////////////////// + +template +size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) +{ + if (outCapacity < block_rANS32x32_16w_capacity(length)) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + + uint32_t states[StateCount]; + uint16_t *pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); + uint16_t *pStart = pEnd; + uint16_t *pBlockBack = pStart; + size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); + size_t blockLowCmp = blockLowI + StateCount; + + size_t histCount = 1; + size_t histPotentialCount = 1; + size_t histDiff = 0; + size_t histPotentialDiff = 0; + size_t histRejectedDiff = 0; + + if (blockLowI > MinBlockSize) + blockLowI -= MinBlockSize; + + uint32_t symCount[256]; + observe_hist(symCount, pInData + blockLowI, length - blockLowI); + + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + hist_t hist; + normalize_hist(&hist, symCount, length - blockLowI, TotalSymbolCountBits); + + // Init States. 
+ for (size_t i = 0; i < StateCount; i++) + states[i] = DecodeConsumePoint16; + + const uint8_t idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; + static_assert(sizeof(idx2idx) == StateCount); + + int64_t i = length - 1; + i &= ~(size_t)(StateCount - 1); + i += StateCount; + + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = idx2idx[j]; + + if (i - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pStart = (uint16_t)(state & 0xFFFF); + *pStart -= (size_t)write; + state = write ? state >> 16 : state; + } + else + { + if (state >= max) + { + *pStart = (uint16_t)(state & 0xFFFF); + pStart--; + state >>= 16; + } + } + + states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount); + } + } + + i -= StateCount; + + while (true) + { + for (; i >= (int64_t)blockLowCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pStart = (uint16_t)(state & 0xFFFF); + *pStart -= (size_t)write; + state = write ? 
state >> 16 : state; + } + else + { + if (state >= max) + { + *pStart = (uint16_t)(state & 0xFFFF); + pStart--; + state >>= 16; + } + } + + states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount); + } + } + + if (i == 0) + break; + + // Potentially replace histogram. + { + blockLowI = i - MinBlockSize; + blockLowCmp = blockLowI + StateCount; + + memset(symCount, 0, sizeof(symCount)); + observe_hist(symCount, pInData + blockLowI, MinBlockSize); + + bool mustReplaceHist = false; + + if constexpr (!IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] > 0 && hist.symbolCount[j] == 0) + { + mustReplaceHist = true; + normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits); + break; + } + } + } + + if (!mustReplaceHist) + { + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + hist_t newHist; + + if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize) + { + for (size_t j = 0; j < 256; j++) + newHist.symbolCount[j] = (uint16_t)symCount[j]; + } + else + { + normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits); + } + + size_t accumAbsDiff = 0; + + for (size_t j = 0; j < 256; j++) + accumAbsDiff += (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]); + + histPotentialCount++; + histPotentialDiff += accumAbsDiff; + + constexpr size_t histReplacePoint = ((1 << TotalSymbolCountBits) * HistReplaceMul::GetValue()) >> 10; + + if (accumAbsDiff >= histReplacePoint) + { + histDiff += accumAbsDiff; + mustReplaceHist = true; + hist = newHist; + } + else + { + histRejectedDiff += accumAbsDiff; + } + } + + if (mustReplaceHist) + { + const uint64_t blockSize = pBlockBack - pStart; + + pStart++; + pStart -= 256; + memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount)); + pStart -= sizeof(uint64_t); + memcpy(pStart, &blockSize, sizeof(blockSize)); + + pStart--; + pBlockBack = pStart; + + 
histCount++; + } + } + } + + uint8_t *pWrite = pOutData; + size_t outIndex = 0; + + *reinterpret_cast(pWrite + outIndex) = (uint64_t)length; + outIndex += sizeof(uint64_t); + + // compressed expected length. + outIndex += sizeof(uint64_t); + + for (size_t j = 0; j < 256; j++) + { + *reinterpret_cast(pWrite + outIndex) = hist.symbolCount[j]; + outIndex += sizeof(uint16_t); + } + + for (size_t j = 0; j < StateCount; j++) + { + *reinterpret_cast(pWrite + outIndex) = states[j]; + outIndex += sizeof(uint32_t); + } + + const size_t size = (pEnd - pStart) * sizeof(uint16_t); + + memmove(pWrite + outIndex, pStart + 1, size); + outIndex += size; + + *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. + + printf("\t>>>>> %" PRIu64 " / %" PRIu64 " histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3fk, total: %5.3fk, rejected: %5.3fk\n", histCount, histPotentialCount, (length / 1024.0) / histCount, (histDiff / 1024.0) / histCount, (histPotentialDiff / 1024.0) / histPotentialCount, (histRejectedDiff / 1024.0) / (histPotentialCount - histCount)); + + return outIndex; +} + + + +////////////////////////////////////////////////////////////////////////// + +size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<14>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<13>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { 
return block_rANS32x32_16w_encode_scalar<12>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<11>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<10>(pInData, length, pOutData, outCapacity); } + +size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } +size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } +size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } +size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } +size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } +size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } diff --git a/src/block_rANS32x32_16w.h b/src/block_rANS32x32_16w.h new file mode 100644 index 0000000..47b6bb3 --- /dev/null +++ b/src/block_rANS32x32_16w.h @@ -0,0 +1,22 @@ +#ifndef block_rANS32x32_16w_h__ 
+#define block_rANS32x32_16w_h__ + +#include "hist.h" + +size_t block_rANS32x32_16w_capacity(const size_t inputSize); + +size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); + +size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); + +#endif // block_rANS32x32_16w_h__ diff --git a/src/hist.cpp b/src/hist.cpp index 3aab176..4124ef7 100644 --- a/src/hist.cpp +++ b/src/hist.cpp @@ -4,21 +4,19 @@ ////////////////////////////////////////////////////////////////////////// -void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const 
size_t totalSymbolCountBits) +void observe_hist(uint32_t hist[256], const uint8_t *pData, const size_t size) { - uint32_t hist[256]; - memset(hist, 0, sizeof(hist)); - - const uint32_t totalSymbolCount = ((uint32_t)1 << totalSymbolCountBits); + memset(hist, 0, sizeof(uint32_t) * 256); for (size_t i = 0; i < size; i++) hist[pData[i]]++; +} - uint32_t counter = 0; - - for (size_t i = 0; i < 256; i++) - counter += hist[i]; +void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBytes, const size_t totalSymbolCountBits) +{ + const uint32_t totalSymbolCount = ((uint32_t)1 << totalSymbolCountBits); + size_t counter = dataBytes; uint16_t capped[256]; size_t cappedSum = 0; @@ -73,7 +71,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz } else { - const uint32_t div = counter / totalSymbolCount; + const uint32_t div = (uint32_t)(counter / (size_t)totalSymbolCount); if (div) { @@ -91,7 +89,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz } else { - const uint32_t mul = totalSymbolCount / counter; + const uint32_t mul = (uint32_t)((size_t)totalSymbolCount / counter); for (size_t i = 0; i < 256; i++) { @@ -109,13 +107,13 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz while (true) { - size_t found = totalSymbolCount; + size_t found = totalSymbolCount + 1; for (size_t i = 0; i < 256; i++) if (capped[i] > target && capped[i] < found) found = capped[i]; - if (found == totalSymbolCount) + if (found == totalSymbolCount + 1) break; for (size_t i = 0; i < 256; i++) @@ -136,7 +134,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz while (cappedSum < totalSymbolCount) // Start a charity. 
{ - size_t target = totalSymbolCount; + size_t target = totalSymbolCount + 1; while (true) { @@ -156,7 +154,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz capped[i]++; cappedSum++; - if (cappedSum == totalSymbolCount) + if (cappedSum == totalSymbolCount + 1) goto hist_ready; } } @@ -177,6 +175,13 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz } } +void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits) +{ + uint32_t hist[256]; + observe_hist(hist, pData, size); + normalize_hist(pHist, hist, size, totalSymbolCountBits); +} + void make_enc_hist(hist_enc_t *pHistEnc, const hist_t *pHist) { for (size_t i = 0; i < 256; i++) diff --git a/src/hist.h b/src/hist.h index 1a1c449..78d7074 100644 --- a/src/hist.h +++ b/src/hist.h @@ -51,6 +51,10 @@ struct hist_dec_pack_t ////////////////////////////////////////////////////////////////////////// +void observe_hist(uint32_t hist[256], const uint8_t *pData, const size_t size); + +void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBytes, const size_t totalSymbolCountBits); + // `totalSymbolCountBits` should be <= 15 void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits); diff --git a/src/main.cpp b/src/main.cpp index 44d7e9a..15a9075 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,6 +10,7 @@ #include "rANS32x32_16w.h" #include "rANS32x16_16w.h" #include "rANS32x64_16w.h" +#include "block_rANS32x32_16w.h" #ifdef _WIN32 #include @@ -48,6 +49,7 @@ static bool _OnlyRelevantCodecs = true; static size_t _HistMax = 15; static size_t _HistMin = 10; static bool _Include32Block = false; +static bool _IncludeRaw = false; static size_t _RunCount = 8; constexpr size_t MaxRunCount = 256; @@ -139,42 +141,55 @@ struct codec_info_t func_info_t decoders[MaxDecoderCount]; }; +template +size_t encode_no_hist_wrapper(const uint8_t *pInData, const 
size_t length, uint8_t *pOutData, const size_t outCapacity, const hist_t *) +{ + return func(pInData, length, pOutData, outCapacity); +} + static codec_info_t _Codecs[] = { - { "rANS32x32 16w", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, - { "rANS32x32 16w", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, - { "rANS32x32 16w", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec 
avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, - { "rANS32x32 16w", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", 
rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, - { "rANS32x32 16w", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec 
avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true 
}, {}}}, - { "rANS32x32 16w", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, - - { "rANS32x64 16w", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", 
rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}}, - { "rANS32x64 16w", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec 
avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}}, - { "rANS32x64 16w", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}}, - { "rANS32x64 16w", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x 
gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm 
shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, - { "rANS32x64 16w", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, 
sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, - { "rANS32x64 16w", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 
(xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, - - { "rANS32x32 32blk 16w", 15, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", 
rANS32x32_32blk_16w_decode_avx2_varB2_15, true }, {}}}, - { "rANS32x32 32blk 16w", 14, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_14, true }, {}}}, - { "rANS32x32 32blk 16w", 13, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_13, true }, {}}}, - { "rANS32x32 32blk 16w", 12, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_12, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_12, true }, {}}}, - { "rANS32x32 32blk 16w", 11, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_11, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_11 }, { "dec avx2 (sym idp 
gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_11, true }, {}}}, - { "rANS32x32 32blk 16w", 10, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_10, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_10, true }, {}}}, - - { "rANS32x32 32blk 8w", 15, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_15, true }, {}}}, - { "rANS32x32 32blk 8w", 14, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_14, true }, {}}}, - { "rANS32x32 32blk 8w", 13, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", 
rANS32x32_32blk_8w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_13, true }, {}}}, - { "rANS32x32 32blk 8w", 12, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_12 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_12, true }, {}}}, - { "rANS32x32 32blk 8w", 11, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_11 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_11, true }, {}}}, - { "rANS32x32 32blk 8w", 10, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_10 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_10, true }, {}}}, + { 
"rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + + { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, + { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm 
shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, + { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, + { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", 
rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, + { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", 
rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", 
rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, + { "rANS32x32 16w (raw)", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, 
{ "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, + + { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp 
gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}}, + { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec 
avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}}, + { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", 
rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}}, + { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", 
rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, + { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm 
perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, + { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", 
rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, - { "rANS32x16 16w", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, - { "rANS32x16 16w", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, - { "rANS32x16 16w", 13, {{ "enc scalar", rANS32x16_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, - { "rANS32x16 16w", 12, {{ "enc scalar", 
rANS32x16_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_12 }, {}}}, - { "rANS32x16 16w", 11, {{ "enc scalar", rANS32x16_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_11 }, {}}}, - { "rANS32x16 16w", 10, {{ "enc scalar", rANS32x16_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_10 }, {}}}, + { "rANS32x16 16w (raw)", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", 
rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, + { "rANS32x16 16w (raw)", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, + { "rANS32x16 16w (raw)", 13, {{ "enc scalar", rANS32x16_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, + { "rANS32x16 16w (raw)", 12, {{ "enc scalar", rANS32x16_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", 
rANS32x16_ymmPerm_16w_decode_avx2_varC_12 }, {}}}, + { "rANS32x16 16w (raw)", 11, {{ "enc scalar", rANS32x16_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_11 }, {}}}, + { "rANS32x16 16w (raw)", 10, {{ "enc scalar", rANS32x16_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_10 }, {}}}, + + { "rANS32x32 32blk 16w (raw)", 15, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_15, true }, {}}}, + { "rANS32x32 32blk 16w (raw)", 14, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_14 }, { "dec 
avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_14, true }, {}}}, + { "rANS32x32 32blk 16w (raw)", 13, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_13, true }, {}}}, + { "rANS32x32 32blk 16w (raw)", 12, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_12, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_12, true }, {}}}, + { "rANS32x32 32blk 16w (raw)", 11, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_11, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_11, true }, 
{}}}, + { "rANS32x32 32blk 16w (raw)", 10, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_10, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_10, true }, {}}}, + + { "rANS32x32 32blk 8w (raw)", 15, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_15, true }, {}}}, + { "rANS32x32 32blk 8w (raw)", 14, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_14, true }, {}}}, + { "rANS32x32 32blk 8w (raw)", 13, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", 
rANS32x32_32blk_8w_decode_avx2_varB2_13, true }, {}}}, + { "rANS32x32 32blk 8w (raw)", 12, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_12 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_12, true }, {}}}, + { "rANS32x32 32blk 8w (raw)", 11, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_11 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_11, true }, {}}}, + { "rANS32x32 32blk 8w (raw)", 10, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_10 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_10, true }, {}}}, }; ////////////////////////////////////////////////////////////////////////// @@ -183,6 +198,7 @@ const char ArgumentAllVariants[] = "--all"; const 
char ArgumentHistMin[] = "--hist-min"; const char ArgumentHistMax[] = "--hist-max"; const char ArgumentInclude32Blk[] = "--include-32blk"; +const char ArgumentIncludeRaw[] = "--include-raw"; const char ArgumentNoSleep[] = "--no-sleep"; const char ArgumentCpuCore[] = "--cpu-core"; const char ArgumentRuns[] = "--runs"; @@ -199,7 +215,8 @@ int32_t main(const int32_t argc, char **pArgv) printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMax); printf("\t%s \tRun all implementations of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants); printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore); - printf("\t%s \tInclude 32 block variants (which are generally quite slow)\n", ArgumentInclude32Blk); + printf("\t%s \tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw); + printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw); printf("\t%s \tRun the benchmark for a specified amount of times (default: 8)\n", ArgumentNoSleep); return 1; } @@ -220,6 +237,12 @@ int32_t main(const int32_t argc, char **pArgv) argsRemaining--; _OnlyRelevantCodecs = false; } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeRaw, sizeof(ArgumentIncludeRaw)) == 0) + { + argIndex++; + argsRemaining--; + _IncludeRaw = true; + } else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentInclude32Blk, sizeof(ArgumentInclude32Blk)) == 0) { argIndex++; @@ -318,7 +341,13 @@ int32_t main(const int32_t argc, char **pArgv) pUncompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize); pDecompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize); - compressedDataCapacity = rans_max(rans_max(rans_max(rANS32x64_16w_capacity(fileSize), rANS32x16_16w_capacity(fileSize)), rANS32x32_16w_capacity(fileSize)), rans_max(rANS32x32_32blk_16w_capacity(fileSize), 
rANS32x32_32blk_8w_capacity(fileSize))); + compressedDataCapacity = rANS32x64_16w_capacity(fileSize); + compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x16_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize)); + pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity); if (pUncompressedData == nullptr || pDecompressedData == nullptr || pCompressedData == nullptr) @@ -405,6 +434,7 @@ int32_t main(const int32_t argc, char **pArgv) make_hist(&hist, pUncompressedData, fileSize, _Codecs[codecId].totalSymbolCountBits); bool skipCodec = false; + skipCodec |= (!_IncludeRaw && strstr(_Codecs[codecId].name, " (raw)") != nullptr); skipCodec |= (!_Include32Block && strstr(_Codecs[codecId].name, " 32blk ") != nullptr); skipCodec |= _Codecs[codecId].totalSymbolCountBits > _HistMax; skipCodec |= _Codecs[codecId].totalSymbolCountBits < _HistMin; @@ -466,10 +496,17 @@ int32_t main(const int32_t argc, char **pArgv) printf("\r %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); print_perf_info(fileSize); - const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize); + if (_Codecs[codecId].decoders[0].func != nullptr) + { + const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize); - if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize)) - puts("Failed to validate."); + if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize)) + puts("Failed to validate."); + } + 
else + { + puts("Unable to validate, no decoder available."); + } } size_t decodedSize = 0; From ffb8e634048a25aa0143c11ed747e46a9f724bea Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Tue, 4 Jul 2023 00:29:26 +0200 Subject: [PATCH 02/34] making things worse-better --- src/block_rANS32x32_16w.cpp | 19 ++++++++++--------- src/main.cpp | 12 ++++++------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp index 76d80fb..cb34b82 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w.cpp @@ -21,12 +21,12 @@ struct HistReplaceMul constexpr static size_t GetValue(); }; -template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 200; } }; -template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 200; } }; -template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 200; } }; -template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 200; } }; -template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 200; } }; -template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 200; } }; +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 52450; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 33915; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 16800; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 8140; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 3865; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 2898; } }; size_t block_rANS32x32_16w_capacity(const size_t inputSize) { @@ -210,7 +210,10 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le size_t accumAbsDiff = 
0; for (size_t j = 0; j < 256; j++) - accumAbsDiff += (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]); + { + const size_t diff = (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]); + accumAbsDiff += diff * diff; + } histPotentialCount++; histPotentialDiff += accumAbsDiff; @@ -280,8 +283,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le return outIndex; } - - ////////////////////////////////////////////////////////////////////////// size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); } diff --git a/src/main.cpp b/src/main.cpp index 15a9075..99c5ca1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8 static codec_info_t _Codecs[] = { - { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", 
block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_10, true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm 
perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, From 1bfd2df6b8c0986356eb975be4cbeba548bd0328 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Tue, 4 Jul 2023 07:18:18 +0200 Subject: [PATCH 03/34] block stuff --- src/block_rANS32x32_16w.cpp | 81 +++++++++++++++++++++++++++---------- src/main.cpp | 6 +-- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp index cb34b82..5895fcd 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w.cpp @@ -21,12 +21,12 @@ struct HistReplaceMul constexpr static size_t GetValue(); }; -template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 52450; } }; -template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 33915; } }; -template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 16800; } }; -template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 8140; } }; -template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 3865; } }; -template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 2898; } }; +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 110; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 110; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 110; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 110; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 110; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 90; } }; size_t block_rANS32x32_16w_capacity(const size_t inputSize) { @@ -57,11 +57,14 @@ size_t block_rANS32x32_16w_encode_scalar(const 
uint8_t *pInData, const size_t le size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); size_t blockLowCmp = blockLowI + StateCount; + constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); + constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 10; + size_t histCount = 1; size_t histPotentialCount = 1; - size_t histDiff = 0; - size_t histPotentialDiff = 0; - size_t histRejectedDiff = 0; + int64_t histDiff = 0; + int64_t histPotentialDiff = 0; + int64_t histRejectedDiff = 0; if (blockLowI > MinBlockSize) blockLowI -= MinBlockSize; @@ -167,6 +170,8 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le // Potentially replace histogram. { + histPotentialCount++; + blockLowI = i - MinBlockSize; blockLowCmp = blockLowI + StateCount; @@ -192,8 +197,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le { if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) - if (symCount[j] == 0) - symCount[j] = 1; + symCount[j]++; hist_t newHist; @@ -201,34 +205,67 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le { for (size_t j = 0; j < 256; j++) newHist.symbolCount[j] = (uint16_t)symCount[j]; + + size_t counter = 0; + + for (size_t j = 0; j < 256; j++) + { + newHist.cumul[j] = (uint16_t)counter; + counter += newHist.symbolCount[j]; + } } else { normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits); } - size_t accumAbsDiff = 0; + double costBefore = 0; + double costAfter = 0; - for (size_t j = 0; j < 256; j++) + if constexpr (IsSafeHist) { - const size_t diff = (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]); - accumAbsDiff += diff * diff; + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const double before = (symCount[j] - 1) * log2(hist.symbolCount[j] / (double)totalSymbolCount); + const double after = (symCount[j] - 1) * 
log2(newHist.symbolCount[j] / (double)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + else + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const double before = symCount[j] * log2(hist.symbolCount[j] / (double)totalSymbolCount); + const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } } - histPotentialCount++; - histPotentialDiff += accumAbsDiff; + const double accumDiff = costBefore - costAfter; + + //printf("Block %" PRIu64": %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", histPotentialCount, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), accumDiff * 100.0 / histReplacePoint, accumDiff >= histReplacePoint ? "Accepted" : "Rejected"); - constexpr size_t histReplacePoint = ((1 << TotalSymbolCountBits) * HistReplaceMul::GetValue()) >> 10; + histPotentialDiff += (int64_t)accumDiff; - if (accumAbsDiff >= histReplacePoint) + if (accumDiff >= histReplacePoint) { - histDiff += accumAbsDiff; + histDiff += (int64_t)accumDiff; mustReplaceHist = true; hist = newHist; } else { - histRejectedDiff += accumAbsDiff; + histRejectedDiff += (int64_t)accumDiff; } } @@ -278,7 +315,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. - printf("\t>>>>> %" PRIu64 " / %" PRIu64 " histograms used. approx block size: %6.3f KiB. 
avg diff selected: %5.3fk, total: %5.3fk, rejected: %5.3fk\n", histCount, histPotentialCount, (length / 1024.0) / histCount, (histDiff / 1024.0) / histCount, (histPotentialDiff / 1024.0) / histPotentialCount, (histRejectedDiff / 1024.0) / (histPotentialCount - histCount)); + printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3f, total: %5.3f, rejected: %5.3f\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount, (histDiff * 100.0 / histReplacePoint) / histCount, (histPotentialDiff * 100.0 / histReplacePoint) / histPotentialCount, (histRejectedDiff * 100.0 / histReplacePoint) / (histPotentialCount - histCount)); return outIndex; } diff --git a/src/main.cpp b/src/main.cpp index 99c5ca1..72ecd2e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -488,12 +488,12 @@ int32_t main(const int32_t argc, char **pArgv) _NsPerRun[run] = TicksToNs(endTick - startTick); _ClocksPerRun[run] = endClock - startClock; - printf("\r %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); + printf("\r %-38s | %7.3f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000)); } - printf("\r %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); + printf("\r %-38s | %7.3f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); print_perf_info(fileSize); if (_Codecs[codecId].decoders[0].func != 
nullptr) @@ -505,7 +505,7 @@ int32_t main(const int32_t argc, char **pArgv) } else { - puts("Unable to validate, no decoder available."); + //puts("Unable to validate, no decoder available."); } } From c9d4a8531bba2324a2a4265430f0dbd6e2436cb8 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Tue, 4 Jul 2023 21:56:45 +0200 Subject: [PATCH 04/34] slow block pre-calc --- src/block_rANS32x32_16w.cpp | 280 +++++++++++++++++++++--------------- 1 file changed, 161 insertions(+), 119 deletions(-) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp index 5895fcd..93e7a9e 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w.cpp @@ -21,12 +21,12 @@ struct HistReplaceMul constexpr static size_t GetValue(); }; -template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 110; } }; -template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 110; } }; -template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 110; } }; -template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 110; } }; -template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 110; } }; -template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 90; } }; +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } }; size_t block_rANS32x32_16w_capacity(const size_t inputSize) { @@ -39,6 +39,97 @@ size_t 
block_rANS32x32_16w_capacity(const size_t inputSize) ////////////////////////////////////////////////////////////////////////// +template +static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) +{ + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + + memset(symCount, 0, sizeof(uint32_t) * 256); + observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize); + + // Do we include a symbol that hasn't been included before? + if constexpr (!IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0) + return false; + } + + hist_t newHist; + + if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize) + { + for (size_t j = 0; j < 256; j++) + newHist.symbolCount[j] = (uint16_t)symCount[j]; + + size_t counter = 0; + + for (size_t j = 0; j < 256; j++) + { + newHist.cumul[j] = (uint16_t)counter; + counter += newHist.symbolCount[j]; + } + } + else + { + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + symCount[j]++; + + normalize_hist(&newHist, symCount, MinBlockSize + 256, TotalSymbolCountBits); + } + else + { + normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits); + } + } + + constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); + constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12; + + // this comparison isn't fair or fast, but should be a good starting point hopefully. 
+ double costBefore = 0; + double costAfter = 0; + + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const double before = (symCount[j] - 1) * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount); + const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + else + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const double before = symCount[j] * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount); + const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + + const double diff = costBefore - costAfter; + + //printf("[%8" PRIX64 " ~ %8" PRIX64 "] %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", nextBlockStartOffset, nextBlockStartOffset + nextBlockSize, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), diff * 100.0 / histReplacePoint, diff >= histReplacePoint ? 
"Accepted" : "Rejected"); + + return (diff < histReplacePoint); +} + +////////////////////////////////////////////////////////////////////////// + template size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { @@ -53,24 +144,20 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le uint32_t states[StateCount]; uint16_t *pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); uint16_t *pStart = pEnd; - uint16_t *pBlockBack = pStart; + size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); - size_t blockLowCmp = blockLowI + StateCount; - constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); - constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 10; + if (blockLowI > MinBlockSize) + blockLowI -= MinBlockSize; + + size_t blockLowCmp = blockLowI + StateCount; + size_t blockBackPoint = length; size_t histCount = 1; size_t histPotentialCount = 1; - int64_t histDiff = 0; - int64_t histPotentialDiff = 0; - int64_t histRejectedDiff = 0; - - if (blockLowI > MinBlockSize) - blockLowI -= MinBlockSize; uint32_t symCount[256]; - observe_hist(symCount, pInData + blockLowI, length - blockLowI); + observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) @@ -78,7 +165,28 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le symCount[j] = 1; hist_t hist; - normalize_hist(&hist, symCount, length - blockLowI, TotalSymbolCountBits); + normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); + + while (blockLowI > 0) + { + histPotentialCount++; + + if (_CanExtendHist(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount)) + { + blockLowI -= MinBlockSize; + blockLowCmp -= MinBlockSize; + } + else + { + break; + } + } + + // Performance of this 
could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); + normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); + //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]\n", blockLowI, blockBackPoint); + blockBackPoint = blockLowI; // Init States. for (size_t i = 0; i < StateCount; i++) @@ -164,126 +272,60 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount); } } + + // Write hist. + { + const uint64_t blockSize = blockBackPoint - blockLowI; + + pStart++; + pStart -= 256; + memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount)); + + pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(pStart, &blockSize, sizeof(blockSize)); + + pStart--; + + histCount++; + } if (i == 0) break; // Potentially replace histogram. 
{ - histPotentialCount++; - - blockLowI = i - MinBlockSize; - blockLowCmp = blockLowI + StateCount; + blockLowI -= MinBlockSize; + blockLowCmp -= MinBlockSize; - memset(symCount, 0, sizeof(symCount)); observe_hist(symCount, pInData + blockLowI, MinBlockSize); - bool mustReplaceHist = false; - - if constexpr (!IsSafeHist) - { + if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) - { - if (symCount[j] > 0 && hist.symbolCount[j] == 0) - { - mustReplaceHist = true; - normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits); - break; - } - } - } - - if (!mustReplaceHist) - { - if constexpr (IsSafeHist) - for (size_t j = 0; j < 256; j++) - symCount[j]++; - - hist_t newHist; - - if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize) - { - for (size_t j = 0; j < 256; j++) - newHist.symbolCount[j] = (uint16_t)symCount[j]; - - size_t counter = 0; - - for (size_t j = 0; j < 256; j++) - { - newHist.cumul[j] = (uint16_t)counter; - counter += newHist.symbolCount[j]; - } - } - else - { - normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits); - } - - double costBefore = 0; - double costAfter = 0; - - if constexpr (IsSafeHist) - { - for (size_t j = 0; j < 256; j++) - { - if (symCount[j] == 0) - continue; - - const double before = (symCount[j] - 1) * log2(hist.symbolCount[j] / (double)totalSymbolCount); - const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount); - - costBefore -= before; - costAfter -= after; - } - } - else - { - for (size_t j = 0; j < 256; j++) - { - if (symCount[j] == 0) - continue; - - const double before = symCount[j] * log2(hist.symbolCount[j] / (double)totalSymbolCount); - const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + if (symCount[j] == 0) + symCount[j] = 1; - costBefore -= before; - costAfter -= after; - } - } + normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits); - const double accumDiff = costBefore - 
costAfter; - - //printf("Block %" PRIu64": %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", histPotentialCount, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), accumDiff * 100.0 / histReplacePoint, accumDiff >= histReplacePoint ? "Accepted" : "Rejected"); - - histPotentialDiff += (int64_t)accumDiff; + while (blockLowI > 0) + { + histPotentialCount++; - if (accumDiff >= histReplacePoint) + if (_CanExtendHist(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount)) { - histDiff += (int64_t)accumDiff; - mustReplaceHist = true; - hist = newHist; + blockLowI -= MinBlockSize; + blockLowCmp -= MinBlockSize; } else { - histRejectedDiff += (int64_t)accumDiff; + break; } } - if (mustReplaceHist) - { - const uint64_t blockSize = pBlockBack - pStart; - - pStart++; - pStart -= 256; - memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount)); - pStart -= sizeof(uint64_t); - memcpy(pStart, &blockSize, sizeof(blockSize)); - - pStart--; - pBlockBack = pStart; - - histCount++; - } + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); + normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); + //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]: i: %" PRIX64 "\n", blockLowI, blockBackPoint, i); + blockBackPoint = blockLowI; } } @@ -315,7 +357,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. - printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB. 
avg diff selected: %5.3f, total: %5.3f, rejected: %5.3f\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount, (histDiff * 100.0 / histReplacePoint) / histCount, (histPotentialDiff * 100.0 / histReplacePoint) / histPotentialCount, (histRejectedDiff * 100.0 / histReplacePoint) / (histPotentialCount - histCount)); + printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB.\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount); return outIndex; } From 79fc1db9a70a817eb1643ef0f9760a75af39d32f Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Tue, 4 Jul 2023 21:56:53 +0200 Subject: [PATCH 05/34] cleanup --- src/block_rANS32x32_16w.cpp | 25 ++++++++----------------- src/main.cpp | 18 +++++++++--------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp index 93e7a9e..8a2d0c0 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w.cpp @@ -6,9 +6,6 @@ #include #include -#include -#include - constexpr size_t StateCount = 32; // Needs to be a power of two. constexpr bool EncodeNoBranch = false; //constexpr bool DecodeNoBranch = false; @@ -86,11 +83,11 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs } constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); - constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12; + constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12; // this comparison isn't fair or fast, but should be a good starting point hopefully. 
- double costBefore = 0; - double costAfter = 0; + float costBefore = 0; + float costAfter = 0; if constexpr (IsSafeHist) { @@ -99,8 +96,8 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs if (symCount[j] == 0) continue; - const double before = (symCount[j] - 1) * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount); - const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = (symCount[j] - 1) * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); costBefore -= before; costAfter -= after; @@ -113,17 +110,15 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs if (symCount[j] == 0) continue; - const double before = symCount[j] * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount); - const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount); + const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); costBefore -= before; costAfter -= after; } } - const double diff = costBefore - costAfter; - - //printf("[%8" PRIX64 " ~ %8" PRIX64 "] %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", nextBlockStartOffset, nextBlockStartOffset + nextBlockSize, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), diff * 100.0 / histReplacePoint, diff >= histReplacePoint ? 
"Accepted" : "Rejected"); + const float diff = costBefore - costAfter; return (diff < histReplacePoint); } @@ -185,7 +180,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); - //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]\n", blockLowI, blockBackPoint); blockBackPoint = blockLowI; // Init States. @@ -324,7 +318,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); - //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]: i: %" PRIX64 "\n", blockLowI, blockBackPoint, i); blockBackPoint = blockLowI; } } @@ -357,8 +350,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. - printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. 
approx block size: %6.3f KiB.\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount); - return outIndex; } diff --git a/src/main.cpp b/src/main.cpp index 72ecd2e..15a9075 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8 static codec_info_t _Codecs[] = { - { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", 
encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, @@ -488,12 +488,12 @@ int32_t main(const int32_t argc, char **pArgv) _NsPerRun[run] = TicksToNs(endTick - startTick); _ClocksPerRun[run] = endClock - startClock; - printf("\r %-38s | %7.3f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 
1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); + printf("\r %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000)); } - printf("\r %-38s | %7.3f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); + printf("\r %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); print_perf_info(fileSize); if (_Codecs[codecId].decoders[0].func != nullptr) @@ -505,7 +505,7 @@ int32_t main(const int32_t argc, char **pArgv) } else { - //puts("Unable to validate, no decoder available."); + puts("Unable to validate, no decoder available."); } } From b6d0dec9f8c838b9986b5ac05af614ec3f60d5cb Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 02:36:32 +0200 Subject: [PATCH 06/34] Further cleanup --- src/block_rANS32x32_16w.cpp | 239 +++++++++++++++++------------------- 1 file changed, 111 insertions(+), 128 deletions(-) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp index 8a2d0c0..07be88b 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w.cpp @@ -10,7 +10,8 @@ constexpr size_t StateCount = 32; // Needs to be a power of two. 
constexpr bool EncodeNoBranch = false; //constexpr bool DecodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; -constexpr size_t MinBlockSize = 1 << 15; +constexpr size_t MinBlockSizeBits = 15; +constexpr size_t MinBlockSize = 1 << MinBlockSizeBits; template struct HistReplaceMul @@ -31,11 +32,16 @@ size_t block_rANS32x32_16w_capacity(const size_t inputSize) const size_t blockCount = (inputSize + MinBlockSize) / MinBlockSize + 1; const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t); - return baseSize + blockCount * perBlockExtraSize; // i hope this covers all of our bases. + return baseSize + blockCount * perBlockExtraSize; // inputIndex hope this covers all of our bases. } ////////////////////////////////////////////////////////////////////////// +static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + template static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) { @@ -54,7 +60,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs hist_t newHist; - if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize) + if constexpr (!IsSafeHist && TotalSymbolCountBits == MinBlockSizeBits) { for (size_t j = 0; j < 256; j++) newHist.symbolCount[j] = (uint16_t)symCount[j]; @@ -125,6 +131,56 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs ////////////////////////////////////////////////////////////////////////// +struct _rans_encode_state_t +{ + uint32_t states[StateCount]; + hist_t hist; + uint16_t *pEnd, *pStart; // both compressed. 
+}; + +template +static void rans32x32_16w_encode_internal_scalar(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) +{ + int64_t targetCmp = targetIndex + StateCount; + + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = pState->hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = pState->states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pState->pStart = (uint16_t)(state & 0xFFFF); + *pState->pStart -= (size_t)write; + state = write ? state >> 16 : state; + } + else + { + if (state >= max) + { + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart--; + state >>= 16; + } + } + + pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); + } + } +} + template size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { @@ -136,189 +192,122 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; - uint32_t states[StateCount]; - uint16_t *pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); - uint16_t *pStart = pEnd; + _rans_encode_state_t encodeState; + encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); + encodeState.pStart = encodeState.pEnd; - size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); + size_t inputBlockTargetIndex = (((length - 1) & 
~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); - if (blockLowI > MinBlockSize) - blockLowI -= MinBlockSize; + if (inputBlockTargetIndex > MinBlockSize) + inputBlockTargetIndex -= MinBlockSize; - size_t blockLowCmp = blockLowI + StateCount; size_t blockBackPoint = length; - size_t histCount = 1; - size_t histPotentialCount = 1; - uint32_t symCount[256]; - observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) if (symCount[j] == 0) symCount[j] = 1; - hist_t hist; - normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - while (blockLowI > 0) + while (inputBlockTargetIndex > 0) { - histPotentialCount++; - - if (_CanExtendHist(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount)) - { - blockLowI -= MinBlockSize; - blockLowCmp -= MinBlockSize; - } + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSize; else - { break; - } } // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); - normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); - blockBackPoint = blockLowI; + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = inputBlockTargetIndex; // Init States. 
for (size_t i = 0; i < StateCount; i++) - states[i] = DecodeConsumePoint16; - - const uint8_t idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; - static_assert(sizeof(idx2idx) == StateCount); + encodeState.states[i] = DecodeConsumePoint16; - int64_t i = length - 1; - i &= ~(size_t)(StateCount - 1); - i += StateCount; + int64_t inputIndex = length - 1; + inputIndex &= ~(size_t)(StateCount - 1); + inputIndex += StateCount; for (int64_t j = StateCount - 1; j >= 0; j--) { - const uint8_t index = idx2idx[j]; + const uint8_t index = _Rans32x32_idx2idx[j]; - if (i - (int64_t)StateCount + (int64_t)index < (int64_t)length) + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) { - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = hist.symbolCount[in]; + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; const uint32_t max = EncodeEmitPoint * symbolCount; const size_t stateIndex = j; - uint32_t state = states[stateIndex]; + uint32_t state = encodeState.states[stateIndex]; - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pStart = (uint16_t)(state & 0xFFFF); - *pStart -= (size_t)write; - state = write ? 
state >> 16 : state; - } - else + if (state >= max) { - if (state >= max) - { - *pStart = (uint16_t)(state & 0xFFFF); - pStart--; - state >>= 16; - } + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; } - states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount); + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); } } - i -= StateCount; + inputIndex -= StateCount; while (true) { - for (; i >= (int64_t)blockLowCmp; i -= StateCount) - { - for (int64_t j = StateCount - 1; j >= 0; j--) - { - const uint8_t index = idx2idx[j]; + rans32x32_16w_encode_internal_scalar(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; - - const size_t stateIndex = j; - - uint32_t state = states[stateIndex]; - - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pStart = (uint16_t)(state & 0xFFFF); - *pStart -= (size_t)write; - state = write ? state >> 16 : state; - } - else - { - if (state >= max) - { - *pStart = (uint16_t)(state & 0xFFFF); - pStart--; - state >>= 16; - } - } - - states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount); - } - } - // Write hist. 
{ - const uint64_t blockSize = blockBackPoint - blockLowI; - - pStart++; - pStart -= 256; - memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount)); + const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; - pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(pStart, &blockSize, sizeof(blockSize)); + encodeState.pStart++; + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); - pStart--; + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); - histCount++; + encodeState.pStart--; } - if (i == 0) + if (inputIndex == 0) break; - // Potentially replace histogram. + // Determine new histogram. { - blockLowI -= MinBlockSize; - blockLowCmp -= MinBlockSize; + inputBlockTargetIndex -= MinBlockSize; - observe_hist(symCount, pInData + blockLowI, MinBlockSize); + observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSize); if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) if (symCount[j] == 0) symCount[j] = 1; - normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, MinBlockSize, TotalSymbolCountBits); - while (blockLowI > 0) + while (inputBlockTargetIndex > 0) { - histPotentialCount++; - - if (_CanExtendHist(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount)) - { - blockLowI -= MinBlockSize; - blockLowCmp -= MinBlockSize; - } + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSize; else - { break; - } } // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI); - normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits); - blockBackPoint = blockLowI; + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = inputBlockTargetIndex; } } @@ -331,21 +320,15 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le // compressed expected length. outIndex += sizeof(uint64_t); - for (size_t j = 0; j < 256; j++) - { - *reinterpret_cast(pWrite + outIndex) = hist.symbolCount[j]; - outIndex += sizeof(uint16_t); - } - for (size_t j = 0; j < StateCount; j++) { - *reinterpret_cast(pWrite + outIndex) = states[j]; + *reinterpret_cast(pWrite + outIndex) = encodeState.states[j]; outIndex += sizeof(uint32_t); } - const size_t size = (pEnd - pStart) * sizeof(uint16_t); + const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t); - memmove(pWrite + outIndex, pStart + 1, size); + memmove(pWrite + outIndex, encodeState.pStart + 1, size); outIndex += size; *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. 
From 4c5006184b0c118beb441b9bfe92f8e06b38f35a Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 02:46:46 +0200 Subject: [PATCH 07/34] Even more cleanup --- ...16w.cpp => block_rANS32x32_16w_encode.cpp} | 137 ++++++++++-------- 1 file changed, 77 insertions(+), 60 deletions(-) rename src/{block_rANS32x32_16w.cpp => block_rANS32x32_16w_encode.cpp} (81%) diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w_encode.cpp similarity index 81% rename from src/block_rANS32x32_16w.cpp rename to src/block_rANS32x32_16w_encode.cpp index 07be88b..62cda4f 100644 --- a/src/block_rANS32x32_16w.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -8,7 +8,6 @@ constexpr size_t StateCount = 32; // Needs to be a power of two. constexpr bool EncodeNoBranch = false; -//constexpr bool DecodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; constexpr size_t MinBlockSizeBits = 15; constexpr size_t MinBlockSize = 1 << MinBlockSizeBits; @@ -42,6 +41,74 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); ////////////////////////////////////////////////////////////////////////// +struct _rans_encode_state_t +{ + uint32_t states[StateCount]; + hist_t hist; + uint16_t *pEnd, *pStart; // both compressed. 
+}; + +enum rans32x32_encoder_type_t +{ + r32x32_et_scalar, +}; + +template +struct rans32x32_16w_encoder +{ + template + static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); +}; + +template <> +struct rans32x32_16w_encoder +{ + template + static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) + { + int64_t targetCmp = targetIndex + StateCount; + + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = pState->hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = pState->states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pState->pStart = (uint16_t)(state & 0xFFFF); + *pState->pStart -= (size_t)write; + state = write ? 
state >> 16 : state; + } + else + { + if (state >= max) + { + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart--; + state >>= 16; + } + } + + pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// + template static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) { @@ -131,58 +198,8 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs ////////////////////////////////////////////////////////////////////////// -struct _rans_encode_state_t -{ - uint32_t states[StateCount]; - hist_t hist; - uint16_t *pEnd, *pStart; // both compressed. -}; - -template -static void rans32x32_16w_encode_internal_scalar(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) -{ - int64_t targetCmp = targetIndex + StateCount; - - constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); - - for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) - { - for (int64_t j = StateCount - 1; j >= 0; j--) - { - const uint8_t index = _Rans32x32_idx2idx[j]; - - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = pState->hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; - - const size_t stateIndex = j; - - uint32_t state = pState->states[stateIndex]; - - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pState->pStart = (uint16_t)(state & 0xFFFF); - *pState->pStart -= (size_t)write; - state = write ? 
state >> 16 : state; - } - else - { - if (state >= max) - { - *pState->pStart = (uint16_t)(state & 0xFFFF); - pState->pStart--; - state >>= 16; - } - } - - pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); - } - } -} - -template -size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) +template +size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { if (outCapacity < block_rANS32x32_16w_capacity(length)) return 0; @@ -263,7 +280,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le while (true) { - rans32x32_16w_encode_internal_scalar(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); inputIndex = inputBlockTargetIndex; // Write hist. 
@@ -338,12 +355,12 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le ////////////////////////////////////////////////////////////////////////// -size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); } -size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<14>(pInData, length, pOutData, outCapacity); } -size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<13>(pInData, length, pOutData, outCapacity); } -size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<12>(pInData, length, pOutData, outCapacity); } -size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<11>(pInData, length, pOutData, outCapacity); } -size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<10>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<15, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<14, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_13(const uint8_t 
*pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<13, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } From 0b60d3210d125979d7145c5160f722e288ba8cd8 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 07:16:09 +0200 Subject: [PATCH 08/34] End not working yet, but generally getting close --- src/block_rANS32x32_16w_decode.cpp | 215 +++++++++++++++++++++++++++++ src/block_rANS32x32_16w_encode.cpp | 11 +- 2 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 src/block_rANS32x32_16w_decode.cpp diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp new file mode 100644 index 0000000..fe2eef1 --- /dev/null +++ b/src/block_rANS32x32_16w_decode.cpp @@ -0,0 +1,215 @@ +#include "block_rANS32x32_16w.h" + +#include "hist.h" +#include "simd_platform.h" + 
+#include +#include + +constexpr size_t StateCount = 32; // Needs to be a power of two. +constexpr bool DecodeNoBranch = false; + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + +template +struct _rans_decode_state_t +{ + uint32_t states[StateCount]; + hist_type hist; +}; + +enum rans32x32_decoder_type_t +{ + r32x32_dt_scalar, +}; + +template +struct rans32x32_16w_decoder +{ + static const uint16_t *decode_section(_rans_decode_state_t *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); +}; + +template +struct rans32x32_16w_decoder> +{ + static const uint16_t *decode_section(_rans_decode_state_t> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t i = startIndex; + + for (; i < endIndex; i += StateCount) + { + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + uint32_t state = pState->states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = pState->hist.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pReadHead; + state = read ? 
newState : state; + pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pReadHead; + pReadHead++; + } + } + + pState->states[j] = state; + } + } + + return pReadHead; + } +}; + +////////////////////////////////////////////////////////////////////////// + +template +static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + (void)totalSymbolCountBits; + + memcpy(pDecHist, pIncompleteHist, sizeof(hist_t)); + + return inplace_make_hist_dec(pDecHist); +} + +////////////////////////////////////////////////////////////////////////// + +template +size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t inputIndex = 0; + const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (expectedOutputLength > outCapacity) + return 0; + + const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (inLength < expectedInputLength) + return 0; + + _rans_decode_state_t decodeState; + + for (size_t i = 0; i < StateCount; i++) + { + decodeState.states[i] = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint32_t); + } + + const uint16_t *pReadHead = reinterpret_cast(pInData + inputIndex); + const size_t outLengthInStates = expectedOutputLength - StateCount + 1; + size_t i = 0; + hist_t hist; + + do + { + const uint64_t blockSize = *reinterpret_cast(pReadHead); + pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *pReadHead; + pReadHead++; + } + + if 
(!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; + + uint64_t blockEndInStates = (i + blockSize); + + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + pReadHead = rans32x32_16w_decoder::decode_section(&decodeState, pReadHead, pOutData, i, blockEndInStates); + + i = blockEndInStates; + + if (i + StateCount > outLengthInStates) + break; + } + while (i < outLengthInStates); + + if (i < expectedOutputLength) + { + hist_dec_t histDec; + memcpy(&histDec, &hist, sizeof(hist)); + + if (!inplace_make_hist_dec(&histDec)) + return 0; + + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + if (i + index < expectedOutputLength) + { + uint32_t state = decodeState.states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = histDec.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pReadHead; + state = read ? 
newState : state; + pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pReadHead; + pReadHead++; + } + } + + decodeState.states[j] = state; + } + } + } + + return expectedOutputLength; +} + +////////////////////////////////////////////////////////////////////////// + +size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<15, r32x32_dt_scalar, hist_dec_t<15>>(pInData, inLength, pOutData, outCapacity); } +size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<14, r32x32_dt_scalar, hist_dec_t<14>>(pInData, inLength, pOutData, outCapacity); } +size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<13, r32x32_dt_scalar, hist_dec_t<13>>(pInData, inLength, pOutData, outCapacity); } +size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<12, r32x32_dt_scalar, hist_dec_t<12>>(pInData, inLength, pOutData, outCapacity); } +size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<11, r32x32_dt_scalar, hist_dec_t<11>>(pInData, inLength, pOutData, outCapacity); } +size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<10, r32x32_dt_scalar, hist_dec_t<10>>(pInData, inLength, pOutData, outCapacity); } diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index 62cda4f..911b847 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ 
b/src/block_rANS32x32_16w_encode.cpp @@ -241,7 +241,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - blockBackPoint = inputBlockTargetIndex; + blockBackPoint = length; // Init States. for (size_t i = 0; i < StateCount; i++) @@ -324,7 +324,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - blockBackPoint = inputBlockTargetIndex; + blockBackPoint = inputIndex; } } @@ -361,10 +361,3 @@ size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } - -size_t block_rANS32x32_16w_decode_15(const uint8_t 
*pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } -size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } -size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } -size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } -size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } -size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; } From 6ba49d9ca5d0d372ebaba97b5f0277811ce438b9 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 18:04:06 +0200 Subject: [PATCH 09/34] decoder fixed --- src/block_rANS32x32_16w_decode.cpp | 52 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index fe2eef1..94c25ca 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -21,6 +21,7 @@ struct _rans_decode_state_t { uint32_t states[StateCount]; hist_type hist; + const uint16_t *pReadHead; }; enum rans32x32_decoder_type_t @@ -31,13 +32,13 @@ enum rans32x32_decoder_type_t template struct rans32x32_16w_decoder { - static const uint16_t 
*decode_section(_rans_decode_state_t *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); + static size_t decode_section(_rans_decode_state_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); }; template struct rans32x32_16w_decoder> { - static const uint16_t *decode_section(_rans_decode_state_t> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); @@ -59,16 +60,16 @@ struct rans32x32_16w_decoderpReadHead; state = read ? newState : state; - pReadHead += (size_t)read; + pState->pReadHead += (size_t)read; } else { if (state < DecodeConsumePoint16) { - state = state << 16 | *pReadHead; - pReadHead++; + state = state << 16 | *pState->pReadHead; + pState->pReadHead++; } } @@ -76,7 +77,7 @@ struct rans32x32_16w_decoder *pDecHist, hist_t * { (void)totalSymbolCountBits; - memcpy(pDecHist, pIncompleteHist, sizeof(hist_t)); + memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); return inplace_make_hist_dec(pDecHist); } @@ -124,20 +125,20 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, inputIndex += sizeof(uint32_t); } - const uint16_t *pReadHead = reinterpret_cast(pInData + inputIndex); + decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; hist_t hist; do { - const uint64_t blockSize = *reinterpret_cast(pReadHead); - pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); for (size_t j = 0; j < 256; j++) { - hist.symbolCount[j] = 
*pReadHead; - pReadHead++; + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; } if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) @@ -150,19 +151,22 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, else if ((blockEndInStates & (StateCount - 1)) != 0) return 0; - pReadHead = rans32x32_16w_decoder::decode_section(&decodeState, pReadHead, pOutData, i, blockEndInStates); + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); - i = blockEndInStates; + if (i > outLengthInStates) + { + if (i >= expectedOutputLength) + return expectedOutputLength; + else + break; + } - if (i + StateCount > outLengthInStates) - break; - } - while (i < outLengthInStates); + } while (i < outLengthInStates); if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec, &hist, sizeof(hist)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); if (!inplace_make_hist_dec(&histDec)) return 0; @@ -184,16 +188,16 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, if constexpr (DecodeNoBranch) { const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *pReadHead; + const uint32_t newState = state << 16 | *decodeState.pReadHead; state = read ? 
newState : state; - pReadHead += (size_t)read; + decodeState.pReadHead += (size_t)read; } else { if (state < DecodeConsumePoint16) { - state = state << 16 | *pReadHead; - pReadHead++; + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; } } From b7c38a7dd5a2b8c868ae48e6d81da129fbed4b77 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 18:32:23 +0200 Subject: [PATCH 10/34] SIMD for 13-15 --- src/block_rANS32x32_16w_decode.cpp | 313 ++++++++++++++++++++++++++++- src/main.cpp | 12 +- 2 files changed, 313 insertions(+), 12 deletions(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index 94c25ca..5ce9784 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -16,6 +16,12 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); ////////////////////////////////////////////////////////////////////////// +extern const uint8_t _ShuffleLutShfl32[256 * 8]; +extern const uint8_t _ShuffleLutPerm32[256 * 8]; +extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; + +////////////////////////////////////////////////////////////////////////// + template struct _rans_decode_state_t { @@ -27,6 +33,7 @@ struct _rans_decode_state_t enum rans32x32_decoder_type_t { r32x32_dt_scalar, + r32x32_dt_avx2_large_cache_15_to_13, }; template @@ -81,6 +88,249 @@ struct rans32x32_16w_decoder +static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + 
statesX8[i] = _mm256_loadu_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm256_set1_epi32(0xFFFF); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const simd_t _1 = _mm256_set1_epi32(1); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // const uint8_t symbol = pHist->cumulInv[slot]; + simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); + simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); + simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); + simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); + + // since they were int32_t turn into uint8_t + symbol0 = _mm256_and_si256(symbol0, lower8); + symbol1 = _mm256_and_si256(symbol1, lower8); + symbol2 = _mm256_and_si256(symbol2, lower8); + symbol3 = _mm256_and_si256(symbol3, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // retrieve pack. + const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); + + // freq, cumul. 
+ const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); + const simd_t freq0 = _mm256_and_si256(pack0, lower16); + const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); + const simd_t freq1 = _mm256_and_si256(pack1, lower16); + const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); + const simd_t freq2 = _mm256_and_si256(pack2, lower16); + const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); + const simd_t freq3 = _mm256_and_si256(pack3, lower16); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0, 1. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? 
-1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_storeu_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + ////////////////////////////////////////////////////////////////////////// template @@ -93,6 +343,17 @@ static bool _init_from_hist(hist_dec_t *pDecHist, hist_t * return inplace_make_hist_dec(pDecHist); } +template +static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec2_hist(pDecHist, pIncompleteHist); + + return true; +} + ////////////////////////////////////////////////////////////////////////// template @@ -211,9 +472,49 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ////////////////////////////////////////////////////////////////////////// -size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<15, r32x32_dt_scalar, hist_dec_t<15>>(pInData, 
inLength, pOutData, outCapacity); } -size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<14, r32x32_dt_scalar, hist_dec_t<14>>(pInData, inLength, pOutData, outCapacity); } -size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<13, r32x32_dt_scalar, hist_dec_t<13>>(pInData, inLength, pOutData, outCapacity); } -size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<12, r32x32_dt_scalar, hist_dec_t<12>>(pInData, inLength, pOutData, outCapacity); } -size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<11, r32x32_dt_scalar, hist_dec_t<11>>(pInData, inLength, pOutData, outCapacity); } -size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<10, r32x32_dt_scalar, hist_dec_t<10>>(pInData, inLength, pOutData, outCapacity); } +template +static size_t block_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + _DetectCPUFeatures(); + + if (avx2Supported) + { + if constexpr (TotalSymbolCountBits >= 13) + return block_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); + } + + // Fallback. 
+ return block_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); +} + +////////////////////////////////////////////////////////////////////////// + +size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity); +} + +size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity); +} + +size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity); +} + +size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity); +} + +size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity); +} + +size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return block_rANS32x32_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity); +} diff --git a/src/main.cpp b/src/main.cpp index 15a9075..d076555 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8 static codec_info_t _Codecs[] = { - { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, 
{}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, 
{ "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, From 8e958b6dc6678922cf6f1bd5da90f00a0c533e48 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 18:46:09 +0200 Subject: [PATCH 11/34] improving clang compat --- src/block_rANS32x32_16w_decode.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index 5ce9784..f97e1c3 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -34,6 +34,9 @@ enum rans32x32_decoder_type_t { r32x32_dt_scalar, r32x32_dt_avx2_large_cache_15_to_13, + r32x32_dt_avx2_small_cache_15_to_13, + r32x32_dt_avx2_large_cache_12_to_10, + r32x32_dt_avx2_small_cache_12_to_10, }; template @@ -89,6 +92,9 @@ struct rans32x32_16w_decoder +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { if constexpr (!WriteAligned32) @@ -110,7 +116,6 @@ static size_t 
_block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + ////////////////////////////////////////////////////////////////////////// template From 04d52d9fc147ca468bbbd94c75474a139ab8f0fe Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 19:05:34 +0200 Subject: [PATCH 12/34] adding 10-12 --- src/block_rANS32x32_16w_decode.cpp | 264 +++++++++++++++++++++++++++++ src/main.cpp | 49 +++++- 2 files changed, 312 insertions(+), 1 deletion(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index f97e1c3..fe5a8be 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -102,6 +102,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t(pState, pOutData, startIndex, endIndex); constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); typedef __m256i simd_t; simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; @@ -326,6 +327,236 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + + static_assert(TotalSymbolCountBits <= 12); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / 
sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_loadu_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // retrieve pack. + const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot3, sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // unpack symbol. 
+ const simd_t symbol0 = _mm256_and_si256(pack0, lower8); + const simd_t symbol1 = _mm256_and_si256(pack1, lower8); + const simd_t symbol2 = _mm256_and_si256(pack2, lower8); + const simd_t symbol3 = _mm256_and_si256(pack3, lower8); + + // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // unpack freq, cumul. 
+ const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm256_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm256_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm256_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm256_srli_epi32(pack3, 20); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0, 1. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. 
+ const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. + lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_storeu_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + template struct rans32x32_16w_decoder> { @@ -346,6 +577,26 @@ struct rans32x32_16w_decoder +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + ////////////////////////////////////////////////////////////////////////// template @@ -369,6 +620,17 @@ static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t return true; } +template +static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec_pack_hist(pDecHist, pIncompleteHist); + + return true; +} + ////////////////////////////////////////////////////////////////////////// template @@ -496,6 +758,8 @@ static size_t 
block_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_ { if constexpr (TotalSymbolCountBits >= 13) return block_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); + else + return block_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); } // Fallback. diff --git a/src/main.cpp b/src/main.cpp index d076555..80650d0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,6 +51,8 @@ static size_t _HistMin = 10; static bool _Include32Block = false; static bool _IncludeRaw = false; static size_t _RunCount = 8; +static size_t _EncodeRunCount = 2; +static size_t _DecodeRunCount = 16; constexpr size_t MaxRunCount = 256; static uint64_t _ClocksPerRun[MaxRunCount]; @@ -202,6 +204,8 @@ const char ArgumentIncludeRaw[] = "--include-raw"; const char ArgumentNoSleep[] = "--no-sleep"; const char ArgumentCpuCore[] = "--cpu-core"; const char ArgumentRuns[] = "--runs"; +const char ArgumentRunsEncode[] = "--runs-enc"; +const char ArgumentRunsDecode[] = "--runs-dec"; ////////////////////////////////////////////////////////////////////////// @@ -217,7 +221,10 @@ int32_t main(const int32_t argc, char **pArgv) printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore); printf("\t%s \tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw); printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw); - printf("\t%s \tRun the benchmark for a specified amount of times (default: 8)\n", ArgumentNoSleep); + printf("\t%s \tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode); + printf("\t%s \tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode); + printf("\t%s \tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode); + printf("\t%s 
\tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep); return 1; } @@ -270,6 +277,44 @@ int32_t main(const int32_t argc, char **pArgv) _DisableSleep = true; } + _EncodeRunCount = _DecodeRunCount = _RunCount; + + argIndex += 2; + argsRemaining -= 2; + } + else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRunsEncode, sizeof(ArgumentRunsEncode)) == 0) + { + _EncodeRunCount = strtoull(pArgv[argIndex + 1], nullptr, 10); + + if (_EncodeRunCount > MaxRunCount) + { + puts("Invalid Parameter."); + return 1; + } + else if (_EncodeRunCount == 0) + { + _EncodeRunCount = 1; + _DisableSleep = true; + } + + argIndex += 2; + argsRemaining -= 2; + } + else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRunsDecode, sizeof(ArgumentRunsDecode)) == 0) + { + _DecodeRunCount = strtoull(pArgv[argIndex + 1], nullptr, 10); + + if (_DecodeRunCount > MaxRunCount) + { + puts("Invalid Parameter."); + return 1; + } + else if (_DecodeRunCount == 0) + { + _DecodeRunCount = 1; + _DisableSleep = true; + } + argIndex += 2; argsRemaining -= 2; } @@ -445,6 +490,7 @@ int32_t main(const int32_t argc, char **pArgv) printf("%-37s %2" PRIu32 " | -------- | ---------------- | ------------------------------------ | -------------- | ------------------------------------\n", _Codecs[codecId].name, _Codecs[codecId].totalSymbolCountBits); size_t encodedSize = 0; + _RunCount = _EncodeRunCount; for (size_t i = 0; i < MaxEncoderCount; i++) { @@ -510,6 +556,7 @@ int32_t main(const int32_t argc, char **pArgv) } size_t decodedSize = 0; + _RunCount = _DecodeRunCount; for (size_t i = 0; i < MaxDecoderCount; i++) { From 9c11b44924db9bb87657979d4541df75b3cce91f Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 22:20:29 +0200 Subject: [PATCH 13/34] Trying to choose good cutoff values --- src/block_rANS32x32_16w_decode.cpp | 6 +++ src/block_rANS32x32_16w_encode.cpp | 63 ++++++++++++++++++------------ 2 files changed, 43 insertions(+), 26 
deletions(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index fe5a8be..ab59d85 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -25,7 +25,13 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; template struct _rans_decode_state_t { +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif uint32_t states[StateCount]; + hist_type hist; const uint16_t *pReadHead; }; diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index 911b847..04e8306 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -9,8 +9,9 @@ constexpr size_t StateCount = 32; // Needs to be a power of two. constexpr bool EncodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; -constexpr size_t MinBlockSizeBits = 15; -constexpr size_t MinBlockSize = 1 << MinBlockSizeBits; + +constexpr size_t MinMinBlockSizeBits = 15; +constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits; template struct HistReplaceMul @@ -25,10 +26,29 @@ template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { ret template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } }; template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } }; +template +struct MinBlockSizeBits +{ + constexpr static size_t GetValue(); +}; + +template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } }; +template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } }; +template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } }; +template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } }; +template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } }; +template <> struct MinBlockSizeBits<10> { constexpr 
static size_t GetValue() { return 20; } }; + +template +constexpr size_t MinBlockSize() +{ + return (size_t)1 << MinBlockSizeBits::GetValue(); +} + size_t block_rANS32x32_16w_capacity(const size_t inputSize) { const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); - const size_t blockCount = (inputSize + MinBlockSize) / MinBlockSize + 1; + const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1; const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t); return baseSize + blockCount * perBlockExtraSize; // inputIndex hope this covers all of our bases. @@ -127,7 +147,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs hist_t newHist; - if constexpr (!IsSafeHist && TotalSymbolCountBits == MinBlockSizeBits) + if constexpr (TotalSymbolCountBits == MinBlockSize()) { for (size_t j = 0; j < 256; j++) newHist.symbolCount[j] = (uint16_t)symCount[j]; @@ -142,17 +162,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs } else { - if constexpr (IsSafeHist) - { - for (size_t j = 0; j < 256; j++) - symCount[j]++; - - normalize_hist(&newHist, symCount, MinBlockSize + 256, TotalSymbolCountBits); - } - else - { - normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits); - } + normalize_hist(&newHist, symCount, MinBlockSize(), TotalSymbolCountBits); } constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); @@ -170,7 +180,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs continue; const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); - const float after = (symCount[j] - 1) * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); costBefore -= before; costAfter -= after; @@ -208,15 +218,16 @@ size_t 
block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + constexpr size_t MinBlockSizeX = MinBlockSize(); _rans_encode_state_t encodeState; encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); encodeState.pStart = encodeState.pEnd; - size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1)); + size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1)); - if (inputBlockTargetIndex > MinBlockSize) - inputBlockTargetIndex -= MinBlockSize; + if (inputBlockTargetIndex > MinBlockSizeX) + inputBlockTargetIndex -= MinBlockSizeX; size_t blockBackPoint = length; @@ -232,8 +243,8 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u while (inputBlockTargetIndex > 0) { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSize; + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; else break; } @@ -302,21 +313,21 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u // Determine new histogram. 
{ - inputBlockTargetIndex -= MinBlockSize; + inputBlockTargetIndex -= MinBlockSizeX; - observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSize); + observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); if constexpr (IsSafeHist) for (size_t j = 0; j < 256; j++) if (symCount[j] == 0) symCount[j] = 1; - normalize_hist(&encodeState.hist, symCount, MinBlockSize, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); while (inputBlockTargetIndex > 0) { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSize; + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; else break; } From d4ca35afd0bcea34ba45dbef5e73b95cc781b5d0 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Wed, 5 Jul 2023 23:19:06 +0200 Subject: [PATCH 14/34] Fixing issue with histogram creation --- src/block_rANS32x32_16w_decode.cpp | 4 ++-- src/block_rANS32x32_16w_encode.cpp | 11 ++++++++++- src/hist.cpp | 7 ++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index ab59d85..a616441 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -114,7 +114,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); size_t i = startIndex; @@ -328,7 +328,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); return i; } diff 
--git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index 04e8306..5a53a10 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -234,12 +234,21 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u uint32_t symCount[256]; observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + size_t extraCount = 0; + if constexpr (IsSafeHist) + { for (size_t j = 0; j < 256; j++) + { if (symCount[j] == 0) + { symCount[j] = 1; + extraCount++; + } + } + } - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); while (inputBlockTargetIndex > 0) { diff --git a/src/hist.cpp b/src/hist.cpp index 4124ef7..1c2179a 100644 --- a/src/hist.cpp +++ b/src/hist.cpp @@ -154,7 +154,7 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy capped[i]++; cappedSum++; - if (cappedSum == totalSymbolCount + 1) + if (cappedSum == totalSymbolCount) goto hist_ready; } } @@ -173,6 +173,11 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy pHist->symbolCount[i] = capped[i]; counter += capped[i]; } + +#if defined(_DEBUG) && defined(_MSC_VER) + if (counter != totalSymbolCount) + __debugbreak(); +#endif } void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits) From a0c04e4b95a2c9f4de074ab2c3dbc4e98187f36e Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Thu, 6 Jul 2023 00:39:35 +0200 Subject: [PATCH 15/34] 32x64 block based (no tweaking yet) --- src/block_rANS32x32_16w_decode.cpp | 24 +- src/block_rANS32x32_16w_encode.cpp | 8 +- src/block_rANS32x64_16w.h | 22 + src/block_rANS32x64_16w_decode.cpp | 1860 ++++++++++++++++++++++++++++ src/block_rANS32x64_16w_encode.cpp | 390 ++++++ 
src/main.cpp | 24 +- 6 files changed, 2305 insertions(+), 23 deletions(-) create mode 100644 src/block_rANS32x64_16w.h create mode 100644 src/block_rANS32x64_16w_decode.cpp create mode 100644 src/block_rANS32x64_16w_encode.cpp diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index a616441..dc4b341 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -23,7 +23,7 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; ////////////////////////////////////////////////////////////////////////// template -struct _rans_decode_state_t +struct _rans_decode_state32_t { #ifdef _MSC_VER __declspec(align(32)) @@ -48,13 +48,13 @@ enum rans32x32_decoder_type_t template struct rans32x32_16w_decoder { - static size_t decode_section(_rans_decode_state_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); + static size_t decode_section(_rans_decode_state32_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); }; template struct rans32x32_16w_decoder> { - static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); @@ -101,7 +101,7 @@ template > *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { if constexpr (!WriteAligned32) if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) @@ -337,7 +337,7 @@ template > *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t> *pState, uint8_t *pOutData, const 
size_t startIndex, const size_t endIndex) { if constexpr (!WriteAligned32) if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) @@ -350,7 +350,7 @@ static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); size_t i = startIndex; @@ -558,7 +558,7 @@ static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); return i; } @@ -567,7 +567,7 @@ template struct rans32x32_16w_decoder> { template - static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); } @@ -577,7 +577,7 @@ template struct rans32x32_16w_decoder> { template - static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); } @@ -587,7 +587,7 @@ template struct rans32x32_16w_decoder> { template - static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); } @@ -597,7 +597,7 @@ 
template struct rans32x32_16w_decoder> { template - static size_t decode_section(_rans_decode_state_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) { return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); } @@ -661,7 +661,7 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, if (inLength < expectedInputLength) return 0; - _rans_decode_state_t decodeState; + _rans_decode_state32_t decodeState; for (size_t i = 0; i < StateCount; i++) { diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index 5a53a10..ec85a1d 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -61,7 +61,7 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); ////////////////////////////////////////////////////////////////////////// -struct _rans_encode_state_t +struct _rans_encode_state32_t { uint32_t states[StateCount]; hist_t hist; @@ -77,14 +77,14 @@ template struct rans32x32_16w_encoder { template - static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); + static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); }; template <> struct rans32x32_16w_encoder { template - static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) + static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) { int64_t targetCmp = targetIndex + StateCount; @@ -220,7 +220,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u constexpr bool IsSafeHist = TotalSymbolCountBits >= 
SafeHistBitMax; constexpr size_t MinBlockSizeX = MinBlockSize(); - _rans_encode_state_t encodeState; + _rans_encode_state32_t encodeState; encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); encodeState.pStart = encodeState.pEnd; diff --git a/src/block_rANS32x64_16w.h b/src/block_rANS32x64_16w.h new file mode 100644 index 0000000..a06166e --- /dev/null +++ b/src/block_rANS32x64_16w.h @@ -0,0 +1,22 @@ +#ifndef block_rANS32x64_16w_h__ +#define block_rANS32x64_16w_h__ + +#include "hist.h" + +size_t block_rANS32x64_16w_capacity(const size_t inputSize); + +size_t block_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); + +size_t block_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t block_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const 
size_t outCapacity); +size_t block_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); + +#endif // block_rANS32x64_16w_h__ diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp new file mode 100644 index 0000000..1e7b4c8 --- /dev/null +++ b/src/block_rANS32x64_16w_decode.cpp @@ -0,0 +1,1860 @@ +#include "block_rANS32x64_16w.h" + +#include "hist.h" +#include "simd_platform.h" + +#include +#include + +constexpr size_t StateCount = 64; // Needs to be a power of two. +constexpr bool DecodeNoBranch = false; + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x64_idx2idx[] = +{ + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37, + 0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F, +}; + +static_assert(sizeof(_Rans32x64_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + +extern const uint8_t _ShuffleLutShfl32[256 * 8]; +extern const uint8_t _ShuffleLutPerm32[256 * 8]; +extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; + +////////////////////////////////////////////////////////////////////////// + +template +struct _rans_decode_state64_t +{ +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + uint32_t states[StateCount]; + + hist_type hist; + const uint16_t *pReadHead; +}; + +enum rans32x64_decoder_type_t +{ + r32x64_dt_scalar, + r32x64_dt_avx2_large_cache_15_to_13, + r32x64_dt_avx2_small_cache_15_to_13, + r32x64_dt_avx2_large_cache_12_to_10, + r32x64_dt_avx2_small_cache_12_to_10, + r32x64_dt_avx512_large_cache_15_to_13, + 
r32x64_dt_avx512_small_cache_15_to_13, + r32x64_dt_avx512_large_cache_12_to_10, + r32x64_dt_avx512_small_cache_12_to_10, +}; + +template +struct rans32x64_16w_decoder +{ + static size_t decode_section(_rans_decode_state64_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); +}; + +template +struct rans32x64_16w_decoder> +{ + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t i = startIndex; + + for (; i < endIndex; i += StateCount) + { + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + uint32_t state = pState->states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = pState->hist.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pState->pReadHead; + state = read ? 
newState : state; + pState->pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pState->pReadHead; + pState->pReadHead++; + } + } + + pState->states[j] = state; + } + } + + return i; + } +}; + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm256_set1_epi32(0xFFFF); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); + const simd_t slot5 = 
_mm256_and_si256(statesX8[5], symCountMask); + const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); + const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); + + // const uint8_t symbol = pHist->cumulInv[slot]; + simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); + simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); + simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); + simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); + simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot4, sizeof(uint8_t)); + simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot5, sizeof(uint8_t)); + simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot6, sizeof(uint8_t)); + simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot7, sizeof(uint8_t)); + + // since they were int32_t turn into uint8_t + symbol0 = _mm256_and_si256(symbol0, lower8); + symbol1 = _mm256_and_si256(symbol1, lower8); + symbol2 = _mm256_and_si256(symbol2, lower8); + symbol3 = _mm256_and_si256(symbol3, lower8); + symbol4 = _mm256_and_si256(symbol4, lower8); + symbol5 = _mm256_and_si256(symbol5, lower8); + symbol6 = _mm256_and_si256(symbol6, lower8); + symbol7 = _mm256_and_si256(symbol7, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); + const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); + const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. + + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + + // retrieve pack. 
+ const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); + const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol4, sizeof(uint32_t)); + const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol5, sizeof(uint32_t)); + const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol6, sizeof(uint32_t)); + const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol7, sizeof(uint32_t)); + + // freq, cumul. + const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); + const simd_t freq0 = _mm256_and_si256(pack0, lower16); + const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); + const simd_t freq1 = _mm256_and_si256(pack1, lower16); + const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); + const simd_t freq2 = _mm256_and_si256(pack2, lower16); + const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); + const simd_t freq3 = _mm256_and_si256(pack3, lower16); + const simd_t cumul4 = _mm256_srli_epi32(pack4, 16); + const simd_t freq4 = _mm256_and_si256(pack4, lower16); + const simd_t cumul5 = _mm256_srli_epi32(pack5, 16); + const simd_t freq5 = _mm256_and_si256(pack5, lower16); + const simd_t cumul6 = _mm256_srli_epi32(pack6, 16); + const simd_t freq6 = _mm256_and_si256(pack6, lower16); + const simd_t cumul7 = _mm256_srli_epi32(pack7, 16); + const simd_t freq7 = _mm256_and_si256(pack7, lower16); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t 
shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); + const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); + const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); + const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); + const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); + const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); + const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); + const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); + const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); + const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); + + // now to the messy part... 
+ { + // read input for blocks 0. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); + const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); + const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); + const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. 
+ + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = 
(uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. 
+ + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + 
pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // finalize lookups. + lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); + lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); + lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); + lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. 
+ const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + /* _block_rans32x64_decode_section_avx2_varC: AVX2 decode of the output range [startIndex, endIndex) in steps of StateCount. Loads the 256-bit state vectors statesX8 from pState->states (indices 0..7 are used below) and, per step: gathers one 32-bit pack per lane from pState->hist.symbol (read here as bits 0-7 symbol, bits 8-19 cumul, bits 20-31 freq), writes the packed symbol bytes to pOutData + i and pOutData + i + StateCount / 2, updates the states, and refills lanes that fall below DecodeConsumePoint16 with 16-bit words taken from pState->pReadHead. The state vectors are stored back to pState->states before returning. Returns i, the first index that was not processed (i >= endIndex). NOTE(review): the template parameter list and several cast target types are not visible in this view, so the roles of TotalSymbolCountBits, WriteAligned32 and ShuffleMask16 are inferred from their uses below - confirm against the original file. */ +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, 
const size_t endIndex) +{ + /* If this instantiation does not assume aligned writes but pOutData happens to be 32-byte aligned, forward to another instantiation of this function (presumably the WriteAligned32 one; the template arguments are not visible here - TODO confirm). */ if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) + return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); + const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask); + const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); + const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); + + // retrieve pack. /* One 32-bit gather per lane from pState->hist.symbol, indexed by slot with a scale of sizeof(uint32_t). */ 
+ const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot3, sizeof(uint32_t)); + const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot4, sizeof(uint32_t)); + const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot5, sizeof(uint32_t)); + const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot6, sizeof(uint32_t)); + const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot7, sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); + const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); + const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); + const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); + + // unpack symbol. 
+ const simd_t symbol0 = _mm256_and_si256(pack0, lower8); + const simd_t symbol1 = _mm256_and_si256(pack1, lower8); + const simd_t symbol2 = _mm256_and_si256(pack2, lower8); + const simd_t symbol3 = _mm256_and_si256(pack3, lower8); + const simd_t symbol4 = _mm256_and_si256(pack4, lower8); + const simd_t symbol5 = _mm256_and_si256(pack5, lower8); + const simd_t symbol6 = _mm256_and_si256(pack6, lower8); + const simd_t symbol7 = _mm256_and_si256(pack7, lower8); + + // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); + const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); + const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. + + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + + // unpack freq, cumul. /* cumul is taken from bits 8-19 of the pack (shift right by 8, mask with lower12); freq is the remaining top 12 bits (shift right by 20). */ 
+ const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm256_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm256_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm256_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm256_srli_epi32(pack3, 20); + const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12); + const simd_t freq4 = _mm256_srli_epi32(pack4, 20); + const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12); + const simd_t freq5 = _mm256_srli_epi32(pack5, 20); + const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12); + const simd_t freq6 = _mm256_srli_epi32(pack6, 20); + const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12); + const simd_t freq7 = _mm256_srli_epi32(pack7, 20); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); + const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); + const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); + const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, 
cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); + const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); + const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); + const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); + + // now to the messy part... + { + // read input for blocks 0. /* Always loads 16 bytes from the read head, independent of how many words this vector ends up consuming. NOTE(review): this needs readable bytes past the last consumed word - confirm the input buffer guarantees that. */ + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 /* _mm256_cmpgt_epi32 compares signed 32-bit lanes. NOTE(review): this only matches an unsigned comparison while states stay below 2^31 - confirm against the encoder's state bounds. */ + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); + const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); + const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); + const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. /* movemask_ps collects the sign bit of each 32-bit lane into an 8-bit mask; the mask selects a 16-byte table entry (index shifted left by 4, aligned load) that is used directly as the _mm_shuffle_epi8 control below. */ + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. 
+ + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. /* Each vector advances the read head by popcount(mask) elements, so the load for every later vector depends on the masks of all earlier ones; the statement order here matters. */ 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // matching: state << 16 /* cmp is all-ones in lanes that consume a word, so (cmp & 16) gives a per-lane shift count of 16 there and 0 elsewhere; sllv therefore shifts only the consuming lanes. */ + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const 
simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + else + { + // get masks of those compares & start loading shuffle masks. /* This path indexes a table with 8 elements per mask (index shifted left by 3), but _mm_lddqu_si128 still loads 16 bytes; only bytes 0-7 are referenced by shuffleDoubleMask further down. NOTE(review): assuming byte-sized elements, the load for the last mask reads past its entry - confirm the table is padded. */ 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // finalize lookups. /* shuffleDoubleMask duplicates each of the low 8 bytes of the loaded entry into a byte pair, and OR-ing 0x0100 per 16-bit lane sets bit 0 in the upper byte of each pair, turning every table byte b into the control pair (b, b | 1). Presumably the table bytes are even byte offsets so each pair selects one 16-bit word - TODO confirm. */ 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); + lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); + lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); + lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. 
+ const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + } + } + /* Store the updated state vectors back to pState->states (aligned store). */ + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +#ifdef __llvm__ +__attribute__((target("avx512bw"))) +#else +__attribute__((target("avx512f", "avx512bw"))) +#endif +#endif +static size_t 
_block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned64) + if ((reinterpret_cast(pOutData) & (64 - 1)) == 0) + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m512i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm512_set1_epi32(0xFFFF); + const simd_t lower8 = _mm512_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); + const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); + const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); + const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); + const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); + + // retrieve pack. 
+ simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); + + symbol0 = _mm512_and_si512(symbol0, lower8); + symbol1 = _mm512_and_si512(symbol1, lower8); + symbol2 = _mm512_and_si512(symbol2, lower8); + symbol3 = _mm512_and_si512(symbol3, lower8); + + // retrieve pack. + const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + + // pack symbols to one si512. + const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. + const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. + + // freq, cumul. 
+ const simd_t cumul0 = _mm512_srli_epi32(pack0, 16); + const simd_t freq0 = _mm512_and_si512(pack0, lower16); + const simd_t cumul1 = _mm512_srli_epi32(pack1, 16); + const simd_t freq1 = _mm512_and_si512(pack1, lower16); + const simd_t cumul2 = _mm512_srli_epi32(pack2, 16); + const simd_t freq2 = _mm512_and_si512(pack2, lower16); + const simd_t cumul3 = _mm512_srli_epi32(pack3, 16); + const simd_t freq3 = _mm512_and_si512(pack3, lower16); + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned64) + _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); + else + _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); + + // const uint32_t freqScaled = shiftedState * freq; + const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); + const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); + const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); + const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0. + const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? 
-1 : 0 + const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); + const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); + const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); + const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. + __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. + __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. + __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. + __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. 
+ const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. + const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. + __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. + __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. + __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. + __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // finalize lookups. 
+ lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); + lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); + lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); + lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); + lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); + lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); + lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); + lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. + const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +#ifdef __llvm__ +__attribute__((target("avx512bw"))) +#else +__attribute__((target("avx512f", "avx512bw"))) +#endif +#endif +static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned64) + if ((reinterpret_cast(pOutData) & (64 - 1)) == 0) // all six low address bits must be clear: the WriteAligned64 path below stores with _mm512_stream_si512, which needs a 64 byte aligned address (the previous mask of 62 ignored bit 0). + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m512i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm512_set1_epi32(0xFF); + const simd_t
decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); + const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); + const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); + const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); + const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); + + // retrieve pack. + const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // unpack symbol. + const simd_t symbol0 = _mm512_and_si512(pack0, lower8); + const simd_t symbol1 = _mm512_and_si512(pack1, lower8); + const simd_t symbol2 = _mm512_and_si512(pack2, lower8); + const simd_t symbol3 = _mm512_and_si512(pack3, lower8); + + // pack symbols to one si512. 
+ const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. + const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. + + // unpack freq, cumul. + const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm512_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm512_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm512_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm512_srli_epi32(pack3, 20); + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned64) + _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); + else + _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); + + // const uint32_t freqScaled = shiftedState * freq; + const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); + const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); + const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); + const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); + + // now to the messy part... 
+ { + // read input for blocks 0. + const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); + const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); + const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); + const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. + __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. + __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. + __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. + __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. 
+ const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. + const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. + __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. + __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. + __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. + __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // finalize lookups. 
+ lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); + lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); + lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); + lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); + lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); + lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); + lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); + lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. + const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return 
_block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + } +}; + +////////////////////////////////////////////////////////////////////////// + +template +static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) // hist_dec_t overload: fills pDecHist from the symbol counts of pIncompleteHist and returns what inplace_make_hist_dec reports. +{ + (void)totalSymbolCountBits; // not needed by this overload; the cast only silences the unused parameter warning. + + memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); // only the symbol counts are taken over; the remaining fields are left to inplace_make_hist_dec. + + return inplace_make_hist_dec(pDecHist); +} + +template +static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) // hist_dec2_t overload: completes pIncompleteHist in place (it is modified), then derives pDecHist from it. +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; // completing the histogram failed, pDecHist has not been written. + + make_dec2_hist(pDecHist, pIncompleteHist); + + return true; +} + +template +static bool _init_from_hist(hist_dec_pack_t
*pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
+    return false;
+
+  make_dec_pack_hist(pDecHist, pIncompleteHist);
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Decodes a block-wise rANS stream from pInData into pOutData; returns the decoded length, or 0 if the header, a histogram or a block boundary is rejected.
+template
+size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) // two u64 lengths + initial states + one histogram
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  size_t inputIndex = 0;
+  const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); // header field 1: decoded size
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); // header field 2: total encoded size
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  _rans_decode_state64_t decodeState;
+
+  for (size_t i = 0; i < StateCount; i++)
+  {
+    decodeState.states[i] = *reinterpret_cast(pInData + inputIndex);
+    inputIndex += sizeof(uint32_t);
+  }
+
+  decodeState.pReadHead = reinterpret_cast(pInData + inputIndex);
+  const size_t outLengthInStates = (expectedOutputLength >= StateCount) ? (size_t)(expectedOutputLength - StateCount + 1) : 0; // clamp: with fewer than StateCount - 1 symbols the subtraction used to wrap and the first block was rejected
+  size_t i = 0;
+  hist_t hist;
+
+  do
+  {
+    const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); // per-block header: u64 block size, then 256 u16 symbol counts
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return 0;
+
+    uint64_t blockEndInStates = (i + blockSize);
+
+    if (blockEndInStates > outLengthInStates)
+      blockEndInStates = outLengthInStates; // last block: stop where whole groups of StateCount symbols end
+    else if ((blockEndInStates & (StateCount - 1)) != 0)
+      return 0; // inner block boundaries must be multiples of StateCount
+
+    i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); // NOTE(review): relies on decode_section accepting an empty range (start == end), as already happened for StateCount - 1 symbols - confirm
+
+    if (i > outLengthInStates)
+    {
+      if (i >= expectedOutputLength)
+        return expectedOutputLength;
+      else
+        break;
+    }
+
+  } while (i < outLengthInStates);
+
+  if (i < expectedOutputLength) // fewer than StateCount symbols remain: decode them one state at a time
+  {
+    hist_dec_t histDec; // scalar decode histogram rebuilt from the counts of the last block read
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec(&histDec))
+      return 0;
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      const uint8_t index = _Rans32x64_idx2idx[j];
+
+      if (i + index < expectedOutputLength)
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *decodeState.pReadHead;
+          state = read ? newState : state;
+          decodeState.pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *decodeState.pReadHead;
+            decodeState.pReadHead++;
+          }
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Runtime dispatch on the detected CPU features: AVX-512 (F + BW + DQ, skipped when vendor is AMD and family is cpu_family_amd_zen3_zen4), then AVX2, then the fallback.
+template
+static size_t block_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  _DetectCPUFeatures();
+
+  if (avx512FSupported && avx512BWSupported && avx512DQSupported && (_CpuVendor != cpu_vendor_AMD || _CpuFamily != cpu_family_amd_zen3_zen4))
+  {
+    if constexpr (TotalSymbolCountBits >= 13)
+      return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity);
+    else
+      return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity);
+  }
+  if (avx2Supported)
+  {
+    if constexpr (TotalSymbolCountBits >= 13)
+      return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity);
+    else
+      return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity);
+  }
+
+  // Fallback.
+  return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Exported decoders, one per supported TotalSymbolCountBits (15 down to 10); each forwards to block_rANS32x64_decode_wrapper.
+size_t block_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return block_rANS32x64_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity);
+}
diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp
new file mode 100644
index 0000000..cc3767d
--- /dev/null
+++ b/src/block_rANS32x64_16w_encode.cpp
@@ -0,0 +1,390 @@
+#include "block_rANS32x64_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include
+#include
+
+constexpr size_t StateCount = 64; // Needs to be a power of two.
+constexpr bool EncodeNoBranch = false; // when true the scalar encoder takes its branchless emit path
+constexpr size_t SafeHistBitMax = 0; // IsSafeHist is defined as (TotalSymbolCountBits >= SafeHistBitMax)
+
+constexpr size_t MinMinBlockSizeBits = 15; // log2 of the block size block_rANS32x64_16w_capacity budgets one block header for
+constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits;
+// Per-TotalSymbolCountBits tuning factor: _CanExtendHist multiplies it by the total symbol count and shifts right by 12 to get its cost threshold.
+template
+struct HistReplaceMul
+{
+  constexpr static size_t GetValue();
+};
+
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } };
+// Per-TotalSymbolCountBits log2 of the block granularity; MinBlockSize() turns it into a size.
+template
+struct MinBlockSizeBits
+{
+  constexpr static size_t GetValue();
+};
+
+template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } };
+template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } };
+template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } };
+template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 20; } };
+// Block granularity in input bytes: 1 << MinBlockSizeBits.
+template
+constexpr size_t MinBlockSize()
+{
+  return (size_t)1 << MinBlockSizeBits::GetValue();
+}
+// Output capacity the encoder demands for inputSize input bytes (the encoder returns 0 when given less).
+size_t block_rANS32x64_16w_capacity(const size_t inputSize)
+{
+  const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); // two u64 header fields + one histogram + raw input size + initial states
+  const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1;
+  const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t); // u64 block size + 256 u16 symbol counts
+
+  return baseSize + blockCount *
perBlockExtraSize; // I hope this covers all of our bases.
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Maps state index j to the byte offset, within one group of StateCount input bytes, that state j codes.
+static const uint8_t _Rans32x64_idx2idx[] =
+{
+  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
+  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
+  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
+};
+
+static_assert(sizeof(_Rans32x64_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+// Encoder working set: the StateCount rANS states, the histogram in use and the output cursor (pStart moves downwards from pEnd as words are emitted).
+struct _rans_encode_state64_t
+{
+  uint32_t states[StateCount];
+  hist_t hist;
+  uint16_t *pEnd, *pStart; // both compressed.
+};
+
+enum rans32x64_encoder_type_t
+{
+  r32x64_et_scalar, // the only encoder implementation declared here
+};
+// Primary template: encode_section is only declared; implementations are provided by specializations.
+template
+struct rans32x64_16w_encoder
+{
+  template
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+};
+// Scalar implementation: walks the input backwards from startIndex down to targetIndex, StateCount bytes per step.
+template <>
+struct rans32x64_16w_encoder
+{
+  template
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  {
+    int64_t targetCmp = targetIndex + StateCount;
+
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = _Rans32x64_idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount; // a state at or above this must emit its low 16 bits before coding `in`
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF); // always store; the word only counts if the cursor moves
+          pState->pStart -= (size_t)write; // move the cursor itself (this used to subtract from the stored word instead)
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Decides whether the block at nextBlockStartOffset can keep using *pOldHist; symCount is overwritten with that block's symbol counts.
+template
+static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
+{
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+
+  memset(symCount, 0, sizeof(uint32_t) * 256);
+  observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize);
+
+  // Do we include a symbol that hasn't been included before?
+  if constexpr (!IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0)
+        return false;
+  }
+
+  hist_t newHist;
+
+  if constexpr (TotalSymbolCountBits == MinBlockSize()) // NOTE(review): compares a bit count with MinBlockSize() (1 << bits, a byte count) - presumably MinBlockSizeBits was intended; confirm
+  {
+    for (size_t j = 0; j < 256; j++)
+      newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+    size_t counter = 0;
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      newHist.cumul[j] = (uint16_t)counter;
+      counter += newHist.symbolCount[j];
+    }
+  }
+  else
+  {
+    normalize_hist(&newHist, symCount, MinBlockSize(), TotalSymbolCountBits);
+  }
+
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12;
+
+  // this comparison isn't fair or fast, but should be a good starting point hopefully.
+  float costBefore = 0; // -sum(count * log2(p)) of the candidate block under the old histogram
+  float costAfter = 0; // same sum under a histogram made from the candidate block alone
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+
+  const float diff = costBefore - costAfter;
+
+  return (diff < histReplacePoint); // keep the old histogram while it costs less than histReplacePoint more than a fresh one
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Encodes pInData back to front in blocks that each carry their own histogram; returns the number of bytes written to pOutData, or 0 on failure.
+template
+size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (length == 0 || outCapacity < block_rANS32x64_16w_capacity(length)) // empty input is rejected: `length - 1` below would wrap around
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  constexpr size_t MinBlockSizeX = MinBlockSize();
+
+  _rans_encode_state64_t encodeState;
+  encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); // emitted words grow downwards from the end of the output buffer
+  encodeState.pStart = encodeState.pEnd;
+
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1)); // start offset of the last block, aligned to the block granularity
+
+  if (inputBlockTargetIndex > MinBlockSizeX)
+    inputBlockTargetIndex -= MinBlockSizeX;
+
+  size_t blockBackPoint = length; // end (exclusive) of the block currently being set up
+
+  uint32_t symCount[256];
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+
+  size_t extraCount = 0;
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+      {
+        symCount[j] = 1;
+        extraCount++;
+      }
+    }
+  }
+
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+
+  while (inputBlockTargetIndex > 0) // grow the block towards the front while the histogram still fits
+  {
+    if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSizeX;
+    else
+      break;
+  }
+
+  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  blockBackPoint = length;
+
+  // Init States.
+  for (size_t i = 0; i < StateCount; i++)
+    encodeState.states[i] = DecodeConsumePoint16;
+
+  int64_t inputIndex = length - 1;
+  inputIndex &= ~(size_t)(StateCount - 1);
+  inputIndex += StateCount;
+
+  for (int64_t j = StateCount - 1; j >= 0; j--) // trailing partial group: only states whose byte lies inside the input code a symbol
+  {
+    const uint8_t index = _Rans32x64_idx2idx[j];
+
+    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    {
+      const uint8_t in = pInData[inputIndex - StateCount + index];
+      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = encodeState.states[stateIndex];
+
+      if (state >= max)
+      {
+        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+        encodeState.pStart--;
+        state >>= 16;
+      }
+
+      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+    }
+  }
+
+  inputIndex -= StateCount;
+
+  while (true)
+  {
+    rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    inputIndex = inputBlockTargetIndex;
+
+    // Write hist.
+    {
+      const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
+
+      encodeState.pStart++;
+      encodeState.pStart -= 256;
+      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); // the u64 block size sits directly in front of the 256 counts
+      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+
+      encodeState.pStart--;
+    }
+
+    if (inputIndex == 0)
+      break;
+
+    // Determine new histogram.
+    {
+      inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+
+      if constexpr (IsSafeHist)
+        for (size_t j = 0; j < 256; j++)
+          if (symCount[j] == 0)
+            symCount[j] = 1;
+
+      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+      while (inputBlockTargetIndex > 0)
+      {
+        if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSizeX;
+        else
+          break;
+      }
+
+      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+      blockBackPoint = inputIndex; // the new block ends where the previous one started; set this first so the histogram below no longer counts the block that was already encoded
+      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    }
+  }
+
+  uint8_t *pWrite = pOutData;
+  size_t outIndex = 0;
+
+  *reinterpret_cast(pWrite + outIndex) = (uint64_t)length;
+  outIndex += sizeof(uint64_t);
+
+  // compressed expected length.
+  outIndex += sizeof(uint64_t);
+
+  for (size_t j = 0; j < StateCount; j++)
+  {
+    *reinterpret_cast(pWrite + outIndex) = encodeState.states[j];
+    outIndex += sizeof(uint32_t);
+  }
+
+  const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t);
+
+  memmove(pWrite + outIndex, encodeState.pStart + 1, size); // pull the words written at the back of the buffer up behind the header and states
+  outIndex += size;
+
+  *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
+
+  return outIndex;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Exported encoders, one per supported TotalSymbolCountBits (15 down to 10), all using the scalar encoder.
+size_t block_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<15, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<14, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<13, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<12, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<11, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<10, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
diff --git a/src/main.cpp b/src/main.cpp
index 80650d0..9e75bd0 100644
---
a/src/main.cpp +++ b/src/main.cpp @@ -11,6 +11,7 @@ #include "rANS32x16_16w.h" #include "rANS32x64_16w.h" #include "block_rANS32x32_16w.h" +#include "block_rANS32x64_16w.h" #ifdef _WIN32 #include @@ -151,12 +152,19 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8 static codec_info_t _Codecs[] = { - { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ 
"decode", block_rANS32x32_16w_decode_11, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + + { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, + { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, + { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}}, + { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}}, + { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}}, + { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", 
rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, @@ -386,12 +394,14 @@ int32_t main(const int32_t argc, char **pArgv) pUncompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize); pDecompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize); - compressedDataCapacity = rANS32x64_16w_capacity(fileSize); + compressedDataCapacity = 0; + compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x64_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x16_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize)); pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity); From bea25968893bcecd674be172682b2dfccda9f136 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Thu, 6 Jul 2023 02:09:48 +0200 Subject: [PATCH 16/34] fine tuning stuff --- src/block_rANS32x64_16w_decode.cpp | 4 ++-- src/block_rANS32x64_16w_encode.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp index 1e7b4c8..c47f81a 100644 --- a/src/block_rANS32x64_16w_decode.cpp +++ 
b/src/block_rANS32x64_16w_decode.cpp @@ -1808,14 +1808,14 @@ static size_t block_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_ { _DetectCPUFeatures(); - if (avx512FSupported && avx512BWSupported && avx512DQSupported && (_CpuVendor != cpu_vendor_AMD || _CpuFamily != cpu_family_amd_zen3_zen4)) + if (avx512FSupported && avx512BWSupported && (_CpuVendor != cpu_vendor_AMD || _CpuFamily != cpu_family_amd_zen3_zen4)) { if constexpr (TotalSymbolCountBits >= 13) return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); else return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); } - if (avx2Supported) + else if (avx2Supported) { if constexpr (TotalSymbolCountBits >= 13) return block_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp index cc3767d..7b77a81 100644 --- a/src/block_rANS32x64_16w_encode.cpp +++ b/src/block_rANS32x64_16w_encode.cpp @@ -19,9 +19,9 @@ struct HistReplaceMul constexpr static size_t GetValue(); }; -template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } }; -template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } }; -template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } }; +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 850; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 1500; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 2500; } }; template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } }; template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } }; template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } }; @@ -32,10 +32,10 @@ struct MinBlockSizeBits constexpr 
static size_t GetValue(); }; -template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } }; +template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 16; } }; template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } }; template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } }; -template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } }; +template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 16; } }; template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } }; template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 20; } }; From d5bde3141085cf6e2a668af0c2b7a86f009b4571 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Thu, 6 Jul 2023 06:38:49 +0200 Subject: [PATCH 17/34] 32x32 multithreading --- src/hist.cpp | 114 ++-- src/iacaMarks.h | 53 ++ src/main.cpp | 44 +- src/mt_rANS32x32_16w.h | 30 + src/mt_rANS32x32_16w_decode.cpp | 984 ++++++++++++++++++++++++++++++++ src/mt_rANS32x32_16w_encode.cpp | 387 +++++++++++++ src/thread_pool.cpp | 164 ++++++ src/thread_pool.h | 19 + 8 files changed, 1743 insertions(+), 52 deletions(-) create mode 100644 src/iacaMarks.h create mode 100644 src/mt_rANS32x32_16w.h create mode 100644 src/mt_rANS32x32_16w_decode.cpp create mode 100644 src/mt_rANS32x32_16w_encode.cpp create mode 100644 src/thread_pool.cpp create mode 100644 src/thread_pool.h diff --git a/src/hist.cpp b/src/hist.cpp index 1c2179a..2b8e816 100644 --- a/src/hist.cpp +++ b/src/hist.cpp @@ -1,6 +1,7 @@ #include "hist.h" #include +#include ////////////////////////////////////////////////////////////////////////// @@ -101,65 +102,98 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy if (cappedSum != totalSymbolCount) { - while (cappedSum > totalSymbolCount) // Start stealing. 
- { - size_t target = 2; + uint8_t sortedIdx[256]; + + for (size_t i = 0; i < 256; i++) + sortedIdx[i] = (uint8_t)i; - while (true) + struct _internal + { + static void heapify(uint8_t *pIdx, const uint16_t *pVal, const int64_t n, const int64_t i) { - size_t found = totalSymbolCount + 1; + const int64_t left = 2 * i + 1; + const int64_t right = 2 * i + 2; + int64_t largest = i; - for (size_t i = 0; i < 256; i++) - if (capped[i] > target && capped[i] < found) - found = capped[i]; + if (left < n && pVal[pIdx[left]] > pVal[pIdx[largest]]) + largest = left; - if (found == totalSymbolCount + 1) - break; + if (right < n && pVal[pIdx[right]] > pVal[pIdx[largest]]) + largest = right; - for (size_t i = 0; i < 256; i++) + if (largest != i) { - if (capped[i] == found) - { - capped[i]--; - cappedSum--; - - if (cappedSum == totalSymbolCount) - goto hist_ready; - } + std::swap(pIdx[i], pIdx[largest]); + heapify(pIdx, pVal, n, largest); } + } + + static void heapSort(uint8_t *pIdx, const uint16_t *pVal, const size_t length) + { + for (int64_t i = (int64_t)length / 2 - 1; i >= 0; i--) + heapify(pIdx, pVal, length, i); - target = found + 1; + for (int64_t i = length - 1; i >= 0; i--) + { + std::swap(pIdx[0], pIdx[i]); + heapify(pIdx, pVal, i, 0); + } } - } + }; - while (cappedSum < totalSymbolCount) // Start a charity. + _internal::heapSort(sortedIdx, capped, 256); + size_t minTwo = 0; + + for (size_t i = 0; i < 256; i++) { - size_t target = totalSymbolCount + 1; + if (capped[sortedIdx[i]] >= 2) + { + minTwo = i; + break; + } + } - while (true) + while (cappedSum > totalSymbolCount) // Start stealing. + { + for (size_t i = minTwo; i < 256; i++) { - size_t found = 1; + capped[sortedIdx[i]]--; + cappedSum--; - for (size_t i = 0; i < 256; i++) - if (capped[i] < target && capped[i] > found) - found = capped[i]; + if (cappedSum == totalSymbolCount) + goto hist_ready; + } - if (found == 1) + // Re-Adjust `minTwo`. 
+ for (size_t i = minTwo; i < 256; i++) + { + if (capped[sortedIdx[i]] >= 2) + { + minTwo = i; break; + } + } + } + + while (cappedSum < totalSymbolCount) // Start a charity. + { + for (int64_t i = 255; i >= (int64_t)minTwo; i--) + { + capped[sortedIdx[i]]++; + cappedSum++; + + if (cappedSum == totalSymbolCount) + goto hist_ready; + } - for (size_t i = 0; i < 256; i++) + // Re-Adjust `minTwo`. + for (size_t i = minTwo; i < 256; i++) + { + if (capped[sortedIdx[i]] >= 2) { - if (capped[i] == found) - { - capped[i]++; - cappedSum++; - - if (cappedSum == totalSymbolCount) - goto hist_ready; - } + minTwo = i; + break; } - - target = found - 1; } } } diff --git a/src/iacaMarks.h b/src/iacaMarks.h new file mode 100644 index 0000000..be1973e --- /dev/null +++ b/src/iacaMarks.h @@ -0,0 +1,53 @@ +/* +* Copyright (2008-2009) Intel Corporation All Rights Reserved. +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material +* contains trade secrets and proprietary and confidential information +* of Intel or its suppliers and licensors. The Material is protected +* by worldwide copyright and trade secret laws and treaty provisions. +* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel(R)s prior express written permission. +* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. Any license under such intellectual +* property rights must be express and approved by Intel in writing. 
+*/ + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#else +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } +#endif + +#define IACA_START {IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222)} + +#ifdef _WIN64 +#include +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); +#endif + +/**************** asm ***************** +;START_MARKER +mov ebx, 111 +db 0x64, 0x67, 0x90 + +;END_MARKER +mov ebx, 222 +db 0x64, 0x67, 0x90 + +**************************************/ diff --git a/src/main.cpp b/src/main.cpp index 9e75bd0..9cf1652 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,6 +12,7 @@ #include "rANS32x64_16w.h" #include "block_rANS32x32_16w.h" #include "block_rANS32x64_16w.h" +#include "mt_rANS32x32_16w.h" #ifdef _WIN32 #include @@ -120,6 +121,8 @@ uint8_t *pDecompressedData = nullptr; size_t compressedLength = 0; +thread_pool *_pGlobalThreadPool = nullptr; + ////////////////////////////////////////////////////////////////////////// template @@ -150,21 +153,37 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8 return func(pInData, length, pOutData, outCapacity); } +template +size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + if (_pGlobalThreadPool == nullptr) + _pGlobalThreadPool = thread_pool_new((size_t)rans_max((int64_t)1, (int64_t)thread_pool_max_threads() - 1)); + + return func(pInData, inLength, pOutData, outCapacity, _pGlobalThreadPool); +} + static codec_info_t _Codecs[] = { - //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", 
encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + // { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + + // { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, + // { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, + // { 
"rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}}, + // { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}}, + // { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}}, + // { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}}, - { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, - { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, - { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}}, - { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}}, - { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}}, - { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", 
decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, 
sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, @@ -402,6 +421,7 @@ int32_t main(const int32_t argc, char **pArgv) compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x32_16w_capacity(fileSize)); pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity); diff --git a/src/mt_rANS32x32_16w.h b/src/mt_rANS32x32_16w.h new file mode 100644 index 0000000..06a5a50 --- /dev/null +++ b/src/mt_rANS32x32_16w.h @@ -0,0 +1,30 @@ +#ifndef mt_rANS32x32_16w_h__ +#define mt_rANS32x32_16w_h__ + +#include "hist.h" +#include "thread_pool.h" + +size_t mt_rANS32x32_16w_capacity(const size_t inputSize); + +size_t mt_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); + +size_t 
mt_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); + +size_t mt_rANS32x32_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x32_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x32_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x32_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x32_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x32_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); + +#endif // mt_rANS32x32_16w_h__ diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp new file mode 100644 index 0000000..b62beae --- /dev/null +++ b/src/mt_rANS32x32_16w_decode.cpp @@ -0,0 +1,984 @@ +#include "mt_rANS32x32_16w.h" + +#include "hist.h" +#include 
"simd_platform.h" + +#include +#include + +constexpr size_t StateCount = 32; // Needs to be a power of two. +constexpr bool DecodeNoBranch = false; + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + +extern const uint8_t _ShuffleLutShfl32[256 * 8]; +extern const uint8_t _ShuffleLutPerm32[256 * 8]; +extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; + +////////////////////////////////////////////////////////////////////////// + +template +struct _rans_decode_state32mt_t +{ +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + uint32_t states[StateCount]; + + hist_type hist; + const uint16_t *pReadHead; +}; + +enum rans32x32_decoder_type_t +{ + r32x32_dt_scalar, + r32x32_dt_avx2_large_cache_15_to_13, + r32x32_dt_avx2_small_cache_15_to_13, + r32x32_dt_avx2_large_cache_12_to_10, + r32x32_dt_avx2_small_cache_12_to_10, +}; + +template +struct rans32x32_16w_decoder +{ + static size_t decode_section(_rans_decode_state32mt_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); +}; + +template +struct rans32x32_16w_decoder> +{ + static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t i = startIndex; + + for (; i < endIndex; i += StateCount) + { + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + uint32_t state = pState->states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const 
uint8_t symbol = pState->hist.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pState->pReadHead; + state = read ? newState : state; + pState->pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pState->pReadHead; + pState->pReadHead++; + } + } + + pState->states[j] = state; + } + } + + return i; + } +}; + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _mt_rans32x32_decode_section_avx2_varA(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm256_set1_epi32(0xFFFF); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & 
(TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // const uint8_t symbol = pHist->cumulInv[slot]; + simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); + simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); + simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); + simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); + + // since they were int32_t turn into uint8_t + symbol0 = _mm256_and_si256(symbol0, lower8); + symbol1 = _mm256_and_si256(symbol1, lower8); + symbol2 = _mm256_and_si256(symbol2, lower8); + symbol3 = _mm256_and_si256(symbol3, lower8); + + // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // retrieve pack. 
+ const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); + + // freq, cumul. + const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); + const simd_t freq0 = _mm256_and_si256(pack0, lower16); + const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); + const simd_t freq1 = _mm256_and_si256(pack1, lower16); + const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); + const simd_t freq2 = _mm256_and_si256(pack2, lower16); + const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); + const simd_t freq3 = _mm256_and_si256(pack3, lower16); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t 
state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0, 1. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _mt_rans32x32_decode_section_avx2_varC(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + + static_assert(TotalSymbolCountBits <= 12); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i 
shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // retrieve pack. + const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot3, sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // unpack symbol. + const simd_t symbol0 = _mm256_and_si256(pack0, lower8); + const simd_t symbol1 = _mm256_and_si256(pack1, lower8); + const simd_t symbol2 = _mm256_and_si256(pack2, lower8); + const simd_t symbol3 = _mm256_and_si256(pack3, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // unpack freq, cumul. + const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm256_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm256_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm256_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm256_srli_epi32(pack3, 20); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, 
cumul3)); + + // now to the messy part... + { + // read input for blocks 0, 1. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +////////////////////////////////////////////////////////////////////////// + +template 
+static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + (void)totalSymbolCountBits; + + memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); + + return inplace_make_hist_dec(pDecHist); +} + +template +static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec2_hist(pDecHist, pIncompleteHist); + + return true; +} + +template +static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec_pack_hist(pDecHist, pIncompleteHist); + + return true; +} + +////////////////////////////////////////////////////////////////////////// + +// Single-threaded decoder for the block-wise stream: checks the two 64 bit length fields, then decodes block after block (every block header carries its size, a read head offset, the states and the symbol counts) and finishes the trailing partial group of states with a scalar loop. Returns the decoded length, or 0 if the input is rejected. +template +size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t inputIndex = 0; + const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (expectedOutputLength > outCapacity) + return 0; + + const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (inLength < expectedInputLength) + return 0; + + _rans_decode_state32mt_t decodeState; + + decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); + // Exclusive bound for start positions at which a full group of StateCount symbols still fits. Clamped to zero, because the unsigned subtraction would otherwise wrap around to a huge value for outputs shorter than StateCount - 1. + const size_t outLengthInStates = (expectedOutputLength >= StateCount - 1) ? (size_t)(expectedOutputLength - StateCount + 1) : 0; + size_t i = 0; + hist_t hist; + + do + { + const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); +
decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + (void)readHeadBackOffset; // unused in single-threaded version. + + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } + + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } + + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; + + uint64_t blockEndInStates = (i + blockSize); + + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + if (i > outLengthInStates) + { + if (i >= expectedOutputLength) + return expectedOutputLength; + else + break; + } + + } while (i < outLengthInStates); + + if (i < expectedOutputLength) + { + hist_dec_t histDec; + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + + if (!inplace_make_hist_dec(&histDec)) + return 0; + + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + if (i + index < expectedOutputLength) + { + uint32_t state = decodeState.states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = histDec.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *decodeState.pReadHead; + state = read ? 
newState : state; + decodeState.pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; + } + } + + decodeState.states[j] = state; + } + } + } + + return expectedOutputLength; +} + +////////////////////////////////////////////////////////////////////////// + +// Thread pool variant of the decoder: reads the same stream layout as the single-threaded version, but uses the read head offset stored in every block header to locate the next block, so that blocks can be handed to pThreadPool. Returns the decoded length, or 0 if the input is rejected. +template +size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t inputIndex = 0; + const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (expectedOutputLength > outCapacity) + return 0; + + const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (inLength < expectedInputLength) + return 0; + + _rans_decode_state32mt_t decodeState; + + decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); + // Exclusive bound for start positions at which a full group of StateCount symbols still fits. Clamped to zero, because the unsigned subtraction would otherwise wrap around to a huge value for outputs shorter than StateCount - 1. + const size_t outLengthInStates = (expectedOutputLength >= StateCount - 1) ? (size_t)(expectedOutputLength - StateCount + 1) : 0; + size_t i = 0; + hist_t hist; + + do + { + const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + // Where the read head continues once this block has been handed off (derived from the offset stored in the block header). + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } + + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; +
decodeState.pReadHead++; + } + + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + { + thread_pool_await(pThreadPool); // earlier blocks may already be queued and still write into pOutData: wait for them before bailing out. + return 0; + } + + uint64_t blockEndInStates = (i + blockSize); + + if (blockEndInStates > outLengthInStates) + { + blockEndInStates = outLengthInStates; + } + else if ((blockEndInStates & (StateCount - 1)) != 0) + { + thread_pool_await(pThreadPool); // same as above: never return while queued blocks are still in flight. + return 0; + } + + if (i + blockSize > blockEndInStates) + { + // The block end was clamped above, so this is the last block: decode it on this thread, the scalar tail below continues from the resulting states. + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + break; + } + else + { + // Complete block: the task decodes with its own copy of the state, this thread moves on to pReadHeadAfter. + thread_pool_add(pThreadPool, [=]() { + auto decState = decodeState; + rans32x32_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); + }); + + i = blockEndInStates; + decodeState.pReadHead = pReadHeadAfter; + } + + } while (i < outLengthInStates); + + if (i < expectedOutputLength) + { + hist_dec_t histDec; + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + + if (!inplace_make_hist_dec(&histDec)) + { + thread_pool_await(pThreadPool); // queued blocks must be finished before the caller gets its buffer back. + return 0; + } + + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + if (i + index < expectedOutputLength) + { + uint32_t state = decodeState.states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = histDec.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *decodeState.pReadHead; + state = read ?
newState : state; + decodeState.pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; + } + } + + decodeState.states[j] = state; + } + } + } + + thread_pool_await(pThreadPool); + + return expectedOutputLength; +} + +////////////////////////////////////////////////////////////////////////// + +template +static size_t mt_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool = nullptr) +{ + _DetectCPUFeatures(); + + if (avx2Supported) + { + if constexpr (TotalSymbolCountBits >= 13) + { + if (pThreadPool) + return mt_rANS32x32_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); + } + else + { + if (pThreadPool) + return mt_rANS32x32_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); + } + } + + // Fallback. 
+ if (pThreadPool) + return mt_rANS32x32_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x32_16w_decode>(pInData, inLength, pOutData, outCapacity); +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x32_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity); +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x32_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x32_16w_decode_mt_14(const uint8_t *pInData, const 
size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x32_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x32_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x32_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x32_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x32_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp new file mode 100644 index 0000000..07401a1 --- /dev/null +++ b/src/mt_rANS32x32_16w_encode.cpp @@ -0,0 +1,387 @@ +#include "mt_rANS32x32_16w.h" + +#include "hist.h" +#include "simd_platform.h" + +#include +#include + +constexpr size_t StateCount = 32; // Needs to be a power of two. 
+constexpr bool EncodeNoBranch = false; +constexpr size_t SafeHistBitMax = 0; + +constexpr size_t MinMinBlockSizeBits = 15; +constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits; + +template +struct HistReplaceMul +{ + constexpr static size_t GetValue(); +}; + +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 50; } }; + +template +struct MinBlockSizeBits +{ + constexpr static size_t GetValue(); +}; + +template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 16; } }; + +template +constexpr size_t MinBlockSize() +{ + return (size_t)1 << MinBlockSizeBits::GetValue(); +} + +size_t mt_rANS32x32_16w_capacity(const size_t inputSize) +{ + const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); + const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1; + const size_t perBlockExtraSize = sizeof(uint64_t) * 2 + 256 * sizeof(uint16_t) + StateCount * sizeof(uint32_t); + + return baseSize + 
blockCount * perBlockExtraSize; // I hope this covers all of our bases. +} + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + +struct _rans_encode_state32mt_t +{ + uint32_t states[StateCount]; + hist_t hist; + uint16_t *pEnd, *pStart; // both compressed. +}; + +enum rans32x32_encoder_type_t +{ + r32x32_et_scalar, +}; + +template +struct rans32x32_16w_encoder +{ + template + static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); +}; + +template <> +struct rans32x32_16w_encoder +{ + // Scalar encoder: walks the input backwards from startIndex down to targetIndex in groups of StateCount symbols, emitting 16 bit words through pState->pStart (which moves towards lower addresses) whenever a state reaches its emit threshold. + template + static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) + { + int64_t targetCmp = targetIndex + StateCount; + + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = pState->hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = pState->states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart -= (size_t)write; // move the write head (not the word it points at), mirroring the branching path below. + state = write ?
state >> 16 : state; + } + else + { + if (state >= max) + { + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart--; + state >>= 16; + } + } + + pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// + +// Decides whether the block of nextBlockSize bytes at nextBlockStartOffset may keep using pOldHist: the estimated coding cost with the old histogram is compared against the cost with a histogram fitted to that block plus half of the per-block header overhead. symCount is overwritten with the symbol counts of that block. Returns true if the old histogram is considered good enough. +template +static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) +{ + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + + memset(symCount, 0, sizeof(uint32_t) * 256); + observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize); + + // Do we include a symbol that hasn't been included before? + if constexpr (!IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0) + return false; + } + + hist_t newHist; + + // The raw counts already sum up to the total symbol count if a block holds exactly 2^TotalSymbolCountBits symbols, so compare bit counts with bit counts (the previous comparison against the block size in bytes could never be true). + if constexpr (TotalSymbolCountBits == MinBlockSizeBits<TotalSymbolCountBits>::GetValue()) + { + for (size_t j = 0; j < 256; j++) + newHist.symbolCount[j] = (uint16_t)symCount[j]; + + size_t counter = 0; + + for (size_t j = 0; j < 256; j++) + { + newHist.cumul[j] = (uint16_t)counter; + counter += newHist.symbolCount[j]; + } + } + else + { + normalize_hist(&newHist, symCount, MinBlockSize(), TotalSymbolCountBits); + } + + constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); + constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12; + + // this comparison isn't fair or fast, but should be a good starting point hopefully. + float costBefore = 0; + float costAfter = (float)(sizeof(uint16_t) * 256 + StateCount * sizeof(uint32_t) + sizeof(uint64_t) * 2) * 0.5f; // let's assume that block will be able to share its histogram with someone else. +
+ + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + else + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + + const float diff = costBefore - costAfter; + + return (diff < histReplacePoint); +} + +////////////////////////////////////////////////////////////////////////// + +template +size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) +{ + if (outCapacity < mt_rANS32x32_16w_capacity(length)) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + constexpr size_t MinBlockSizeX = MinBlockSize(); + + _rans_encode_state32mt_t encodeState; + encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); + encodeState.pStart = encodeState.pEnd; + + size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1)); + + if (inputBlockTargetIndex > MinBlockSizeX) + inputBlockTargetIndex -= MinBlockSizeX; + + uint16_t *pBlockEnd = encodeState.pEnd; + size_t blockBackPoint = length; + + uint32_t symCount[256]; + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + + size_t extraCount = 0; + + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + 
if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } + } + } + + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = length; + + // Init States. + for (size_t i = 0; i < StateCount; i++) + encodeState.states[i] = DecodeConsumePoint16; + + int64_t inputIndex = length - 1; + inputIndex &= ~(size_t)(StateCount - 1); + inputIndex += StateCount; + + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = encodeState.states[stateIndex]; + + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } + + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } + } + + inputIndex -= StateCount; + + while (true) + { + rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; + + // Write hist & states. 
+ { + const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; + + encodeState.pStart++; + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; + memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + + const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + + pBlockEnd = encodeState.pStart; + encodeState.pStart--; + } + + if (inputIndex == 0) + break; + + // Determine new histogram. + { + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = inputIndex; + } + } + + uint8_t *pWrite = pOutData; + size_t outIndex = 0; + + *reinterpret_cast(pWrite + outIndex) = (uint64_t)length; + outIndex += sizeof(uint64_t); + + // compressed expected length. 
+ outIndex += sizeof(uint64_t); + + const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t); + + memmove(pWrite + outIndex, encodeState.pStart + 1, size); + outIndex += size; + + *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. + + return outIndex; +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<15, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<14, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<13, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp new file mode 100644 index 0000000..63afda4 --- /dev/null +++ b/src/thread_pool.cpp @@ -0,0 +1,164 @@ +// Improved Version of 
https://github.com/rainerzufalldererste/slapcodec/blob/master/slapcodec/src/threadpool.cpp + +#include "thread_pool.h" + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif + +struct thread_pool +{ + std::queue> tasks; + + std::thread *pThreads; + size_t threadCount; + + std::atomic taskCount; + std::atomic isRunning; + std::mutex mutex; + std::condition_variable condition_var; + + thread_pool(const size_t threadCount); + ~thread_pool(); +}; + +void thread_pool_ThreadFunc(thread_pool *pThreadPool, const size_t index) +{ +#ifdef _WIN32 + SetThreadIdealProcessor(GetCurrentThread(), (DWORD)index); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); +#else + (void)index; +#endif + + while (pThreadPool->isRunning) + { + std::function task = nullptr; + + { + std::unique_lock lock(pThreadPool->mutex); + pThreadPool->condition_var.wait_for(lock, std::chrono::milliseconds(1)); + + if (!pThreadPool->tasks.empty()) + { + task = pThreadPool->tasks.front(); + pThreadPool->tasks.pop(); + } + } + + if (task) + { + task(); + --pThreadPool->taskCount; + continue; + } + } +} + +thread_pool::thread_pool(const size_t threads) : + tasks(), + pThreads(nullptr), + threadCount(threads), + taskCount(0), + isRunning(true), + mutex(), + condition_var() +{ + pThreads = reinterpret_cast(malloc(sizeof(std::thread) * threads)); + + for (size_t i = 0; i < threads; i++) + new (&pThreads[i]) std::thread(thread_pool_ThreadFunc, this, i); +} + +thread_pool::~thread_pool() +{ + thread_pool_await(this); + + isRunning = false; + condition_var.notify_all(); + + for (size_t i = 0; i < threadCount; i++) + { + pThreads[i].join(); + pThreads[i].~thread(); + } + + free(pThreads); +} + +thread_pool * thread_pool_new(const size_t threads) +{ + return new thread_pool(threads); +} + +void thread_pool_destroy(thread_pool **ppThreadPool) +{ + if (ppThreadPool == nullptr || *ppThreadPool == nullptr) + return; + + delete *ppThreadPool; +} + +size_t 
thread_pool_thread_count(thread_pool *pPool) +{ + if (pPool == nullptr) + return 1; + + return pPool->threadCount == 0 ? 1 : pPool->threadCount; +} + +void thread_pool_add(thread_pool *pThreadPool, const std::function &task) +{ + pThreadPool->taskCount++; + + pThreadPool->mutex.lock(); + pThreadPool->tasks.push(task); + pThreadPool->mutex.unlock(); + + pThreadPool->condition_var.notify_one(); +} + +void thread_pool_await(thread_pool *pThreadPool) +{ + while (true) + { + std::function task = nullptr; + + // Locked by mutex. + { + pThreadPool->mutex.lock(); + + if (!pThreadPool->tasks.empty()) + { + task = pThreadPool->tasks.front(); + pThreadPool->tasks.pop(); + } + + pThreadPool->mutex.unlock(); + } + + if (task) + { + task(); + pThreadPool->taskCount--; + } + else + { + break; + } + } + + while (pThreadPool->taskCount > 0) + std::this_thread::yield(); // Wait for all other threads to finish their tasks. +} + +size_t thread_pool_max_threads() +{ + return std::thread::hardware_concurrency(); +} diff --git a/src/thread_pool.h b/src/thread_pool.h new file mode 100644 index 0000000..39de4ec --- /dev/null +++ b/src/thread_pool.h @@ -0,0 +1,19 @@ +#ifndef thread_pool_h__ +#define thread_pool_h__ + +#include +#include + +struct thread_pool; + +thread_pool * thread_pool_new(const size_t threads); +void thread_pool_destroy(thread_pool **ppThreadPool); + +size_t thread_pool_thread_count(thread_pool *pThreadPool); + +void thread_pool_add(thread_pool *pThreadPool, const std::function &func); +void thread_pool_await(thread_pool *pThreadPool); + +size_t thread_pool_max_threads(); + +#endif // thread_pool_h__ From e186a491940b324fbf7d81dd6da3c9457bf2dc01 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 02:17:11 +0200 Subject: [PATCH 18/34] hopefully fixing weird issue with decoding 0 size blocks --- src/hist.cpp | 9 +++++++-- src/mt_rANS32x32_16w_decode.cpp | 3 +++ src/mt_rANS32x32_16w_encode.cpp | 17 ++++++++++------- src/thread_pool.cpp | 10 +++++++++- 4 
files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/hist.cpp b/src/hist.cpp index 2b8e816..ab0dee6 100644 --- a/src/hist.cpp +++ b/src/hist.cpp @@ -307,7 +307,7 @@ void make_dec_pack_hist(hist_dec_pack_t *pHistDec, const h bool inplace_complete_hist(hist_t *pHist, const size_t totalSymbolCountBits) { - uint16_t counter = 0; + uint32_t counter = 0; for (size_t i = 0; i < 256; i++) { @@ -315,7 +315,12 @@ bool inplace_complete_hist(hist_t *pHist, const size_t totalSymbolCountBits) counter += pHist->symbolCount[i]; } - return (counter == 1 << totalSymbolCountBits); +#if defined(_DEBUG) && defined(_MSC_VER) + if (counter != ((uint32_t)1 << totalSymbolCountBits)) + __debugbreak(); +#endif + + return (counter == (uint32_t)(1 << totalSymbolCountBits)); } template diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp index b62beae..07d4741 100644 --- a/src/mt_rANS32x32_16w_decode.cpp +++ b/src/mt_rANS32x32_16w_decode.cpp @@ -675,6 +675,7 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; (void)readHeadBackOffset; // unused in single-threaded version. 
for (size_t j = 0; j < StateCount; j++) @@ -709,6 +710,8 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui break; } + decodeState.pReadHead = pReadHeadAfter; + } while (i < outLengthInStates); if (i < expectedOutputLength) diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp index 07401a1..140a013 100644 --- a/src/mt_rANS32x32_16w_encode.cpp +++ b/src/mt_rANS32x32_16w_encode.cpp @@ -20,11 +20,11 @@ struct HistReplaceMul }; template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 50; } }; -template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 50; } }; -template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 50; } }; -template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 50; } }; -template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 50; } }; -template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 500; } }; template struct MinBlockSizeBits @@ -45,6 +45,9 @@ constexpr size_t MinBlockSize() return (size_t)1 << MinBlockSizeBits::GetValue(); } +constexpr size_t MaxBlockSizeBits = 25; +constexpr size_t MaxBlockSize = (size_t)1 << MaxBlockSizeBits; + size_t mt_rANS32x32_16w_capacity(const size_t inputSize) { const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); @@ -251,7 +254,7 @@ size_t mt_rANS32x32_16w_encode(const 
uint8_t *pInData, const size_t length, uint normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0) + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) { if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) inputBlockTargetIndex -= MinBlockSizeX; @@ -343,7 +346,7 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0) + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) { if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) inputBlockTargetIndex -= MinBlockSizeX; diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp index 63afda4..ed55ecb 100644 --- a/src/thread_pool.cpp +++ b/src/thread_pool.cpp @@ -10,6 +10,9 @@ #ifdef _WIN32 #include +#else +#include +#include #endif struct thread_pool @@ -34,7 +37,12 @@ void thread_pool_ThreadFunc(thread_pool *pThreadPool, const size_t index) SetThreadIdealProcessor(GetCurrentThread(), (DWORD)index); SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); #else - (void)index; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET((int32_t)index, &cpuset); + + pthread_t current_thread = pthread_self(); + pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); #endif while (pThreadPool->isRunning) From a490e8062f6521c7eb768877660b2ea7d9e2457e Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 02:49:51 +0200 Subject: [PATCH 19/34] as cleanup as possible --- src/block_codec32.h | 713 +++++++++++ src/block_codec64.h | 1761 ++++++++++++++++++++++++++++ src/block_rANS32x32_16w_decode.cpp | 626 +--------- src/block_rANS32x32_16w_encode.cpp | 76 +- 
src/block_rANS32x64_16w_decode.cpp | 1696 +-------------------------- src/block_rANS32x64_16w_encode.cpp | 83 +- src/mt_rANS32x32_16w_decode.cpp | 668 +---------- src/mt_rANS32x32_16w_encode.cpp | 78 +- 8 files changed, 2492 insertions(+), 3209 deletions(-) create mode 100644 src/block_codec32.h create mode 100644 src/block_codec64.h diff --git a/src/block_codec32.h b/src/block_codec32.h new file mode 100644 index 0000000..106da1c --- /dev/null +++ b/src/block_codec32.h @@ -0,0 +1,713 @@ +#ifndef block_codec32_h__ +#define block_codec32_h__ + +#include "hist.h" + +#include + +constexpr size_t StateCount = 32; // Needs to be a power of two. + +////////////////////////////////////////////////////////////////////////// + +extern const uint8_t _ShuffleLutShfl32[256 * 8]; +extern const uint8_t _ShuffleLutPerm32[256 * 8]; +extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; + +constexpr bool EncodeNoBranch = false; +constexpr bool DecodeNoBranch = false; + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); + +////////////////////////////////////////////////////////////////////////// + +struct _rans_encode_state32_t +{ + uint32_t states[StateCount]; + hist_t hist; + uint16_t *pEnd, *pStart; // both compressed. 
+}; + +enum rans32x32_encoder_type_t +{ + r32x32_et_scalar, +}; + +template +struct rans32x32_16w_encoder +{ + template + static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); +}; + +////////////////////////////////////////////////////////////////////////// + +template <> +struct rans32x32_16w_encoder +{ + template + static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) + { + int64_t targetCmp = targetIndex + StateCount; + + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = pState->hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = pState->states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pState->pStart = (uint16_t)(state & 0xFFFF); + *pState->pStart -= (size_t)write; + state = write ? 
state >> 16 : state; + } + else + { + if (state >= max) + { + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart--; + state >>= 16; + } + } + + pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// + +template +struct _rans_decode_state32_t +{ +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + uint32_t states[StateCount]; + + hist_type hist; + const uint16_t *pReadHead; +}; + +enum rans32x32_decoder_type_t +{ + r32x32_dt_scalar, + r32x32_dt_avx2_large_cache_15_to_13, + r32x32_dt_avx2_small_cache_15_to_13, + r32x32_dt_avx2_large_cache_12_to_10, + r32x32_dt_avx2_small_cache_12_to_10, +}; + +template +struct rans32x32_16w_decoder +{ + static size_t decode_section(_rans_decode_state32_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); +}; + +////////////////////////////////////////////////////////////////////////// + +template +static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + (void)totalSymbolCountBits; + + memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); + + return inplace_make_hist_dec(pDecHist); +} + +template +static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec2_hist(pDecHist, pIncompleteHist); + + return true; +} + +template +static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec_pack_hist(pDecHist, pIncompleteHist); + + return true; +} + 
+////////////////////////////////////////////////////////////////////////// + +template +struct rans32x32_16w_decoder> +{ + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t i = startIndex; + + for (; i < endIndex; i += StateCount) + { + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x32_idx2idx[j]; + uint32_t state = pState->states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = pState->hist.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pState->pReadHead; + state = read ? newState : state; + pState->pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pState->pReadHead; + pState->pReadHead++; + } + } + + pState->states[j] = state; + } + } + + return i; + } +}; + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = 
_mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm256_set1_epi32(0xFFFF); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // const uint8_t symbol = pHist->cumulInv[slot]; + simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); + simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); + simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); + simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); + + // since they were int32_t turn into uint8_t + symbol0 = _mm256_and_si256(symbol0, lower8); + symbol1 = _mm256_and_si256(symbol1, lower8); + symbol2 = _mm256_and_si256(symbol2, lower8); + symbol3 = _mm256_and_si256(symbol3, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // retrieve pack. + const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); + + // freq, cumul. 
+ const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); + const simd_t freq0 = _mm256_and_si256(pack0, lower16); + const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); + const simd_t freq1 = _mm256_and_si256(pack1, lower16); + const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); + const simd_t freq2 = _mm256_and_si256(pack2, lower16); + const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); + const simd_t freq3 = _mm256_and_si256(pack3, lower16); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0, 1. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? 
-1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + + static_assert(TotalSymbolCountBits <= 12); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const 
__m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + + // retrieve pack. + const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot3, sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // unpack symbol. + const simd_t symbol0 = _mm256_and_si256(pack0, lower8); + const simd_t symbol1 = _mm256_and_si256(pack1, lower8); + const simd_t symbol2 = _mm256_and_si256(pack2, lower8); + const simd_t symbol3 = _mm256_and_si256(pack3, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + // unpack freq, cumul. + const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm256_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm256_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm256_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm256_srli_epi32(pack3, 20); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, 
cumul3)); + + // now to the messy part: lanes whose state is below `DecodeConsumePoint16` each take one 16 bit word from the read head. + { + // read input for block 0 (blocks 1, 2, 3 are read further down, after the read head has been advanced). + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + // finalize lookups. 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x32_16w_decoder> +{ + template + static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +#endif block_codec32_h__ diff --git a/src/block_codec64.h b/src/block_codec64.h 
new file mode 100644 index 0000000..45f05b5 --- /dev/null +++ b/src/block_codec64.h @@ -0,0 +1,1761 @@ +#ifndef block_codec64_h__ +#define block_codec64_h__ + +#include "hist.h" + +#include + +constexpr size_t StateCount = 64; // Needs to be a power of two. + +////////////////////////////////////////////////////////////////////////// + +extern const uint8_t _ShuffleLutShfl32[256 * 8]; +extern const uint8_t _ShuffleLutPerm32[256 * 8]; +extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; + +constexpr bool EncodeNoBranch = false; +constexpr bool DecodeNoBranch = false; + +////////////////////////////////////////////////////////////////////////// + +static const uint8_t _Rans32x64_idx2idx[] = +{ + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37, + 0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F, +}; + +static_assert(sizeof(_Rans32x64_idx2idx) == StateCount); + +struct _rans_encode_state64_t +{ + uint32_t states[StateCount]; + hist_t hist; + uint16_t *pEnd, *pStart; // both compressed. 
+}; + +////////////////////////////////////////////////////////////////////////// + +enum rans32x64_encoder_type_t +{ + r32x64_et_scalar, +}; + +template +struct rans32x64_16w_encoder +{ + template + static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); +}; + +////////////////////////////////////////////////////////////////////////// + +// Scalar encoder. Consumes `StateCount` input bytes per step, moving `i` from `startIndex` downwards while `i >= targetIndex + StateCount`. +// Whenever a state reaches `EncodeEmitPoint * symbolCount`, its low 16 bits are stored at `pState->pStart` and the write head moves one word down. +template <> +struct rans32x64_16w_encoder +{ + template + static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) + { + int64_t targetCmp = targetIndex + StateCount; + + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) + { + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + + const uint8_t in = pInData[i - StateCount + index]; + const uint32_t symbolCount = pState->hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = pState->states[stateIndex]; + + if constexpr (EncodeNoBranch) + { + const bool write = state >= max; + *pState->pStart = (uint16_t)(state & 0xFFFF); // always stored; only kept if the write head moves. + pState->pStart -= (size_t)write; // move the write head (not the stored word), matching the branching path below. + state = write ? 
state >> 16 : state; + } + else + { + if (state >= max) + { + *pState->pStart = (uint16_t)(state & 0xFFFF); + pState->pStart--; + state >>= 16; + } + } + + pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// + +template +struct _rans_decode_state64_t +{ +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + uint32_t states[StateCount]; + + hist_type hist; + const uint16_t *pReadHead; +}; + +enum rans32x64_decoder_type_t +{ + r32x64_dt_scalar, + r32x64_dt_avx2_large_cache_15_to_13, + r32x64_dt_avx2_small_cache_15_to_13, + r32x64_dt_avx2_large_cache_12_to_10, + r32x64_dt_avx2_small_cache_12_to_10, + r32x64_dt_avx512_large_cache_15_to_13, + r32x64_dt_avx512_small_cache_15_to_13, + r32x64_dt_avx512_large_cache_12_to_10, + r32x64_dt_avx512_small_cache_12_to_10, +}; + +template +struct rans32x64_16w_decoder +{ + static size_t decode_section(_rans_decode_state64_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); +}; + +////////////////////////////////////////////////////////////////////////// + +template +static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + (void)totalSymbolCountBits; + + memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); + + return inplace_make_hist_dec(pDecHist); +} + +template +static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec2_hist(pDecHist, pIncompleteHist); + + return true; +} + +template +static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) +{ + if 
(!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) + return false; + + make_dec_pack_hist(pDecHist, pIncompleteHist); + + return true; +} + +////////////////////////////////////////////////////////////////////////// + +template +struct rans32x64_16w_decoder> +{ + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t i = startIndex; + + for (; i < endIndex; i += StateCount) + { + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + uint32_t state = pState->states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = pState->hist.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; + + if constexpr (DecodeNoBranch) + { + const bool read = state < DecodeConsumePoint16; + const uint32_t newState = state << 16 | *pState->pReadHead; + state = read ? 
newState : state; + pState->pReadHead += (size_t)read; + } + else + { + if (state < DecodeConsumePoint16) + { + state = state << 16 | *pState->pReadHead; + pState->pReadHead++; + } + } + + pState->states[j] = state; + } + } + + return i; + } +}; + +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm256_set1_epi32(0xFFFF); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); + const simd_t slot5 = 
_mm256_and_si256(statesX8[5], symCountMask); + const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); + const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); + + // const uint8_t symbol = pHist->cumulInv[slot]; + simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); + simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); + simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); + simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); + simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot4, sizeof(uint8_t)); + simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot5, sizeof(uint8_t)); + simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot6, sizeof(uint8_t)); + simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot7, sizeof(uint8_t)); + + // since they were int32_t turn into uint8_t + symbol0 = _mm256_and_si256(symbol0, lower8); + symbol1 = _mm256_and_si256(symbol1, lower8); + symbol2 = _mm256_and_si256(symbol2, lower8); + symbol3 = _mm256_and_si256(symbol3, lower8); + symbol4 = _mm256_and_si256(symbol4, lower8); + symbol5 = _mm256_and_si256(symbol5, lower8); + symbol6 = _mm256_and_si256(symbol6, lower8); + symbol7 = _mm256_and_si256(symbol7, lower8); + + // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); + const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); + const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. + + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + + // retrieve pack. 
+ const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); + const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol4, sizeof(uint32_t)); + const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol5, sizeof(uint32_t)); + const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol6, sizeof(uint32_t)); + const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol7, sizeof(uint32_t)); + + // freq, cumul. + const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); + const simd_t freq0 = _mm256_and_si256(pack0, lower16); + const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); + const simd_t freq1 = _mm256_and_si256(pack1, lower16); + const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); + const simd_t freq2 = _mm256_and_si256(pack2, lower16); + const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); + const simd_t freq3 = _mm256_and_si256(pack3, lower16); + const simd_t cumul4 = _mm256_srli_epi32(pack4, 16); + const simd_t freq4 = _mm256_and_si256(pack4, lower16); + const simd_t cumul5 = _mm256_srli_epi32(pack5, 16); + const simd_t freq5 = _mm256_and_si256(pack5, lower16); + const simd_t cumul6 = _mm256_srli_epi32(pack6, 16); + const simd_t freq6 = _mm256_and_si256(pack6, lower16); + const simd_t cumul7 = _mm256_srli_epi32(pack7, 16); + const simd_t freq7 = _mm256_and_si256(pack7, lower16); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t 
shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); + const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); + const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); + const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); + const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); + const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); + const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); + const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); + const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); + const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); + + // now to the messy part... 
+ { + // read input for blocks 0. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); + const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); + const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); + const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. 
+ + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = 
(uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. 
+ const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. 
+ + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. + const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + 
pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // finalize lookups. + lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); + lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); + lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); + lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. 
+ const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +// AVX2 decode of one section, variant C: decodes output indices from `startIndex` up to `endIndex` in steps of `StateCount` into `pOutData`. +// The states are loaded from `pState->states` into `statesX8[0..7]` on entry and written back before returning. +// Per iteration and per 32 bit lane: slot = state & (TotalSymbolCount - 1); one packed 32 bit entry is gathered from `pState->hist.symbol` at that slot; the low 8 bits are stored as the output byte; state = (state >> TotalSymbolCountBits) * freq + slot - cumul; lanes whose new state is below `DecodeConsumePoint16` are shifted left by 16 and OR-ed with a word read at `pState->pReadHead`. +// `ShuffleMask16` selects between loading 16 byte shuffle masks from `_DoubleShuffleLutShfl32` (index `mask << 4`) and loading from `_ShuffleLutShfl32` (index `mask << 3`) followed by widening them with `shuffleDoubleMask` / `shuffleUpper16Bit`. +// `WriteAligned32` selects `_mm256_stream_si256` over `_mm256_storeu_si256` for the output; if it is false and `pOutData` is 32 byte aligned the call is forwarded to another instantiation of this function (its template arguments are not visible here; presumably the `WriteAligned32` one - TODO confirm). +// Returns the first index that was not processed, i.e. the value of `i` when the loop ends. +// NOTE(review): there is no tail handling here; presumably callers pass a range whose length is a multiple of `StateCount` - confirm. +// NOTE(review): cumul is masked to 12 bits and freq is taken from the top 12 bits of the pack, while the static_assert only demands `TotalSymbolCountBits < 16` - confirm callers only use this variant with histograms that fit. +// NOTE(review): no `_mm_sfence` is visible in this function after the streaming stores - confirm whether the caller is expected to issue one. +template +#ifndef _MSC_VER +__attribute__((target("avx2"))) +#endif +static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, 
const size_t endIndex) +{ + if constexpr (!WriteAligned32) + if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) + return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m256i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm256_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); + const simd_t _16 = _mm256_set1_epi32(16); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); + const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); + const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); + const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); + const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); + const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask); + const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); + const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); + + // retrieve pack. + // (per lane: the 32 bit entry of `pState->hist.symbol` at index `slot`; symbol, cumul and freq are all unpacked from this one value.) 
+ const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot0, sizeof(uint32_t)); + const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot1, sizeof(uint32_t)); + const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot2, sizeof(uint32_t)); + const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot3, sizeof(uint32_t)); + const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot4, sizeof(uint32_t)); + const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot5, sizeof(uint32_t)); + const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot6, sizeof(uint32_t)); + const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot7, sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); + const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); + const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); + const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); + const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); + + // unpack symbol. 
+ const simd_t symbol0 = _mm256_and_si256(pack0, lower8); + const simd_t symbol1 = _mm256_and_si256(pack1, lower8); + const simd_t symbol2 = _mm256_and_si256(pack2, lower8); + const simd_t symbol3 = _mm256_and_si256(pack3, lower8); + const simd_t symbol4 = _mm256_and_si256(pack4, lower8); + const simd_t symbol5 = _mm256_and_si256(pack5, lower8); + const simd_t symbol6 = _mm256_and_si256(pack6, lower8); + const simd_t symbol7 = _mm256_and_si256(pack7, lower8); + + // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) + const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); + + const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); + const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); + const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. + + if constexpr (WriteAligned32) + _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + else + _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); + + // unpack freq, cumul. + // (as decoded here: cumul = bits 8..19 of the pack, freq = bits 20..31 of the pack.) 
+ const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm256_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm256_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm256_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm256_srli_epi32(pack3, 20); + const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12); + const simd_t freq4 = _mm256_srli_epi32(pack4, 20); + const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12); + const simd_t freq5 = _mm256_srli_epi32(pack5, 20); + const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12); + const simd_t freq6 = _mm256_srli_epi32(pack6, 20); + const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12); + const simd_t freq7 = _mm256_srli_epi32(pack7, 20); + + // const uint32_t freqScaled = shiftedState * freq; + const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); + const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); + const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); + const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); + const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); + const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); + const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); + const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, 
cumul2)); + const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); + const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); + const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); + const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); + const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); + + // now to the messy part... + // (renormalisation: each 128 bit load below is taken at the current `pReadHead`, which is then advanced by the popcount of that vector's compare mask before the next load, so the order of loads and advances matters.) + { + // read input for blocks 0. + const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); + const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); + const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); + const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); + const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); + const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); + const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); + const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. 
+ + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. 
+ + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // matching: state << 16 + // (cmp is all-ones or zero per lane, so `cmp & _16` makes the variable shift 16 in lanes that take a new word and 0 in all others.) + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const 
simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. + const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + else + { + // get masks of those compares & start loading shuffle masks. 
+ const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); + __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. + + const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); + __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. + + const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); + __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. + + const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); + __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. + + const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); + __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. + + const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); + __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. + + const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); + __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. + + const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); + __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); + pState->pReadHead += maskPop0; + + const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); + pState->pReadHead += maskPop1; + + const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); + pState->pReadHead += maskPop2; + + const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); + pState->pReadHead += maskPop3; + + const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); + pState->pReadHead += maskPop4; + + const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); + pState->pReadHead += maskPop5; + + const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); + pState->pReadHead += maskPop6; + + const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); + pState->pReadHead += maskPop7; + + // finalize lookups. + // (each of the low 8 bytes `b` of the loaded lut is duplicated and the upper copy OR-ed with 0x01, giving the byte pair (b, b | 1) that the byte shuffle below needs to move one 16 bit word.) 
+ lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); + lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); + lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); + lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); + lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); + lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); + lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); + lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); + const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); + const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); + const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); + const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); + const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); + const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); + const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); + + // shuffle new words in place. 
+ const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); + const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); + const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); + const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); + const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); + const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); + const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); + const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); + + // expand new word. + const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); + const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); + const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); + const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); + const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); + const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); + const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); + const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); + + // state = state << 16 | newWord; + statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); + statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); + statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); + statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); + statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); + statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); + statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); + statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +#ifdef __llvm__ +__attribute__((target("avx512bw"))) +#else +__attribute__((target("avx512f", "avx512bw"))) +#endif +#endif +static size_t 
_block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) +{ + if constexpr (!WriteAligned64) + if ((reinterpret_cast(pOutData) & (64 - 1)) == 0) + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m512i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); + const simd_t lower16 = _mm512_set1_epi32(0xFFFF); + const simd_t lower8 = _mm512_set1_epi32(0xFF); + const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); + const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); + const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); + const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); + const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); + + // retrieve pack. 
+ simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); + + symbol0 = _mm512_and_si512(symbol0, lower8); + symbol1 = _mm512_and_si512(symbol1, lower8); + symbol2 = _mm512_and_si512(symbol2, lower8); + symbol3 = _mm512_and_si512(symbol3, lower8); + + // retrieve pack. + const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); + + // pack symbols to one si512. + const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. + const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. + + // freq, cumul. 
+ const simd_t cumul0 = _mm512_srli_epi32(pack0, 16); + const simd_t freq0 = _mm512_and_si512(pack0, lower16); + const simd_t cumul1 = _mm512_srli_epi32(pack1, 16); + const simd_t freq1 = _mm512_and_si512(pack1, lower16); + const simd_t cumul2 = _mm512_srli_epi32(pack2, 16); + const simd_t freq2 = _mm512_and_si512(pack2, lower16); + const simd_t cumul3 = _mm512_srli_epi32(pack3, 16); + const simd_t freq3 = _mm512_and_si512(pack3, lower16); + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned64) + _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); + else + _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); + + // const uint32_t freqScaled = shiftedState * freq; + const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); + const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); + const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); + const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); + + // now to the messy part... + { + // read input for blocks 0. + const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? 
-1 : 0 + const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); + const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); + const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); + const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. + __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. + __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. + __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. + __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. 
+ const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. + const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. + __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. + __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. + __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. + __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // finalize lookups. 
+ lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); + lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); + lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); + lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); + lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); + lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); + lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); + lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. + const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +#ifndef _MSC_VER +#ifdef __llvm__ +__attribute__((target("avx512bw"))) +#else +__attribute__((target("avx512f", "avx512bw"))) +#endif +#endif +static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) /* AVX-512 section decoder: decodes StateCount symbols per iteration starting at startIndex until endIndex is reached, writes them to pOutData and returns the index at which it stopped. */ +{ + if constexpr (!WriteAligned64) + if ((reinterpret_cast(pOutData) & (64 - 1)) == 0) /* must test all six low address bits (64 - 1): the WriteAligned64 path stores via _mm512_stream_si512, which requires a 64 byte aligned destination. */ + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + static_assert(TotalSymbolCountBits < 16); + + typedef __m512i simd_t; + simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; + + for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) + statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); + + size_t i = startIndex; + + const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); + const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1); + const simd_t lower8 = _mm512_set1_epi32(0xFF); + const simd_t 
decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); + const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); + const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); + + for (; i < endIndex; i += StateCount) + { + // const uint32_t slot = state & (TotalSymbolCount - 1); + const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); + const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); + const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); + const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); + + // retrieve pack. + const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); + + // const uint32_t shiftedState = (state >> TotalSymbolCountBits); + const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); + const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); + const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); + const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); + + // unpack symbol. + const simd_t symbol0 = _mm512_and_si512(pack0, lower8); + const simd_t symbol1 = _mm512_and_si512(pack1, lower8); + const simd_t symbol2 = _mm512_and_si512(pack2, lower8); + const simd_t symbol3 = _mm512_and_si512(pack3, lower8); + + // pack symbols to one si512. 
+ const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); + const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); + const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. + const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. + + // unpack freq, cumul. + const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12); + const simd_t freq0 = _mm512_srli_epi32(pack0, 20); + const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12); + const simd_t freq1 = _mm512_srli_epi32(pack1, 20); + const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12); + const simd_t freq2 = _mm512_srli_epi32(pack2, 20); + const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12); + const simd_t freq3 = _mm512_srli_epi32(pack3, 20); + + // We intentionally encoded in a way to not have to do horrible things here. + if constexpr (WriteAligned64) + _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); + else + _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); + + // const uint32_t freqScaled = shiftedState * freq; + const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); + const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); + const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); + const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); + + // state = freqScaled + slot - cumul; + const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); + const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); + const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); + const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); + + // now to the messy part... 
+ { + // read input for blocks 0. + const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 + const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); + const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); + const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); + const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); + + if constexpr (ShuffleMask16) + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. + __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. + __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. + __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. + __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. 
+ const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. + const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + else + { + // get masks of those compares & start loading shuffle masks. + const uint32_t cmpMask0a = cmpMask0 & 0xFF; + const uint32_t cmpMask0b = cmpMask0 >> 8; + __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. + __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. + + const uint32_t cmpMask1a = cmpMask1 & 0xFF; + const uint32_t cmpMask1b = cmpMask1 >> 8; + __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. + __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. + + const uint32_t cmpMask2a = cmpMask2 & 0xFF; + const uint32_t cmpMask2b = cmpMask2 >> 8; + __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. + __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. + + const uint32_t cmpMask3a = cmpMask3 & 0xFF; + const uint32_t cmpMask3b = cmpMask3 >> 8; + __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. + __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. + + // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
+ const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); + pState->pReadHead += maskPop0a; + + const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); + pState->pReadHead += maskPop0b; + + const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); + pState->pReadHead += maskPop1a; + + const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); + pState->pReadHead += maskPop1b; + + const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); + pState->pReadHead += maskPop2a; + + const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); + pState->pReadHead += maskPop2b; + + const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); + pState->pReadHead += maskPop3a; + + const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); + + const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); + pState->pReadHead += maskPop3b; + + // finalize lookups. 
+ lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); + lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); + lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); + lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); + lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); + lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); + lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); + lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); + + // matching: state << 16 + const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); + const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); + const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); + const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); + + if constexpr (YmmShuffle) + { + // shuffle new words in place. + const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); + const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); + const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); + const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); + const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); + const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); + const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + else + { + // shuffle new words in place. + const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); + const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); + const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); + const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); + const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); + const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); + const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); + const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); + + // expand new word. 
+ const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); + const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); + const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); + const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); + + // state = state << 16 | newWord; + statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); + statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); + statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); + statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); + } + } + } + } + + for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) + _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); + + return i; +} + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx2_varA. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx2_varA. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx2_varC. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx2_varC. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return 
_block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx512_varA. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx512_varA. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx512_varC. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + } +}; + +template +struct rans32x64_16w_decoder> /* decoder specialization: decode_section forwards unchanged to _block_rans32x64_decode_section_avx512_varC. */ +{ + template + static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) + { + return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); + } +}; + +#endif diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index dc4b341..ae2bd55 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -2,18 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec32.h" #include #include -constexpr size_t StateCount = 32; // Needs to be a power of two. 
-constexpr bool DecodeNoBranch = false; - -////////////////////////////////////////////////////////////////////////// - -static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; -static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); - ////////////////////////////////////////////////////////////////////////// extern const uint8_t _ShuffleLutShfl32[256 * 8]; @@ -22,623 +15,6 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; ////////////////////////////////////////////////////////////////////////// -template -struct _rans_decode_state32_t -{ -#ifdef _MSC_VER - __declspec(align(32)) -#else - __attribute__((aligned(32))) -#endif - uint32_t states[StateCount]; - - hist_type hist; - const uint16_t *pReadHead; -}; - -enum rans32x32_decoder_type_t -{ - r32x32_dt_scalar, - r32x32_dt_avx2_large_cache_15_to_13, - r32x32_dt_avx2_small_cache_15_to_13, - r32x32_dt_avx2_large_cache_12_to_10, - r32x32_dt_avx2_small_cache_12_to_10, -}; - -template -struct rans32x32_16w_decoder -{ - static size_t decode_section(_rans_decode_state32_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); -}; - -template -struct rans32x32_16w_decoder> -{ - static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - - size_t i = startIndex; - - for (; i < endIndex; i += StateCount) - { - for (size_t j = 0; j < StateCount; j++) - { - const uint8_t index = _Rans32x32_idx2idx[j]; - uint32_t state = pState->states[j]; - - const uint32_t slot = state & (TotalSymbolCount - 1); - const uint8_t symbol = pState->hist.cumulInv[slot]; - pOutData[i + index] = symbol; - - state = (state >> TotalSymbolCountBits) * 
(uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; - - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *pState->pReadHead; - state = read ? newState : state; - pState->pReadHead += (size_t)read; - } - else - { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *pState->pReadHead; - pState->pReadHead++; - } - } - - pState->states[j] = state; - } - } - - return i; - } -}; - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) - return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower16 = _mm256_set1_epi32(0xFFFF); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const simd_t slot1 = 
_mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - - // const uint8_t symbol = pHist->cumulInv[slot]; - simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); - simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); - simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); - simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); - - // since they were int32_t turn into uint8_t - symbol0 = _mm256_and_si256(symbol0, lower8); - symbol1 = _mm256_and_si256(symbol1, lower8); - symbol2 = _mm256_and_si256(symbol2, lower8); - symbol3 = _mm256_and_si256(symbol3, lower8); - - // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - // retrieve pack. 
- const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); - - // freq, cumul. - const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); - const simd_t freq0 = _mm256_and_si256(pack0, lower16); - const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); - const simd_t freq1 = _mm256_and_si256(pack1, lower16); - const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); - const simd_t freq2 = _mm256_and_si256(pack2, lower16); - const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); - const simd_t freq3 = _mm256_and_si256(pack3, lower16); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); - const simd_t 
state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); - - // now to the messy part... - { - // read input for blocks 0, 1. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - else - { - // get masks of those compares & start loading shuffle masks. 
- const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // finalize lookups. 
- lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. 
- const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) - return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - - static_assert(TotalSymbolCountBits <= 12); - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const 
__m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - - // retrieve pack. - const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot3, sizeof(uint32_t)); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - - // unpack symbol. - const simd_t symbol0 = _mm256_and_si256(pack0, lower8); - const simd_t symbol1 = _mm256_and_si256(pack1, lower8); - const simd_t symbol2 = _mm256_and_si256(pack2, lower8); - const simd_t symbol3 = _mm256_and_si256(pack3, lower8); - - // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - // unpack freq, cumul. - const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); - const simd_t freq0 = _mm256_srli_epi32(pack0, 20); - const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); - const simd_t freq1 = _mm256_srli_epi32(pack1, 20); - const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); - const simd_t freq2 = _mm256_srli_epi32(pack2, 20); - const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); - const simd_t freq3 = _mm256_srli_epi32(pack3, 20); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); - const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, 
cumul3)); - - // now to the messy part... - { - // read input for blocks 0, 1. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - else - { - // get masks of those compares & start loading shuffle masks. 
- const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // finalize lookups. 
- lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. 
- const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -////////////////////////////////////////////////////////////////////////// - 
-template -static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - (void)totalSymbolCountBits; - - memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); - - return inplace_make_hist_dec(pDecHist); -} - -template -static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec2_hist(pDecHist, pIncompleteHist); - - return true; -} - -template -static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec_pack_hist(pDecHist, pIncompleteHist); - - return true; -} - -////////////////////////////////////////////////////////////////////////// - template size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index ec85a1d..3a25ed0 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -2,12 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec32.h" #include #include -constexpr size_t StateCount = 32; // Needs to be a power of two. 
-constexpr bool EncodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; constexpr size_t MinMinBlockSizeBits = 15; @@ -56,79 +55,6 @@ size_t block_rANS32x32_16w_capacity(const size_t inputSize) ////////////////////////////////////////////////////////////////////////// -static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; -static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); - -////////////////////////////////////////////////////////////////////////// - -struct _rans_encode_state32_t -{ - uint32_t states[StateCount]; - hist_t hist; - uint16_t *pEnd, *pStart; // both compressed. -}; - -enum rans32x32_encoder_type_t -{ - r32x32_et_scalar, -}; - -template -struct rans32x32_16w_encoder -{ - template - static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); -}; - -template <> -struct rans32x32_16w_encoder -{ - template - static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) - { - int64_t targetCmp = targetIndex + StateCount; - - constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); - - for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) - { - for (int64_t j = StateCount - 1; j >= 0; j--) - { - const uint8_t index = _Rans32x32_idx2idx[j]; - - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = pState->hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; - - const size_t stateIndex = j; - - uint32_t state = pState->states[stateIndex]; - - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pState->pStart = (uint16_t)(state & 0xFFFF); - *pState->pStart -= (size_t)write; - state = write ? 
state >> 16 : state; - } - else - { - if (state >= max) - { - *pState->pStart = (uint16_t)(state & 0xFFFF); - pState->pStart--; - state >>= 16; - } - } - - pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// - template static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) { diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp index c47f81a..c4e1788 100644 --- a/src/block_rANS32x64_16w_decode.cpp +++ b/src/block_rANS32x64_16w_decode.cpp @@ -2,1689 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec64.h" #include #include -constexpr size_t StateCount = 64; // Needs to be a power of two. -constexpr bool DecodeNoBranch = false; - -////////////////////////////////////////////////////////////////////////// - -static const uint8_t _Rans32x64_idx2idx[] = -{ - 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F, - 0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37, - 0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F, -}; - -static_assert(sizeof(_Rans32x64_idx2idx) == StateCount); - -////////////////////////////////////////////////////////////////////////// - -extern const uint8_t _ShuffleLutShfl32[256 * 8]; -extern const uint8_t _ShuffleLutPerm32[256 * 8]; -extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; - -////////////////////////////////////////////////////////////////////////// - -template -struct _rans_decode_state64_t -{ -#ifdef _MSC_VER - __declspec(align(64)) -#else - __attribute__((aligned(64))) 
-#endif - uint32_t states[StateCount]; - - hist_type hist; - const uint16_t *pReadHead; -}; - -enum rans32x64_decoder_type_t -{ - r32x64_dt_scalar, - r32x64_dt_avx2_large_cache_15_to_13, - r32x64_dt_avx2_small_cache_15_to_13, - r32x64_dt_avx2_large_cache_12_to_10, - r32x64_dt_avx2_small_cache_12_to_10, - r32x64_dt_avx512_large_cache_15_to_13, - r32x64_dt_avx512_small_cache_15_to_13, - r32x64_dt_avx512_large_cache_12_to_10, - r32x64_dt_avx512_small_cache_12_to_10, -}; - -template -struct rans32x64_16w_decoder -{ - static size_t decode_section(_rans_decode_state64_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); -}; - -template -struct rans32x64_16w_decoder> -{ - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - - size_t i = startIndex; - - for (; i < endIndex; i += StateCount) - { - for (size_t j = 0; j < StateCount; j++) - { - const uint8_t index = _Rans32x64_idx2idx[j]; - uint32_t state = pState->states[j]; - - const uint32_t slot = state & (TotalSymbolCount - 1); - const uint8_t symbol = pState->hist.cumulInv[slot]; - pOutData[i + index] = symbol; - - state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; - - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *pState->pReadHead; - state = read ? 
newState : state; - pState->pReadHead += (size_t)read; - } - else - { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *pState->pReadHead; - pState->pReadHead++; - } - } - - pState->states[j] = state; - } - } - - return i; - } -}; - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) - return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower16 = _mm256_set1_epi32(0xFFFF); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); - const simd_t slot5 = 
_mm256_and_si256(statesX8[5], symCountMask); - const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); - const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); - - // const uint8_t symbol = pHist->cumulInv[slot]; - simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); - simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); - simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); - simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); - simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot4, sizeof(uint8_t)); - simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot5, sizeof(uint8_t)); - simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot6, sizeof(uint8_t)); - simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot7, sizeof(uint8_t)); - - // since they were int32_t turn into uint8_t - symbol0 = _mm256_and_si256(symbol0, lower8); - symbol1 = _mm256_and_si256(symbol1, lower8); - symbol2 = _mm256_and_si256(symbol2, lower8); - symbol3 = _mm256_and_si256(symbol3, lower8); - symbol4 = _mm256_and_si256(symbol4, lower8); - symbol5 = _mm256_and_si256(symbol5, lower8); - symbol6 = _mm256_and_si256(symbol6, lower8); - symbol7 = _mm256_and_si256(symbol7, lower8); - - // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); - const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); - const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. - - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); - - // retrieve pack. 
- const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); - const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol4, sizeof(uint32_t)); - const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol5, sizeof(uint32_t)); - const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol6, sizeof(uint32_t)); - const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol7, sizeof(uint32_t)); - - // freq, cumul. - const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); - const simd_t freq0 = _mm256_and_si256(pack0, lower16); - const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); - const simd_t freq1 = _mm256_and_si256(pack1, lower16); - const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); - const simd_t freq2 = _mm256_and_si256(pack2, lower16); - const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); - const simd_t freq3 = _mm256_and_si256(pack3, lower16); - const simd_t cumul4 = _mm256_srli_epi32(pack4, 16); - const simd_t freq4 = _mm256_and_si256(pack4, lower16); - const simd_t cumul5 = _mm256_srli_epi32(pack5, 16); - const simd_t freq5 = _mm256_and_si256(pack5, lower16); - const simd_t cumul6 = _mm256_srli_epi32(pack6, 16); - const simd_t freq6 = _mm256_and_si256(pack6, lower16); - const simd_t cumul7 = _mm256_srli_epi32(pack7, 16); - const simd_t freq7 = _mm256_and_si256(pack7, lower16); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t 
shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); - const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); - const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); - const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); - const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); - const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); - const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); - const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); - const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); - const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); - const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); - const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); - - // now to the messy part... 
- { - // read input for blocks 0. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); - const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); - const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); - const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); - __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. 
- - const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); - __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. - - const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); - __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. - - const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); - __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); - pState->pReadHead += maskPop4; - - const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); - pState->pReadHead += maskPop5; - - const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); - pState->pReadHead += maskPop6; - - const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop7 = 
(uint32_t)__builtin_popcount(cmpMask7); - pState->pReadHead += maskPop7; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); - const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); - const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); - const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); - const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); - const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); - const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); - - // expand new word. 
- const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); - const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); - const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); - const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); - statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); - statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); - statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); - } - else - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. 
- - const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); - __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. - - const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); - __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. - - const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); - __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. - - const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); - __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); - pState->pReadHead += maskPop4; - - const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); - pState->pReadHead += maskPop5; - - const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); - 
pState->pReadHead += maskPop6; - - const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); - pState->pReadHead += maskPop7; - - // finalize lookups. - lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); - lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); - lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); - lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); - const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); - const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); - const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); - - // shuffle new words in place. 
- const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); - const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); - const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); - const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); - const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); - const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); - const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); - statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); - statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); - statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, 
const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (32 - 1)) == 0) - return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask); - const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask); - const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask); - const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask); - - // retrieve pack. 
- const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot3, sizeof(uint32_t)); - const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot4, sizeof(uint32_t)); - const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot5, sizeof(uint32_t)); - const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot6, sizeof(uint32_t)); - const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast(pState->hist.symbol), slot7, sizeof(uint32_t)); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits); - const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits); - const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits); - const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits); - - // unpack symbol. 
- const simd_t symbol0 = _mm256_and_si256(pack0, lower8); - const simd_t symbol1 = _mm256_and_si256(pack1, lower8); - const simd_t symbol2 = _mm256_and_si256(pack2, lower8); - const simd_t symbol3 = _mm256_and_si256(pack3, lower8); - const simd_t symbol4 = _mm256_and_si256(pack4, lower8); - const simd_t symbol5 = _mm256_and_si256(pack5, lower8); - const simd_t symbol6 = _mm256_and_si256(pack6, lower8); - const simd_t symbol7 = _mm256_and_si256(pack7, lower8); - - // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5); - const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7); - const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order. - - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); - - // unpack freq, cumul. 
- const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); - const simd_t freq0 = _mm256_srli_epi32(pack0, 20); - const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); - const simd_t freq1 = _mm256_srli_epi32(pack1, 20); - const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); - const simd_t freq2 = _mm256_srli_epi32(pack2, 20); - const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); - const simd_t freq3 = _mm256_srli_epi32(pack3, 20); - const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12); - const simd_t freq4 = _mm256_srli_epi32(pack4, 20); - const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12); - const simd_t freq5 = _mm256_srli_epi32(pack5, 20); - const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12); - const simd_t freq6 = _mm256_srli_epi32(pack6, 20); - const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12); - const simd_t freq7 = _mm256_srli_epi32(pack7, 20); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4); - const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5); - const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6); - const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, 
cumul2)); - const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); - const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4)); - const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5)); - const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6)); - const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7)); - - // now to the messy part... - { - // read input for blocks 0. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4); - const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5); - const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6); - const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. 
- - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); - __m128i lut4 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`. - - const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); - __m128i lut5 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`. - - const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); - __m128i lut6 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`. - - const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); - __m128i lut7 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); - pState->pReadHead += maskPop4; - - const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); - pState->pReadHead += maskPop5; - - const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); - pState->pReadHead += maskPop6; - - const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); - pState->pReadHead += maskPop7; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); - const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); - const 
simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); - const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); - const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); - const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); - const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); - const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); - const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); - const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); - statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); - statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); - statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); - } - else - { - // get masks of those compares & start loading shuffle masks. 
- const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. - - const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4)); - __m128i lut4 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`. - - const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5)); - __m128i lut5 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`. - - const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6)); - __m128i lut6 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`. - - const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7)); - __m128i lut7 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4); - pState->pReadHead += maskPop4; - - const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5); - pState->pReadHead += maskPop5; - - const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6); - pState->pReadHead += maskPop6; - - const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7); - pState->pReadHead += maskPop7; - - // finalize lookups. 
- lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit); - lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit); - lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit); - lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16)); - const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16)); - const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16)); - const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16)); - - // shuffle new words in place. 
- const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4); - const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5); - const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6); - const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4); - const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5); - const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6); - const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4); - statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5); - statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6); - statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -#ifndef _MSC_VER -#ifdef __llvm__ -__attribute__((target("avx512bw"))) -#else -__attribute__((target("avx512f", "avx512bw"))) -#endif -#endif -static size_t 
_block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned64) - if ((reinterpret_cast(pOutData) & (64 - 1)) == 0) - return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m512i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); - const simd_t lower16 = _mm512_set1_epi32(0xFFFF); - const simd_t lower8 = _mm512_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); - const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); - const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); - const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); - const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); - - // retrieve pack. 
- simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); - simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); - simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); - simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.cumulInv), sizeof(uint8_t)); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); - - symbol0 = _mm512_and_si512(symbol0, lower8); - symbol1 = _mm512_and_si512(symbol1, lower8); - symbol2 = _mm512_and_si512(symbol2, lower8); - symbol3 = _mm512_and_si512(symbol3, lower8); - - // retrieve pack. - const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); - const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); - const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); - const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast(pState->hist.symbols), sizeof(uint32_t)); - - // pack symbols to one si512. - const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. - const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. - - // freq, cumul. 
- const simd_t cumul0 = _mm512_srli_epi32(pack0, 16); - const simd_t freq0 = _mm512_and_si512(pack0, lower16); - const simd_t cumul1 = _mm512_srli_epi32(pack1, 16); - const simd_t freq1 = _mm512_and_si512(pack1, lower16); - const simd_t cumul2 = _mm512_srli_epi32(pack2, 16); - const simd_t freq2 = _mm512_and_si512(pack2, lower16); - const simd_t cumul3 = _mm512_srli_epi32(pack3, 16); - const simd_t freq3 = _mm512_and_si512(pack3, lower16); - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned64) - _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); - else - _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); - - // const uint32_t freqScaled = shiftedState * freq; - const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); - const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); - const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); - const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); - const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); - - // now to the messy part... - { - // read input for blocks 0. - const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? 
-1 : 0 - const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); - const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); - const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); - const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0a = cmpMask0 & 0xFF; - const uint32_t cmpMask0b = cmpMask0 >> 8; - __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. - __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. - - const uint32_t cmpMask1a = cmpMask1 & 0xFF; - const uint32_t cmpMask1b = cmpMask1 >> 8; - __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. - __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. - - const uint32_t cmpMask2a = cmpMask2 & 0xFF; - const uint32_t cmpMask2b = cmpMask2 >> 8; - __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. - __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. - - const uint32_t cmpMask3a = cmpMask3 & 0xFF; - const uint32_t cmpMask3b = cmpMask3 >> 8; - __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. - __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); - pState->pReadHead += maskPop0a; - - const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); - pState->pReadHead += maskPop0b; - - const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); - pState->pReadHead += maskPop1a; - - const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); - pState->pReadHead += maskPop1b; - - const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); - pState->pReadHead += maskPop2a; - - const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); - pState->pReadHead += maskPop2b; - - const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); - pState->pReadHead += maskPop3a; - - const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); - pState->pReadHead += maskPop3b; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); - const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); - const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); - const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); - - if constexpr (YmmShuffle) - { - // shuffle new words in place. 
- const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); - const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); - const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); - const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); - - // expand new word. - const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); - const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); - const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); - const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - else - { - // shuffle new words in place. - const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); - const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); - const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); - const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); - const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); - const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); - const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); - const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); - const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); - const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); - const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - } - else - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0a = cmpMask0 & 0xFF; - const uint32_t cmpMask0b = cmpMask0 >> 8; - __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. - __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. - - const uint32_t cmpMask1a = cmpMask1 & 0xFF; - const uint32_t cmpMask1b = cmpMask1 >> 8; - __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. - __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. - - const uint32_t cmpMask2a = cmpMask2 & 0xFF; - const uint32_t cmpMask2b = cmpMask2 >> 8; - __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. - __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. - - const uint32_t cmpMask3a = cmpMask3 & 0xFF; - const uint32_t cmpMask3b = cmpMask3 >> 8; - __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. - __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); - pState->pReadHead += maskPop0a; - - const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); - pState->pReadHead += maskPop0b; - - const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); - pState->pReadHead += maskPop1a; - - const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); - pState->pReadHead += maskPop1b; - - const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); - pState->pReadHead += maskPop2a; - - const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); - pState->pReadHead += maskPop2b; - - const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); - pState->pReadHead += maskPop3a; - - const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); - pState->pReadHead += maskPop3b; - - // finalize lookups. 
- lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); - lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); - lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); - lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); - lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); - lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); - lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); - lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); - const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); - const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); - const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); - - if constexpr (YmmShuffle) - { - // shuffle new words in place. - const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); - const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); - const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); - const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); - const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); - const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); - const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - else - { - // shuffle new words in place. - const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); - const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); - const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); - const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); - const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); - const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); - const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); - const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); - const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); - const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); - const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -#ifndef _MSC_VER -#ifdef __llvm__ -__attribute__((target("avx512bw"))) -#else -__attribute__((target("avx512f", "avx512bw"))) -#endif -#endif -static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned64) - if ((reinterpret_cast(pOutData) & (63 - 1)) == 0) - return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m512i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm512_loadu_si512(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); - const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1); - const simd_t lower8 = _mm512_set1_epi32(0xFF); - const simd_t 
decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); - const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask); - const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask); - const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask); - const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask); - - // retrieve pack. - const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); - const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); - const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); - const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast(pState->hist.symbol), sizeof(uint32_t)); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits); - - // unpack symbol. - const simd_t symbol0 = _mm512_and_si512(pack0, lower8); - const simd_t symbol1 = _mm512_and_si512(pack1, lower8); - const simd_t symbol2 = _mm512_and_si512(pack2, lower8); - const simd_t symbol3 = _mm512_and_si512(pack3, lower8); - - // pack symbols to one si512. 
- const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now. - const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly. - - // unpack freq, cumul. - const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12); - const simd_t freq0 = _mm512_srli_epi32(pack0, 20); - const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12); - const simd_t freq1 = _mm512_srli_epi32(pack1, 20); - const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12); - const simd_t freq2 = _mm512_srli_epi32(pack2, 20); - const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12); - const simd_t freq3 = _mm512_srli_epi32(pack3, 20); - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned64) - _mm512_stream_si512(reinterpret_cast(pOutData + i), symPackCompat); - else - _mm512_storeu_si512(reinterpret_cast(pOutData + i), symPackCompat); - - // const uint32_t freqScaled = shiftedState * freq; - const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0); - const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1); - const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2); - const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2)); - const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3)); - - // now to the messy part... 
- { - // read input for blocks 0. - const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0); - const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1); - const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2); - const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0a = cmpMask0 & 0xFF; - const uint32_t cmpMask0b = cmpMask0 >> 8; - __m128i lut0a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`. - __m128i lut0b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`. - - const uint32_t cmpMask1a = cmpMask1 & 0xFF; - const uint32_t cmpMask1b = cmpMask1 >> 8; - __m128i lut1a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`. - __m128i lut1b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`. - - const uint32_t cmpMask2a = cmpMask2 & 0xFF; - const uint32_t cmpMask2b = cmpMask2 >> 8; - __m128i lut2a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`. - __m128i lut2b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`. - - const uint32_t cmpMask3a = cmpMask3 & 0xFF; - const uint32_t cmpMask3b = cmpMask3 >> 8; - __m128i lut3a = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`. - __m128i lut3b = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); - pState->pReadHead += maskPop0a; - - const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); - pState->pReadHead += maskPop0b; - - const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); - pState->pReadHead += maskPop1a; - - const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); - pState->pReadHead += maskPop1b; - - const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); - pState->pReadHead += maskPop2a; - - const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); - pState->pReadHead += maskPop2b; - - const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); - pState->pReadHead += maskPop3a; - - const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); - pState->pReadHead += maskPop3b; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); - const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); - const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); - const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); - - if constexpr (YmmShuffle) - { - // shuffle new words in place. 
- const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); - const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); - const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); - const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); - - // expand new word. - const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); - const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); - const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); - const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - else - { - // shuffle new words in place. - const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); - const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); - const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); - const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); - const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); - const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); - const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); - const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); - const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); - const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); - const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - } - else - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0a = cmpMask0 & 0xFF; - const uint32_t cmpMask0b = cmpMask0 >> 8; - __m128i lut0a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`. - __m128i lut0b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`. - - const uint32_t cmpMask1a = cmpMask1 & 0xFF; - const uint32_t cmpMask1b = cmpMask1 >> 8; - __m128i lut1a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`. - __m128i lut1b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`. - - const uint32_t cmpMask2a = cmpMask2 & 0xFF; - const uint32_t cmpMask2b = cmpMask2 >> 8; - __m128i lut2a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`. - __m128i lut2b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`. - - const uint32_t cmpMask3a = cmpMask3 & 0xFF; - const uint32_t cmpMask3b = cmpMask3 >> 8; - __m128i lut3a = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`. - __m128i lut3b = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. 
- const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a); - pState->pReadHead += maskPop0a; - - const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b); - pState->pReadHead += maskPop0b; - - const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a); - pState->pReadHead += maskPop1a; - - const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b); - pState->pReadHead += maskPop1b; - - const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a); - pState->pReadHead += maskPop2a; - - const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b); - pState->pReadHead += maskPop2b; - - const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a); - pState->pReadHead += maskPop3a; - - const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b); - pState->pReadHead += maskPop3b; - - // finalize lookups. 
- lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit); - lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit); - lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit); - lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit); - lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit); - lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit); - lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit); - lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16); - const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16); - const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16); - const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16); - - if constexpr (YmmShuffle) - { - // shuffle new words in place. - const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a)); - const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a)); - const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a)); - const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a)); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0); - const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1); - const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2); - const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - else - { - // shuffle new words in place. - const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a); - const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b); - const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a); - const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b); - const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a); - const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b); - const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a); - const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b); - - // expand new word. 
- const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a)); - const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a)); - const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a)); - const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a)); - - // state = state << 16 | newWord; - statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0); - statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1); - statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2); - statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3); - } - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm512_storeu_si512(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return 
_block_rans32x64_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx512_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x64_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _block_rans32x64_decode_section_avx512_varC(pState, pOutData, startIndex, endIndex); - } -}; - -////////////////////////////////////////////////////////////////////////// - -template -static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - (void)totalSymbolCountBits; - - memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); - - return inplace_make_hist_dec(pDecHist); -} - -template -static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec2_hist(pDecHist, pIncompleteHist); - - return true; -} - -template -static bool _init_from_hist(hist_dec_pack_t 
*pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec_pack_hist(pDecHist, pIncompleteHist); - - return true; -} - ////////////////////////////////////////////////////////////////////////// template @@ -1777,20 +99,10 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *decodeState.pReadHead; - state = read ? newState : state; - decodeState.pReadHead += (size_t)read; - } - else + if (state < DecodeConsumePoint16) { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *decodeState.pReadHead; - decodeState.pReadHead++; - } + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; } decodeState.states[j] = state; diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp index 7b77a81..e17469e 100644 --- a/src/block_rANS32x64_16w_encode.cpp +++ b/src/block_rANS32x64_16w_encode.cpp @@ -2,12 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec64.h" #include #include -constexpr size_t StateCount = 64; // Needs to be a power of two. 
-constexpr bool EncodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; constexpr size_t MinMinBlockSizeBits = 15; @@ -56,86 +55,6 @@ size_t block_rANS32x64_16w_capacity(const size_t inputSize) ////////////////////////////////////////////////////////////////////////// -static const uint8_t _Rans32x64_idx2idx[] = -{ - 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F, - 0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37, - 0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F, -}; - -static_assert(sizeof(_Rans32x64_idx2idx) == StateCount); - -////////////////////////////////////////////////////////////////////////// - -struct _rans_encode_state64_t -{ - uint32_t states[StateCount]; - hist_t hist; - uint16_t *pEnd, *pStart; // both compressed. -}; - -enum rans32x64_encoder_type_t -{ - r32x64_et_scalar, -}; - -template -struct rans32x64_16w_encoder -{ - template - static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); -}; - -template <> -struct rans32x64_16w_encoder -{ - template - static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) - { - int64_t targetCmp = targetIndex + StateCount; - - constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); - - for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) - { - for (int64_t j = StateCount - 1; j >= 0; j--) - { - const uint8_t index = _Rans32x64_idx2idx[j]; - - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = pState->hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; - - const size_t stateIndex = j; - - uint32_t state = 
pState->states[stateIndex]; - - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pState->pStart = (uint16_t)(state & 0xFFFF); - *pState->pStart -= (size_t)write; - state = write ? state >> 16 : state; - } - else - { - if (state >= max) - { - *pState->pStart = (uint16_t)(state & 0xFFFF); - pState->pStart--; - state >>= 16; - } - } - - pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// - template static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) { diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp index 07d4741..503a9e6 100644 --- a/src/mt_rANS32x32_16w_decode.cpp +++ b/src/mt_rANS32x32_16w_decode.cpp @@ -2,641 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec32.h" #include #include -constexpr size_t StateCount = 32; // Needs to be a power of two. 
-constexpr bool DecodeNoBranch = false; - -////////////////////////////////////////////////////////////////////////// - -static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; -static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); - -////////////////////////////////////////////////////////////////////////// - -extern const uint8_t _ShuffleLutShfl32[256 * 8]; -extern const uint8_t _ShuffleLutPerm32[256 * 8]; -extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2]; - -////////////////////////////////////////////////////////////////////////// - -template -struct _rans_decode_state32mt_t -{ -#ifdef _MSC_VER - __declspec(align(32)) -#else - __attribute__((aligned(32))) -#endif - uint32_t states[StateCount]; - - hist_type hist; - const uint16_t *pReadHead; -}; - -enum rans32x32_decoder_type_t -{ - r32x32_dt_scalar, - r32x32_dt_avx2_large_cache_15_to_13, - r32x32_dt_avx2_small_cache_15_to_13, - r32x32_dt_avx2_large_cache_12_to_10, - r32x32_dt_avx2_small_cache_12_to_10, -}; - -template -struct rans32x32_16w_decoder -{ - static size_t decode_section(_rans_decode_state32mt_t *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); -}; - -template -struct rans32x32_16w_decoder> -{ - static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - - size_t i = startIndex; - - for (; i < endIndex; i += StateCount) - { - for (size_t j = 0; j < StateCount; j++) - { - const uint8_t index = _Rans32x32_idx2idx[j]; - uint32_t state = pState->states[j]; - - const uint32_t slot = state & (TotalSymbolCount - 1); - const uint8_t symbol = pState->hist.cumulInv[slot]; - pOutData[i + index] = symbol; - - state = (state >> 
TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; - - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *pState->pReadHead; - state = read ? newState : state; - pState->pReadHead += (size_t)read; - } - else - { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *pState->pReadHead; - pState->pReadHead++; - } - } - - pState->states[j] = state; - } - } - - return i; - } -}; - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _mt_rans32x32_decode_section_avx2_varA(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) - return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - static_assert(TotalSymbolCountBits < 16); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower16 = _mm256_set1_epi32(0xFFFF); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const 
simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - - // const uint8_t symbol = pHist->cumulInv[slot]; - simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot0, sizeof(uint8_t)); - simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot1, sizeof(uint8_t)); - simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot2, sizeof(uint8_t)); - simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.cumulInv), slot3, sizeof(uint8_t)); - - // since they were int32_t turn into uint8_t - symbol0 = _mm256_and_si256(symbol0, lower8); - symbol1 = _mm256_and_si256(symbol1, lower8); - symbol2 = _mm256_and_si256(symbol2, lower8); - symbol3 = _mm256_and_si256(symbol3, lower8); - - // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - // retrieve pack. 
- const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbols), symbol3, sizeof(uint32_t)); - - // freq, cumul. - const simd_t cumul0 = _mm256_srli_epi32(pack0, 16); - const simd_t freq0 = _mm256_and_si256(pack0, lower16); - const simd_t cumul1 = _mm256_srli_epi32(pack1, 16); - const simd_t freq1 = _mm256_and_si256(pack1, lower16); - const simd_t cumul2 = _mm256_srli_epi32(pack2, 16); - const simd_t freq2 = _mm256_and_si256(pack2, lower16); - const simd_t cumul3 = _mm256_srli_epi32(pack3, 16); - const simd_t freq3 = _mm256_and_si256(pack3, lower16); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); - const simd_t 
state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3)); - - // now to the messy part... - { - // read input for blocks 0, 1. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - else - { - // get masks of those compares & start loading shuffle masks. 
- const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // finalize lookups. 
- lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. 
- const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -#ifndef _MSC_VER -__attribute__((target("avx2"))) -#endif -static size_t _mt_rans32x32_decode_section_avx2_varC(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) -{ - if constexpr (!WriteAligned32) - if ((reinterpret_cast(pOutData) & (StateCount - 1)) == 0) - return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - - static_assert(TotalSymbolCountBits <= 12); - constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); - - typedef __m256i simd_t; - simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; - - for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) - statesX8[i] = _mm256_load_si256(reinterpret_cast(reinterpret_cast(pState->states) + i * sizeof(simd_t))); - - size_t i = startIndex; - - const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); - const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1); - const simd_t lower8 = _mm256_set1_epi32(0xFF); - const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); - const simd_t _16 = _mm256_set1_epi32(16); - const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i 
shuffleUpper16Bit = _mm_set1_epi16(0x0100); - - for (; i < endIndex; i += StateCount) - { - // const uint32_t slot = state & (TotalSymbolCount - 1); - const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask); - const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask); - const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask); - const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask); - - // retrieve pack. - const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot0, sizeof(uint32_t)); - const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot1, sizeof(uint32_t)); - const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot2, sizeof(uint32_t)); - const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast(&pState->hist.symbol), slot3, sizeof(uint32_t)); - - // const uint32_t shiftedState = (state >> TotalSymbolCountBits); - const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits); - const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits); - const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits); - const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits); - - // unpack symbol. - const simd_t symbol0 = _mm256_and_si256(pack0, lower8); - const simd_t symbol1 = _mm256_and_si256(pack1, lower8); - const simd_t symbol2 = _mm256_and_si256(pack2, lower8); - const simd_t symbol3 = _mm256_and_si256(pack3, lower8); - - // pack symbols to one si256. 
(could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower) - const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1); - const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3); - const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F` - - // We intentionally encoded in a way to not have to do horrible things here. - if constexpr (WriteAligned32) - _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - else - _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); - - // unpack freq, cumul. - const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12); - const simd_t freq0 = _mm256_srli_epi32(pack0, 20); - const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12); - const simd_t freq1 = _mm256_srli_epi32(pack1, 20); - const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12); - const simd_t freq2 = _mm256_srli_epi32(pack2, 20); - const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12); - const simd_t freq3 = _mm256_srli_epi32(pack3, 20); - - // const uint32_t freqScaled = shiftedState * freq; - const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0); - const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1); - const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2); - const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3); - - // state = freqScaled + slot - cumul; - const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0)); - const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1)); - const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2)); - const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, 
cumul3)); - - // now to the messy part... - { - // read input for blocks 0, 1. - const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0 - const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); - const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1); - const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2); - const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3); - - if constexpr (ShuffleMask16) - { - // get masks of those compares & start loading shuffle masks. - const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_load_si128(reinterpret_cast(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`. - - // advance read head & read input for blocks 1, 2, 3. 
- const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. - const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - else - { - // get masks of those compares & start loading shuffle masks. 
- const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0)); - __m128i lut0 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`. - - const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1)); - __m128i lut1 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`. - - const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2)); - __m128i lut2 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`. - - const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3)); - __m128i lut3 = _mm_lddqu_si128(reinterpret_cast(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`. - - // advance read head & read input for blocks 1, 2, 3. - const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0); - pState->pReadHead += maskPop0; - - const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1); - pState->pReadHead += maskPop1; - - const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2); - pState->pReadHead += maskPop2; - - const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast(pState->pReadHead)); - - const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3); - pState->pReadHead += maskPop3; - - // finalize lookups. 
- lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit); - lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit); - lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit); - lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit); - - // matching: state << 16 - const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16)); - const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16)); - const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16)); - const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16)); - - // shuffle new words in place. - const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0); - const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1); - const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2); - const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3); - - // expand new word. 
- const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0); - const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1); - const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2); - const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3); - - // state = state << 16 | newWord; - statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0); - statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1); - statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2); - statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3); - } - } - } - - for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) - _mm256_store_si256(reinterpret_cast(reinterpret_cast(pState->states) + j * sizeof(simd_t)), statesX8[j]); - - return i; -} - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _mt_rans32x32_decode_section_avx2_varA(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -template -struct rans32x32_16w_decoder> -{ - template - static size_t decode_section(_rans_decode_state32mt_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) - { - return _mt_rans32x32_decode_section_avx2_varC(pState, pOutData, startIndex, endIndex); - } -}; - -////////////////////////////////////////////////////////////////////////// - -template 
-static bool _init_from_hist(hist_dec_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - (void)totalSymbolCountBits; - - memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount)); - - return inplace_make_hist_dec(pDecHist); -} - -template -static bool _init_from_hist(hist_dec2_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec2_hist(pDecHist, pIncompleteHist); - - return true; -} - -template -static bool _init_from_hist(hist_dec_pack_t *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) -{ - if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits)) - return false; - - make_dec_pack_hist(pDecHist, pIncompleteHist); - - return true; -} - ////////////////////////////////////////////////////////////////////////// template @@ -661,7 +31,7 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui if (inLength < expectedInputLength) return 0; - _rans_decode_state32mt_t decodeState; + _rans_decode_state32_t decodeState; decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; @@ -736,20 +106,10 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *decodeState.pReadHead; - state = read ? 
newState : state; - decodeState.pReadHead += (size_t)read; - } - else + if (state < DecodeConsumePoint16) { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *decodeState.pReadHead; - decodeState.pReadHead++; - } + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; } decodeState.states[j] = state; @@ -784,7 +144,7 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, if (inLength < expectedInputLength) return 0; - _rans_decode_state32mt_t decodeState; + _rans_decode_state32_t decodeState; decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; @@ -864,20 +224,10 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; - if constexpr (DecodeNoBranch) - { - const bool read = state < DecodeConsumePoint16; - const uint32_t newState = state << 16 | *decodeState.pReadHead; - state = read ? newState : state; - decodeState.pReadHead += (size_t)read; - } - else + if (state < DecodeConsumePoint16) { - if (state < DecodeConsumePoint16) - { - state = state << 16 | *decodeState.pReadHead; - decodeState.pReadHead++; - } + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; } decodeState.states[j] = state; diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp index 140a013..632fce0 100644 --- a/src/mt_rANS32x32_16w_encode.cpp +++ b/src/mt_rANS32x32_16w_encode.cpp @@ -2,12 +2,11 @@ #include "hist.h" #include "simd_platform.h" +#include "block_codec32.h" #include #include -constexpr size_t StateCount = 32; // Needs to be a power of two. 
-constexpr bool EncodeNoBranch = false; constexpr size_t SafeHistBitMax = 0; constexpr size_t MinMinBlockSizeBits = 15; @@ -59,79 +58,6 @@ size_t mt_rANS32x32_16w_capacity(const size_t inputSize) ////////////////////////////////////////////////////////////////////////// -static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; -static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); - -////////////////////////////////////////////////////////////////////////// - -struct _rans_encode_state32mt_t -{ - uint32_t states[StateCount]; - hist_t hist; - uint16_t *pEnd, *pStart; // both compressed. -}; - -enum rans32x32_encoder_type_t -{ - r32x32_et_scalar, -}; - -template -struct rans32x32_16w_encoder -{ - template - static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); -}; - -template <> -struct rans32x32_16w_encoder -{ - template - static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) - { - int64_t targetCmp = targetIndex + StateCount; - - constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); - - for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) - { - for (int64_t j = StateCount - 1; j >= 0; j--) - { - const uint8_t index = _Rans32x32_idx2idx[j]; - - const uint8_t in = pInData[i - StateCount + index]; - const uint32_t symbolCount = pState->hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; - - const size_t stateIndex = j; - - uint32_t state = pState->states[stateIndex]; - - if constexpr (EncodeNoBranch) - { - const bool write = state >= max; - *pState->pStart = (uint16_t)(state & 0xFFFF); - *pState->pStart -= (size_t)write; - state = write ? 
state >> 16 : state; - } - else - { - if (state >= max) - { - *pState->pStart = (uint16_t)(state & 0xFFFF); - pState->pStart--; - state >>= 16; - } - } - - pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// - template static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) { @@ -223,7 +149,7 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; constexpr size_t MinBlockSizeX = MinBlockSize(); - _rans_encode_state32mt_t encodeState; + _rans_encode_state32_t encodeState; encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); encodeState.pStart = encodeState.pEnd; From 0da610c9d1ebdecc7ff0e62be40758bc196e119a Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 04:19:08 +0200 Subject: [PATCH 20/34] Adding 64 state variant --- src/main.cpp | 45 +++-- src/mt_rANS32x64_16w.h | 30 +++ src/mt_rANS32x64_16w_decode.cpp | 337 ++++++++++++++++++++++++++++++++ src/mt_rANS32x64_16w_encode.cpp | 316 ++++++++++++++++++++++++++++++ 4 files changed, 710 insertions(+), 18 deletions(-) create mode 100644 src/mt_rANS32x64_16w.h create mode 100644 src/mt_rANS32x64_16w_decode.cpp create mode 100644 src/mt_rANS32x64_16w_encode.cpp diff --git a/src/main.cpp b/src/main.cpp index 9cf1652..123d47d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,6 +13,7 @@ #include "block_rANS32x32_16w.h" #include "block_rANS32x64_16w.h" #include "mt_rANS32x32_16w.h" +#include "mt_rANS32x64_16w.h" #ifdef _WIN32 #include @@ -164,26 +165,33 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe static codec_info_t _Codecs[] = { - // { "rANS32x32 16w (variable block 
size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - // { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - // { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - // { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - // { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - // { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, - // { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, - 
// { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, - // { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}}, - // { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}}, - // { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}}, - // { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}}, + { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, + { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, + { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}}, + { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}}, + { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}}, + { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 
14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", 
mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x32 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + + { "rANS32x64 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, + { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, 
true }, {}}}, { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, @@ -422,6 +430,7 @@ int32_t main(const int32_t argc, char **pArgv) compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize)); compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x32_16w_capacity(fileSize)); + compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x64_16w_capacity(fileSize)); pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity); diff --git a/src/mt_rANS32x64_16w.h b/src/mt_rANS32x64_16w.h new file mode 100644 index 0000000..e588cf3 --- /dev/null 
+++ b/src/mt_rANS32x64_16w.h @@ -0,0 +1,30 @@ +#ifndef mt_rANS32x64_16w_h__ +#define mt_rANS32x64_16w_h__ + +#include "hist.h" +#include "thread_pool.h" + +size_t mt_rANS32x64_16w_capacity(const size_t inputSize); + +size_t mt_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); + +size_t mt_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); +size_t mt_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); + +size_t mt_rANS32x64_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x64_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t 
*pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x64_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x64_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x64_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); +size_t mt_rANS32x64_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); + +#endif // mt_rANS32x64_16w_h__ diff --git a/src/mt_rANS32x64_16w_decode.cpp b/src/mt_rANS32x64_16w_decode.cpp new file mode 100644 index 0000000..fb207fa --- /dev/null +++ b/src/mt_rANS32x64_16w_decode.cpp @@ -0,0 +1,337 @@ +#include "mt_rANS32x64_16w.h" + +#include "hist.h" +#include "simd_platform.h" +#include "block_codec64.h" + +#include +#include + +////////////////////////////////////////////////////////////////////////// + +template +size_t mt_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t inputIndex = 0; + const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (expectedOutputLength > outCapacity) + return 0; + + const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (inLength < expectedInputLength) + return 0; + + _rans_decode_state64_t decodeState; + + decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); + const size_t 
outLengthInStates = expectedOutputLength - StateCount + 1; + size_t i = 0; + hist_t hist; + + do + { + const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + (void)readHeadBackOffset; // unused in single-threaded version. + + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } + + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } + + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; + + uint64_t blockEndInStates = (i + blockSize); + + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + if (i > outLengthInStates) + { + if (i >= expectedOutputLength) + return expectedOutputLength; + else + break; + } + + decodeState.pReadHead = pReadHeadAfter; + + } while (i < outLengthInStates); + + if (i < expectedOutputLength) + { + hist_dec_t histDec; + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + + if (!inplace_make_hist_dec(&histDec)) + return 0; + + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + + if (i + index < expectedOutputLength) + { + uint32_t state = decodeState.states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = histDec.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * 
(uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; + + if (state < DecodeConsumePoint16) + { + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; + } + + decodeState.states[j] = state; + } + } + } + + return expectedOutputLength; +} + +////////////////////////////////////////////////////////////////////////// + +template +size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); + + size_t inputIndex = 0; + const uint64_t expectedOutputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (expectedOutputLength > outCapacity) + return 0; + + const uint64_t expectedInputLength = *reinterpret_cast(pInData + inputIndex); + inputIndex += sizeof(uint64_t); + + if (inLength < expectedInputLength) + return 0; + + _rans_decode_state64_t decodeState; + + decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); + const size_t outLengthInStates = expectedOutputLength - StateCount + 1; + size_t i = 0; + hist_t hist; + + do + { + const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } + + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + 
decodeState.pReadHead++; + } + + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; + + uint64_t blockEndInStates = (i + blockSize); + + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + if (i + blockSize > blockEndInStates) + { + i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + break; + } + else + { + thread_pool_add(pThreadPool, [=]() { + auto decState = decodeState; + rans32x64_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); + }); + + i = blockEndInStates; + decodeState.pReadHead = pReadHeadAfter; + } + + } while (i < outLengthInStates); + + if (i < expectedOutputLength) + { + hist_dec_t histDec; + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + + if (!inplace_make_hist_dec(&histDec)) + return 0; + + for (size_t j = 0; j < StateCount; j++) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + + if (i + index < expectedOutputLength) + { + uint32_t state = decodeState.states[j]; + + const uint32_t slot = state & (TotalSymbolCount - 1); + const uint8_t symbol = histDec.cumulInv[slot]; + pOutData[i + index] = symbol; + + state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; + + if (state < DecodeConsumePoint16) + { + state = state << 16 | *decodeState.pReadHead; + decodeState.pReadHead++; + } + + decodeState.states[j] = state; + } + } + } + + thread_pool_await(pThreadPool); + + return expectedOutputLength; +} + +////////////////////////////////////////////////////////////////////////// + +template +static size_t mt_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool = nullptr) +{ + _DetectCPUFeatures(); + + if (avx2Supported) + { + if constexpr (TotalSymbolCountBits >= 13) + { + if 
(pThreadPool) + return mt_rANS32x64_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); + } + else + { + if (pThreadPool) + return mt_rANS32x64_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); + } + } + + // Fallback. + if (pThreadPool) + return mt_rANS32x64_16w_decode_mt>(pInData, inLength, pOutData, outCapacity, pThreadPool); + else + return mt_rANS32x64_16w_decode>(pInData, inLength, pOutData, outCapacity); +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity); +} + +size_t mt_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) +{ + return mt_rANS32x64_decode_wrapper<10>(pInData, inLength, 
pOutData, outCapacity); +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x64_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x64_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x64_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x64_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x64_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} + +size_t mt_rANS32x64_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool) +{ + return mt_rANS32x64_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity, pThreadPool); +} diff --git a/src/mt_rANS32x64_16w_encode.cpp b/src/mt_rANS32x64_16w_encode.cpp new file mode 100644 index 0000000..cb659f2 --- /dev/null +++ b/src/mt_rANS32x64_16w_encode.cpp @@ -0,0 +1,316 @@ +#include "mt_rANS32x64_16w.h" + +#include "hist.h" +#include "simd_platform.h" +#include "block_codec64.h" + +#include +#include + 
+constexpr size_t SafeHistBitMax = 0; + +constexpr size_t MinMinBlockSizeBits = 15; +constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits; + +template +struct HistReplaceMul +{ + constexpr static size_t GetValue(); +}; + +template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 50; } }; +template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 500; } }; +template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 500; } }; + +template +struct MinBlockSizeBits +{ + constexpr static size_t GetValue(); +}; + +template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 16; } }; +template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 16; } }; + +template +constexpr size_t MinBlockSize() +{ + return (size_t)1 << MinBlockSizeBits::GetValue(); +} + +constexpr size_t MaxBlockSizeBits = 25; +constexpr size_t MaxBlockSize = (size_t)1 << MaxBlockSizeBits; + +size_t mt_rANS32x64_16w_capacity(const size_t inputSize) +{ + const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); + const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1; + const size_t perBlockExtraSize = sizeof(uint64_t) * 2 + 256 * 
sizeof(uint16_t) + StateCount * sizeof(uint32_t); + + return baseSize + blockCount * perBlockExtraSize; // I hope this covers all of our bases. +} + +////////////////////////////////////////////////////////////////////////// + +template +static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) +{ + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + + memset(symCount, 0, sizeof(uint32_t) * 256); + observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize); + + // Do we include a symbol that hasn't been included before? + if constexpr (!IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0) + return false; + } + + hist_t newHist; + + if constexpr (TotalSymbolCountBits == MinBlockSize()) + { + for (size_t j = 0; j < 256; j++) + newHist.symbolCount[j] = (uint16_t)symCount[j]; + + size_t counter = 0; + + for (size_t j = 0; j < 256; j++) + { + newHist.cumul[j] = (uint16_t)counter; + counter += newHist.symbolCount[j]; + } + } + else + { + normalize_hist(&newHist, symCount, MinBlockSize(), TotalSymbolCountBits); + } + + constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits); + constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul::GetValue()) >> 12; + + // this comparison isn't fair or fast, but should be a good starting point hopefully. + float costBefore = 0; + float costAfter = (float)(sizeof(uint16_t) * 256 + StateCount * sizeof(uint32_t) + sizeof(uint64_t) * 2) * 0.5f; // let's assume that block will be able to share its histogram with someone else. 
+ + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + else + { + for (size_t j = 0; j < 256; j++) + { + if (symCount[j] == 0) + continue; + + const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount); + const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount); + + costBefore -= before; + costAfter -= after; + } + } + + const float diff = costBefore - costAfter; + + return (diff < histReplacePoint); +} + +////////////////////////////////////////////////////////////////////////// + +template +size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) +{ + if (outCapacity < mt_rANS32x64_16w_capacity(length)) + return 0; + + static_assert(TotalSymbolCountBits < 16); + constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16); + + constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax; + constexpr size_t MinBlockSizeX = MinBlockSize(); + + _rans_encode_state64_t encodeState; + encodeState.pEnd = reinterpret_cast(pOutData + outCapacity - sizeof(uint16_t)); + encodeState.pStart = encodeState.pEnd; + + size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1)); + + if (inputBlockTargetIndex > MinBlockSizeX) + inputBlockTargetIndex -= MinBlockSizeX; + + uint16_t *pBlockEnd = encodeState.pEnd; + size_t blockBackPoint = length; + + uint32_t symCount[256]; + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + + size_t extraCount = 0; + + if constexpr (IsSafeHist) + { + for (size_t j = 0; j < 256; j++) + { + 
if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } + } + } + + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = length; + + // Init States. + for (size_t i = 0; i < StateCount; i++) + encodeState.states[i] = DecodeConsumePoint16; + + int64_t inputIndex = length - 1; + inputIndex &= ~(size_t)(StateCount - 1); + inputIndex += StateCount; + + for (int64_t j = StateCount - 1; j >= 0; j--) + { + const uint8_t index = _Rans32x64_idx2idx[j]; + + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; + + const size_t stateIndex = j; + + uint32_t state = encodeState.states[stateIndex]; + + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } + + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } + } + + inputIndex -= StateCount; + + while (true) + { + rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + inputIndex = 
inputBlockTargetIndex; + + // Write hist & states. + { + const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; + + encodeState.pStart++; + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; + memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + + const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + + pBlockEnd = encodeState.pStart; + encodeState.pStart--; + } + + if (inputIndex == 0) + break; + + // Determine new histogram. + { + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = inputIndex; + } + } + + uint8_t *pWrite = pOutData; + size_t outIndex = 0; + + *reinterpret_cast(pWrite + outIndex) = (uint64_t)length; + outIndex += sizeof(uint64_t); + + // compressed expected length. + outIndex += sizeof(uint64_t); + + const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t); + + memmove(pWrite + outIndex, encodeState.pStart + 1, size); + outIndex += size; + + *reinterpret_cast(pOutData + sizeof(uint64_t)) = outIndex; // write total output length. + + return outIndex; +} + +////////////////////////////////////////////////////////////////////////// + +size_t mt_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<15, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<14, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<13, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<12, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } +size_t mt_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<11, r32x64_et_scalar>(pInData, length, pOutData, 
outCapacity); } +size_t mt_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<10, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } From 2c96afcd87c5e819731610396058a7cf36e4921f Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 18:10:49 +0200 Subject: [PATCH 21/34] fixing weird single symbol hist issue for block_64 --- src/block_codec64.h | 1 + src/block_rANS32x64_16w_decode.cpp | 40 ++++--- src/block_rANS32x64_16w_encode.cpp | 182 ++++++++++++++++++++--------- src/main.cpp | 12 +- 4 files changed, 161 insertions(+), 74 deletions(-) diff --git a/src/block_codec64.h b/src/block_codec64.h index 45f05b5..ac3c9bc 100644 --- a/src/block_codec64.h +++ b/src/block_codec64.h @@ -4,6 +4,7 @@ #include "hist.h" #include +#include constexpr size_t StateCount = 64; // Needs to be a power of two. diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp index c4e1788..a909f9e 100644 --- a/src/block_rANS32x64_16w_decode.cpp +++ b/src/block_rANS32x64_16w_decode.cpp @@ -42,30 +42,42 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - for (size_t j = 0; j < 256; j++) + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); + + memset(pOutData + i, symbol, blockSize); + + i += blockSize; } + else + { + 
for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - uint64_t blockEndInStates = (i + blockSize); + uint64_t blockEndInStates = i + blockSizeVal; - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; - i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + } if (i > outLengthInStates) { @@ -80,7 +92,7 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. if (!inplace_make_hist_dec(&histDec)) return 0; diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp index e17469e..4464a90 100644 --- a/src/block_rANS32x64_16w_encode.cpp +++ b/src/block_rANS32x64_16w_encode.cpp @@ -156,37 +156,63 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u inputBlockTargetIndex -= MinBlockSizeX; size_t blockBackPoint = length; + size_t numSymbols = 0; + uint8_t selectedSymbol = 0; uint32_t symCount[256]; observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - size_t extraCount = 0; + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? 
(uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) + if (numSymbols == 1) { - for (size_t j = 0; j < 256; j++) + int64_t idx = inputBlockTargetIndex - 1; + + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; + + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. + inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else + { + size_t extraCount = 0; + + if constexpr (IsSafeHist) { - if (symCount[j] == 0) + for (size_t j = 0; j < 256; j++) { - symCount[j] = 1; - extraCount++; + if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } } } - } - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0) - { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); } - // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); blockBackPoint = length; // Init States. @@ -197,28 +223,31 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u inputIndex &= ~(size_t)(StateCount - 1); inputIndex += StateCount; - for (int64_t j = StateCount - 1; j >= 0; j--) + if (numSymbols != 1) { - const uint8_t index = _Rans32x64_idx2idx[j]; - - if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + for (int64_t j = StateCount - 1; j >= 0; j--) { - const uint8_t in = pInData[inputIndex - StateCount + index]; - const uint32_t symbolCount = encodeState.hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; + const uint8_t index = _Rans32x64_idx2idx[j]; - const size_t stateIndex = j; + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; - uint32_t state = encodeState.states[stateIndex]; + const size_t stateIndex = j; - if (state >= max) - { - *encodeState.pStart = (uint16_t)(state & 0xFFFF); - encodeState.pStart--; - state >>= 16; - } + uint32_t state = encodeState.states[stateIndex]; + + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } - encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } } } @@ -226,19 +255,32 @@ size_t block_rANS32x64_16w_encode(const 
uint8_t *pInData, const size_t length, u while (true) { - rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + if (numSymbols != 1) + rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; - // Write hist. + // Write block info. { const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; encodeState.pStart++; - encodeState.pStart -= 256; - memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + if (numSymbols != 1) // write hist. + { + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + } + else + { + const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator)); + } encodeState.pStart--; } @@ -248,28 +290,60 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u // Determine new histogram. 
{ - inputBlockTargetIndex -= MinBlockSizeX; + inputBlockTargetIndex -= 1; + inputBlockTargetIndex &= ~(MinBlockSizeX - 1); + + const size_t initialSize = inputIndex - inputBlockTargetIndex; + + if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3) + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex); + + numSymbols = 0; - observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) - for (size_t j = 0; j < 256; j++) - if (symCount[j] == 0) - symCount[j] = 1; + if (numSymbols == 1) + { + int64_t idx = inputBlockTargetIndex - 1; - normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; - while (inputBlockTargetIndex > 0) + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. + inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); blockBackPoint = inputIndex; } } diff --git a/src/main.cpp b/src/main.cpp index 123d47d..09b0c55 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -165,12 +165,12 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe static codec_info_t _Codecs[] = { - { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, 
true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, From 2337364713a0dbbf2e43ebc7dbdf629dc9cf1486 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 19:06:20 +0200 Subject: [PATCH 22/34] Hopefully applying those changes correctly to block_32 --- src/block_codec64.h | 1 - src/block_rANS32x32_16w_decode.cpp | 40 ++++--- src/block_rANS32x32_16w_encode.cpp | 185 ++++++++++++++++++++--------- src/main.cpp | 12 +- 4 files changed, 161 insertions(+), 77 deletions(-) diff --git a/src/block_codec64.h b/src/block_codec64.h index ac3c9bc..45f05b5 100644 --- a/src/block_codec64.h +++ b/src/block_codec64.h @@ -4,7 +4,6 @@ #include "hist.h" #include -#include constexpr size_t StateCount = 64; // Needs to be a power of two. 
diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp index ae2bd55..48444c9 100644 --- a/src/block_rANS32x32_16w_decode.cpp +++ b/src/block_rANS32x32_16w_decode.cpp @@ -48,30 +48,42 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - for (size_t j = 0; j < 256; j++) + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); + + memset(pOutData + i, symbol, blockSize); + + i += blockSize; } + else + { + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - uint64_t blockEndInStates = (i + blockSize); + uint64_t blockEndInStates = (i + blockSizeVal); - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; - i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + } if (i > outLengthInStates) { @@ -86,7 +98,7 @@ size_t 
block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. if (!inplace_make_hist_dec(&histDec)) return 0; diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp index 3a25ed0..891b235 100644 --- a/src/block_rANS32x32_16w_encode.cpp +++ b/src/block_rANS32x32_16w_encode.cpp @@ -156,38 +156,63 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u inputBlockTargetIndex -= MinBlockSizeX; size_t blockBackPoint = length; + size_t numSymbols = 0; + uint8_t selectedSymbol = 0; uint32_t symCount[256]; observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - size_t extraCount = 0; + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } + + if (numSymbols == 1) + { + int64_t idx = inputBlockTargetIndex - 1; - if constexpr (IsSafeHist) + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; + + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. 
+ inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else { - for (size_t j = 0; j < 256; j++) + size_t extraCount = 0; + + if constexpr (IsSafeHist) { - if (symCount[j] == 0) + for (size_t j = 0; j < 256; j++) { - symCount[j] = 1; - extraCount++; + if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } } } - } - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0) - { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; - } + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - blockBackPoint = length; + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = length; + } // Init States. 
for (size_t i = 0; i < StateCount; i++) @@ -197,28 +222,31 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u inputIndex &= ~(size_t)(StateCount - 1); inputIndex += StateCount; - for (int64_t j = StateCount - 1; j >= 0; j--) + if (numSymbols != 1) { - const uint8_t index = _Rans32x32_idx2idx[j]; - - if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + for (int64_t j = StateCount - 1; j >= 0; j--) { - const uint8_t in = pInData[inputIndex - StateCount + index]; - const uint32_t symbolCount = encodeState.hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; + const uint8_t index = _Rans32x32_idx2idx[j]; - const size_t stateIndex = j; + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; - uint32_t state = encodeState.states[stateIndex]; + const size_t stateIndex = j; - if (state >= max) - { - *encodeState.pStart = (uint16_t)(state & 0xFFFF); - encodeState.pStart--; - state >>= 16; - } + uint32_t state = encodeState.states[stateIndex]; - encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } + + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } } } @@ -226,19 +254,32 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u while (true) { - rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + if (numSymbols != 1) + rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, 
inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; - // Write hist. + // Write block info. { const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; encodeState.pStart++; - encodeState.pStart -= 256; - memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + if (numSymbols != 1) // write hist. + { + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + } + else + { + const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54); + + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator)); + } encodeState.pStart--; } @@ -248,28 +289,60 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u // Determine new histogram. { - inputBlockTargetIndex -= MinBlockSizeX; + inputBlockTargetIndex -= 1; + inputBlockTargetIndex &= ~(MinBlockSizeX - 1); + + const size_t initialSize = inputIndex - inputBlockTargetIndex; + + if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3) + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex); + + numSymbols = 0; - observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? 
(uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) - for (size_t j = 0; j < 256; j++) - if (symCount[j] == 0) - symCount[j] = 1; + if (numSymbols == 1) + { + int64_t idx = inputBlockTargetIndex - 1; - normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; - while (inputBlockTargetIndex > 0) + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. + inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); } - // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); blockBackPoint = inputIndex; } } diff --git a/src/main.cpp b/src/main.cpp index 09b0c55..123d47d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -165,12 +165,12 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe static codec_info_t _Codecs[] = { - //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, - //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, + { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, + { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, + { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}}, + { "rANS32x32 16w (variable 
block size)", 12, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}}, + { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}}, + { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}}, { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}}, { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}}, From 347604fa07689211cbba4f2c155c0ff437236e8d Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Fri, 7 Jul 2023 21:35:52 +0200 Subject: [PATCH 23/34] Adapting MT codecs to also feature those changes --- src/block_codec32.h | 2 +- src/block_codec64.h | 2 +- src/mt_rANS32x32_16w_decode.cpp | 158 +++++++++++++++----------- src/mt_rANS32x32_16w_encode.cpp | 195 ++++++++++++++++++++++---------- src/mt_rANS32x64_16w_decode.cpp | 158 +++++++++++++++----------- src/mt_rANS32x64_16w_encode.cpp | 195 ++++++++++++++++++++++---------- 6 files changed, 452 insertions(+), 258 deletions(-) diff --git a/src/block_codec32.h b/src/block_codec32.h index 106da1c..1366b5e 100644 --- a/src/block_codec32.h +++ b/src/block_codec32.h @@ -710,4 +710,4 @@ struct rans32x32_16w_decoder(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += sizeof(uint64_t) / 
sizeof(uint16_t); - const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; - (void)readHeadBackOffset; // unused in single-threaded version. - - for (size_t j = 0; j < StateCount; j++) + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist { - decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); - } + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); - for (size_t j = 0; j < 256; j++) - { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; + memset(pOutData + i, symbol, blockSize); + + i += blockSize; } + else + { + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + (void)readHeadBackOffset; // unused in single-threaded version. 
- if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } - uint64_t blockEndInStates = (i + blockSize); + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + uint64_t blockEndInStates = (i + blockSizeVal); - if (i > outLengthInStates) - { - if (i >= expectedOutputLength) - return expectedOutputLength; - else - break; - } + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; - decodeState.pReadHead = pReadHeadAfter; + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + if (i > outLengthInStates) + { + if (i >= expectedOutputLength) + return expectedOutputLength; + else + break; + } + + decodeState.pReadHead = pReadHeadAfter; + } } while (i < outLengthInStates); if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. 
if (!inplace_make_hist_dec(&histDec)) return 0; @@ -149,55 +161,67 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist + { + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); - const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + memset(pOutData + i, symbol, blockSize); // let's hope this isn't the last block, because otherwise we'd delay starting tasks for other stuff, but otherwise `memset` is probably faster than starting a task. 
- for (size_t j = 0; j < StateCount; j++) - { - decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + i += blockSize; } - - for (size_t j = 0; j < 256; j++) + else { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; - } + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; - uint64_t blockEndInStates = (i + blockSize); + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (i + blockSize > blockEndInStates) - { - i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - break; - } - else - { - thread_pool_add(pThreadPool, [=]() { - auto decState = decodeState; - rans32x32_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); - }); + uint64_t blockEndInStates = (i + blockSizeVal); - i = blockEndInStates; - decodeState.pReadHead = pReadHeadAfter; + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + if (i + blockSizeVal > blockEndInStates) + { + i = rans32x32_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + break; + } + else + { + thread_pool_add(pThreadPool, 
[=]() { + auto decState = decodeState; + rans32x32_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); + }); + + i = blockEndInStates; + decodeState.pReadHead = pReadHeadAfter; + } } } while (i < outLengthInStates); @@ -205,7 +229,7 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. if (!inplace_make_hist_dec(&histDec)) return 0; diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp index 632fce0..f37deaa 100644 --- a/src/mt_rANS32x32_16w_encode.cpp +++ b/src/mt_rANS32x32_16w_encode.cpp @@ -160,38 +160,63 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint uint16_t *pBlockEnd = encodeState.pEnd; size_t blockBackPoint = length; + size_t numSymbols = 0; + uint8_t selectedSymbol = 0; uint32_t symCount[256]; observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - size_t extraCount = 0; + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) + if (numSymbols == 1) { - for (size_t j = 0; j < 256; j++) + int64_t idx = inputBlockTargetIndex - 1; + + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; + + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. 
+ inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else + { + size_t extraCount = 0; + + if constexpr (IsSafeHist) { - if (symCount[j] == 0) + for (size_t j = 0; j < 256; j++) { - symCount[j] = 1; - extraCount++; + if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } } } - } - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) - { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; - } + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - blockBackPoint = length; + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = length; + } // Init States. 
for (size_t i = 0; i < StateCount; i++) @@ -201,28 +226,31 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint inputIndex &= ~(size_t)(StateCount - 1); inputIndex += StateCount; - for (int64_t j = StateCount - 1; j >= 0; j--) + if (numSymbols != 1) { - const uint8_t index = _Rans32x32_idx2idx[j]; - - if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + for (int64_t j = StateCount - 1; j >= 0; j--) { - const uint8_t in = pInData[inputIndex - StateCount + index]; - const uint32_t symbolCount = encodeState.hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; + const uint8_t index = _Rans32x32_idx2idx[j]; - const size_t stateIndex = j; + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; - uint32_t state = encodeState.states[stateIndex]; + const size_t stateIndex = j; - if (state >= max) - { - *encodeState.pStart = (uint16_t)(state & 0xFFFF); - encodeState.pStart--; - state >>= 16; - } + uint32_t state = encodeState.states[stateIndex]; - encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } + + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } } } @@ -230,27 +258,40 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint while (true) { - rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + if (numSymbols != 1) + rans32x32_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, 
inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; - // Write hist & states. + // Write block info. { const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; encodeState.pStart++; - encodeState.pStart -= 256; - memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); - encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; - memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + if (numSymbols != 1) // write hist & states. + { + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; + memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + + const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); - const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + } + else + { + const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator)); + } pBlockEnd = encodeState.pStart; encodeState.pStart--; @@ -261,28 +302,60 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint // Determine new histogram. 
{ - inputBlockTargetIndex -= MinBlockSizeX; + inputBlockTargetIndex -= 1; + inputBlockTargetIndex &= ~(MinBlockSizeX - 1); + + const size_t initialSize = inputIndex - inputBlockTargetIndex; + + if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3) + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex); + + numSymbols = 0; - observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) - for (size_t j = 0; j < 256; j++) - if (symCount[j] == 0) - symCount[j] = 1; + if (numSymbols == 1) + { + int64_t idx = inputBlockTargetIndex - 1; - normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; - while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. + inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); blockBackPoint = inputIndex; } } diff --git a/src/mt_rANS32x64_16w_decode.cpp b/src/mt_rANS32x64_16w_decode.cpp index fb207fa..766366d 100644 --- a/src/mt_rANS32x64_16w_decode.cpp +++ b/src/mt_rANS32x64_16w_decode.cpp @@ -36,58 +36,70 @@ size_t mt_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, ui decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; - (void)readHeadBackOffset; // unused in single-threaded version. 
- - for (size_t j = 0; j < StateCount; j++) + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist { - decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); - } + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); - for (size_t j = 0; j < 256; j++) - { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; + memset(pOutData + i, symbol, blockSize); + + i += blockSize; } + else + { + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + (void)readHeadBackOffset; // unused in single-threaded version. - if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } - uint64_t blockEndInStates = (i + blockSize); + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + uint64_t blockEndInStates = (i + blockSizeVal); - if (i > outLengthInStates) - { - if (i >= expectedOutputLength) - return expectedOutputLength; - else - break; - } + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; - decodeState.pReadHead = pReadHeadAfter; + i = 
rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + if (i > outLengthInStates) + { + if (i >= expectedOutputLength) + return expectedOutputLength; + else + break; + } + + decodeState.pReadHead = pReadHeadAfter; + } } while (i < outLengthInStates); if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. if (!inplace_make_hist_dec(&histDec)) return 0; @@ -149,55 +161,67 @@ size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength, decodeState.pReadHead = reinterpret_cast(pInData + inputIndex); const size_t outLengthInStates = expectedOutputLength - StateCount + 1; size_t i = 0; - hist_t hist; + hist_t hist = {}; do { - const uint64_t blockSize = *reinterpret_cast(decodeState.pReadHead); + const uint64_t blockSizeVal = *reinterpret_cast(decodeState.pReadHead); decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); + if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist + { + const uint8_t symbol = (blockSizeVal >> 54) & 0xFF; + const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1); - const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; + memset(pOutData + i, symbol, blockSize); // let's hope this isn't the last block, because otherwise we'd delay starting tasks for other stuff, but otherwise `memset` is probably faster than starting a task. 
- for (size_t j = 0; j < StateCount; j++) - { - decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); - decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + i += blockSize; } - - for (size_t j = 0; j < 256; j++) + else { - hist.symbolCount[j] = *decodeState.pReadHead; - decodeState.pReadHead++; - } + const uint64_t readHeadBackOffset = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t); - if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) - return 0; + const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; - uint64_t blockEndInStates = (i + blockSize); + for (size_t j = 0; j < StateCount; j++) + { + decodeState.states[j] = *reinterpret_cast(decodeState.pReadHead); + decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t)); + } - if (blockEndInStates > outLengthInStates) - blockEndInStates = outLengthInStates; - else if ((blockEndInStates & (StateCount - 1)) != 0) - return 0; + for (size_t j = 0; j < 256; j++) + { + hist.symbolCount[j] = *decodeState.pReadHead; + decodeState.pReadHead++; + } - if (i + blockSize > blockEndInStates) - { - i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) + return 0; - break; - } - else - { - thread_pool_add(pThreadPool, [=]() { - auto decState = decodeState; - rans32x64_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); - }); + uint64_t blockEndInStates = (i + blockSizeVal); - i = blockEndInStates; - decodeState.pReadHead = pReadHeadAfter; + if (blockEndInStates > outLengthInStates) + blockEndInStates = outLengthInStates; + else if ((blockEndInStates & (StateCount - 1)) != 0) + return 0; + + if (i + blockSizeVal > blockEndInStates) + { + i = rans32x64_16w_decoder::decode_section(&decodeState, pOutData, i, blockEndInStates); + + break; + } + else + { + thread_pool_add(pThreadPool, 
[=]() { + auto decState = decodeState; + rans32x64_16w_decoder::decode_section(&decState, pOutData, i, blockEndInStates); + }); + + i = blockEndInStates; + decodeState.pReadHead = pReadHeadAfter; + } } } while (i < outLengthInStates); @@ -205,7 +229,7 @@ size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength, if (i < expectedOutputLength) { hist_dec_t histDec; - memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); + memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end. if (!inplace_make_hist_dec(&histDec)) return 0; diff --git a/src/mt_rANS32x64_16w_encode.cpp b/src/mt_rANS32x64_16w_encode.cpp index cb659f2..3ca01cc 100644 --- a/src/mt_rANS32x64_16w_encode.cpp +++ b/src/mt_rANS32x64_16w_encode.cpp @@ -160,38 +160,63 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint uint16_t *pBlockEnd = encodeState.pEnd; size_t blockBackPoint = length; + size_t numSymbols = 0; + uint8_t selectedSymbol = 0; uint32_t symCount[256]; observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - size_t extraCount = 0; + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) + if (numSymbols == 1) { - for (size_t j = 0; j < 256; j++) + int64_t idx = inputBlockTargetIndex - 1; + + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; + + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. 
+ inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else + { + size_t extraCount = 0; + + if constexpr (IsSafeHist) { - if (symCount[j] == 0) + for (size_t j = 0; j < 256; j++) { - symCount[j] = 1; - extraCount++; + if (symCount[j] == 0) + { + symCount[j] = 1; + extraCount++; + } } } - } - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits); - while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) - { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; - } + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); - blockBackPoint = length; + // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); + blockBackPoint = length; + } // Init States. 
for (size_t i = 0; i < StateCount; i++) @@ -201,28 +226,31 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint inputIndex &= ~(size_t)(StateCount - 1); inputIndex += StateCount; - for (int64_t j = StateCount - 1; j >= 0; j--) + if (numSymbols != 1) { - const uint8_t index = _Rans32x64_idx2idx[j]; - - if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + for (int64_t j = StateCount - 1; j >= 0; j--) { - const uint8_t in = pInData[inputIndex - StateCount + index]; - const uint32_t symbolCount = encodeState.hist.symbolCount[in]; - const uint32_t max = EncodeEmitPoint * symbolCount; + const uint8_t index = _Rans32x64_idx2idx[j]; - const size_t stateIndex = j; + if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length) + { + const uint8_t in = pInData[inputIndex - StateCount + index]; + const uint32_t symbolCount = encodeState.hist.symbolCount[in]; + const uint32_t max = EncodeEmitPoint * symbolCount; - uint32_t state = encodeState.states[stateIndex]; + const size_t stateIndex = j; - if (state >= max) - { - *encodeState.pStart = (uint16_t)(state & 0xFFFF); - encodeState.pStart--; - state >>= 16; - } + uint32_t state = encodeState.states[stateIndex]; - encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + if (state >= max) + { + *encodeState.pStart = (uint16_t)(state & 0xFFFF); + encodeState.pStart--; + state >>= 16; + } + + encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount); + } } } @@ -230,27 +258,40 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint while (true) { - rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, inputBlockTargetIndex); + if (numSymbols != 1) + rans32x64_16w_encoder::template encode_section(&encodeState, pInData, inputIndex, 
inputBlockTargetIndex); + inputIndex = inputBlockTargetIndex; - // Write hist & states. + // Write block info. { const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex; encodeState.pStart++; - encodeState.pStart -= 256; - memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); - encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; - memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + if (numSymbols != 1) // write hist & states. + { + encodeState.pStart -= 256; + memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount)); + + encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount; + memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount); + + const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); - const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset)); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + } + else + { + const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54); - encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); - memcpy(encodeState.pStart, &blockSize, sizeof(blockSize)); + encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t); + memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator)); + } pBlockEnd = encodeState.pStart; encodeState.pStart--; @@ -261,28 +302,60 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint // Determine new histogram. 
{ - inputBlockTargetIndex -= MinBlockSizeX; + inputBlockTargetIndex -= 1; + inputBlockTargetIndex &= ~(MinBlockSizeX - 1); + + const size_t initialSize = inputIndex - inputBlockTargetIndex; + + if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3) + inputBlockTargetIndex -= MinBlockSizeX; + + observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex); + + numSymbols = 0; - observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX); + for (size_t j = 0; j < 256; j++) + { + numSymbols += (size_t)!!symCount[j]; + selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol; + } - if constexpr (IsSafeHist) - for (size_t j = 0; j < 256; j++) - if (symCount[j] == 0) - symCount[j] = 1; + if (numSymbols == 1) + { + int64_t idx = inputBlockTargetIndex - 1; - normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + for (; idx >= 0; idx--) + if (pInData[idx] != selectedSymbol) + break; - while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + inputBlockTargetIndex = (size_t)(idx + 1); + + // Align with `StateCount`. + inputBlockTargetIndex += StateCount - 1; + inputBlockTargetIndex &= ~(StateCount - 1); + } + else { - if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) - inputBlockTargetIndex -= MinBlockSizeX; - else - break; + if constexpr (IsSafeHist) + for (size_t j = 0; j < 256; j++) + if (symCount[j] == 0) + symCount[j] = 1; + + normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits); + + while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize) + { + if (_CanExtendHist(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount)) + inputBlockTargetIndex -= MinBlockSizeX; + else + break; + } + + // Performance of this could be improved by keeping the current counts around. 
(or simply using the original hist, if that was only good for one block) + observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); + normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); } - // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block) - observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex); - normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits); blockBackPoint = inputIndex; } } From 49951641dd533d3aabba492940f64325a424b9a8 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 00:54:47 +0200 Subject: [PATCH 24/34] Adding more command line options --- src/main.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 123d47d..c51e684 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -53,6 +53,11 @@ static size_t _HistMax = 15; static size_t _HistMin = 10; static bool _Include32Block = false; static bool _IncludeRaw = false; +static bool _IncludeMT = false; +static bool _ExcludeBlock = false; +static bool _Exclude32x16 = false; +static bool _Exclude32x32 = false; +static bool _Exclude32x64 = false; static size_t _RunCount = 8; static size_t _EncodeRunCount = 2; static size_t _DecodeRunCount = 16; @@ -236,6 +241,11 @@ const char ArgumentHistMin[] = "--hist-min"; const char ArgumentHistMax[] = "--hist-max"; const char ArgumentInclude32Blk[] = "--include-32blk"; const char ArgumentIncludeRaw[] = "--include-raw"; +const char ArgumentIncludeMT[] = "--include-mt"; +const char ArgumentExcludeBlock[] = "--exclude-block"; +const char ArgumentExclude16[] = "--exclude-16"; +const char ArgumentExclude32[] = "--exclude-32"; +const char ArgumentExclude64[] = "--exclude-64"; 
const char ArgumentNoSleep[] = "--no-sleep"; const char ArgumentCpuCore[] = "--cpu-core"; const char ArgumentRuns[] = "--runs"; @@ -249,14 +259,18 @@ int32_t main(const int32_t argc, char **pArgv) if (argc == 1) { puts("Invalid Parameter.\n\nUsage: hsrans "); - printf("\t%s \tRun all variants of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentRuns); + printf("\t%s \t\t\tRun all variants of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants); printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMin); printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMax); - printf("\t%s \tRun all implementations of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants); - printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore); - printf("\t%s \tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw); + printf("\t%s \t\tRun the (single-threaded) benchmark on a specific core\n", ArgumentCpuCore); + printf("\t%s \t\tInclude multi-threading optimized variants\n", ArgumentIncludeMT); + printf("\t%s \t\tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw); printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw); - printf("\t%s \tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode); + printf("\t%s \tExclude the main (variable block size) variants form the benchmark\n", ArgumentExcludeBlock); + printf("\t%s \t\tExclude 16 state variants from the benchmark (only RAW)\n", ArgumentExclude16); + printf("\t%s \t\tExclude 32 state variants from the benchmark\n", ArgumentExclude32); + printf("\t%s \t\tExclude 64 state variants from the 
benchmark\n", ArgumentExclude64); + printf("\t%s \t\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode); printf("\t%s \tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode); printf("\t%s \tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode); printf("\t%s \tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep); @@ -279,6 +293,12 @@ int32_t main(const int32_t argc, char **pArgv) argsRemaining--; _OnlyRelevantCodecs = false; } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeMT, sizeof(ArgumentIncludeMT)) == 0) + { + argIndex++; + argsRemaining--; + _IncludeMT = true; + } else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeRaw, sizeof(ArgumentIncludeRaw)) == 0) { argIndex++; @@ -291,6 +311,30 @@ int32_t main(const int32_t argc, char **pArgv) argsRemaining--; _Include32Block = true; } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExcludeBlock, sizeof(ArgumentExcludeBlock)) == 0) + { + argIndex++; + argsRemaining--; + _ExcludeBlock = true; + } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude16, sizeof(ArgumentExclude16)) == 0) + { + argIndex++; + argsRemaining--; + _Exclude32x16 = true; + } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude32, sizeof(ArgumentExclude32)) == 0) + { + argIndex++; + argsRemaining--; + _Exclude32x32 = true; + } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude64, sizeof(ArgumentExclude64)) == 0) + { + argIndex++; + argsRemaining--; + _Exclude32x64 = true; + } else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentNoSleep, sizeof(ArgumentNoSleep)) == 0) { argIndex++; @@ -518,8 +562,13 @@ int32_t main(const int32_t argc, char **pArgv) make_hist(&hist, 
pUncompressedData, fileSize, _Codecs[codecId].totalSymbolCountBits); bool skipCodec = false; + skipCodec |= (!_IncludeMT && strstr(_Codecs[codecId].name, " (independent blocks)") != nullptr); skipCodec |= (!_IncludeRaw && strstr(_Codecs[codecId].name, " (raw)") != nullptr); skipCodec |= (!_Include32Block && strstr(_Codecs[codecId].name, " 32blk ") != nullptr); + skipCodec |= (_ExcludeBlock && strstr(_Codecs[codecId].name, " (variable block size)") != nullptr); + skipCodec |= (_Exclude32x16 && strstr(_Codecs[codecId].name, "32x16") != nullptr); + skipCodec |= (_Exclude32x32 && strstr(_Codecs[codecId].name, "32x32") != nullptr); + skipCodec |= (_Exclude32x64 && strstr(_Codecs[codecId].name, "32x64") != nullptr); skipCodec |= _Codecs[codecId].totalSymbolCountBits > _HistMax; skipCodec |= _Codecs[codecId].totalSymbolCountBits < _HistMin; From acabb9fd90cca0d83d4984d38336e809621a1aa0 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 02:51:00 +0200 Subject: [PATCH 25/34] Updating x-ray benchmark --- README.md | 311 +++++++++++++++++++++++------------------------- docs/index.html | 94 +++++++++++++++ 2 files changed, 240 insertions(+), 165 deletions(-) diff --git a/README.md b/README.md index 3da0208..b1ed15b 100644 --- a/README.md +++ b/README.md @@ -29,181 +29,162 @@ ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes) | Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 10** | 65.59 % | 12.83 clk/byte | 341.55 MiB/s | 1.43 clk/byte | 2989.66 MiB/s | -| **rANS32x64 16w 11** | 64.33 % | 12.34 clk/byte | 347.24 MiB/s | 1.44 clk/byte | 2973.71 MiB/s | -| **rANS32x64 16w 12** | 63.81 % | 12.51 clk/byte | 342.31 MiB/s | 1.44 clk/byte | 2967.92 MiB/s | -| TurboANX 63 | 63.4 % | - | 981.79 MiB/s | - | 2964.02 MiB/s | -| TurboANX 48 | 63.3 % | - | 969.72 MiB/s | - | 2917.59 MiB/s | -| TurboANX 40 | 63.2 % | - | 964.45 MiB/s | - | 2883.45 MiB/s | -| TurboANX 32 | 66.4 % | - | 951.53 MiB/s | - | 2856.26 MiB/s | -| **rANS32x32 16w 11** | 64.33 % | 12.86 clk/byte | 333.03 MiB/s | 1.50 clk/byte | 2856.20 MiB/s | -| **rANS32x32 16w 10** | 65.59 % | 12.80 clk/byte | 334.68 MiB/s | 1.51 clk/byte | 2845.56 MiB/s | -| TurboANX 24 | 63.0 % | - | 936.12 MiB/s | - | 2765.31 MiB/s | -| TurboANX 16 | 62.8 % | - | 902.32 MiB/s | - | 2631.85 MiB/s | -| **rANS32x32 16w 12** | 63.81 % | 12.83 clk/byte | 343.55 MiB/s | 1.54 clk/byte | 2784.13 MiB/s | -| fsehuf | 63.4 % | - | 1581.32 MiB/s | - | 2515.23 MiB/s | -| htscodecs_rans32avx2 0 | 63.5 % | - | 1041.93 MiB/s | - | 2374.04 MiB/s | -| TurboANX 8 | 62.7 % | - | 823.76 MiB/s | - | 2347.10 MiB/s | -| **rANS32x32 32blk 16w 12** | 63.81 % | 12.62 clk/byte | 339.50 MiB/s | 1.85 clk/byte | 2312.10 MiB/s | -| **rANS32x32 32blk 16w 11** | 64.33 % | 12.67 clk/byte | 338.00 MiB/s | 1.86 clk/byte | 2299.31 MiB/s | -| **rANS32x32 32blk 16w 10** | 65.59 % | 12.91 clk/byte | 331.80 MiB/s | 1.87 clk/byte | 2289.10 MiB/s | -| htscodecs_rans32avx512 0 | 63.5 % | - | 796.70 MiB/s | - | 2221.93 MiB/s | -| **rANS32x32 32blk 8w 11** | 64.33 % | 15.01 clk/byte | 285.45 MiB/s | 2.15 clk/byte | 1988.10 MiB/s | -| **rANS32x32 32blk 8w 12** | 63.82 % | 15.15 clk/byte | 282.80 MiB/s | 2.16 clk/byte | 1984.68 MiB/s | -| **rANS32x32 32blk 8w 10** | 65.60 % | 14.70 clk/byte | 291.41 MiB/s | 2.17 clk/byte | 1977.26 MiB/s | -| htscodecs_rans32sse 0 | 63.5 % | - | 732.08 
MiB/s | - | 1948.66 MiB/s | -| TurboANX 4 | 63.0 % | - | 706.92 MiB/s | - | 1929.18 MiB/s | -| **rANS32x64 16w 13** | 63.61 % | 12.32 clk/byte | 348.13 MiB/s | 2.29 clk/byte | 1872.44 MiB/s | -| **rANS32x64 16w 14** | 63.55 % | 12.36 clk/byte | 346.57 MiB/s | 2.28 clk/byte | 1876.95 MiB/s | -| **rANS32x64 16w 15** | 63.57 % | 12.30 clk/byte | 350.49 MiB/s | 2.34 clk/byte | 1828.28 MiB/s | -| **rANS32x32 16w 13** | 63.61 % | 12.55 clk/byte | 341.20 MiB/s | 2.38 clk/byte | 1800.28 MiB/s | -| **rANS32x32 16w 14** | 63.55 % | 12.54 clk/byte | 341.70 MiB/s | 2.39 clk/byte | 1795.66 MiB/s | -| **rANS32x16 16w 10** | 65.59 % | 13.26 clk/byte | 323.07 MiB/s | 2.54 clk/byte | 1684.80 MiB/s | -| **rANS32x16 16w 12** | 63.81 % | 13.21 clk/byte | 324.24 MiB/s | 2.55 clk/byte | 1681.73 MiB/s | -| **rANS32x16 16w 11** | 64.33 % | 13.25 clk/byte | 323.17 MiB/s | 2.55 clk/byte | 1676.41 MiB/s | -| **rANS32x32 16w 15** | 63.57 % | 12.94 clk/byte | 342.60 MiB/s | 2.56 clk/byte | 1675.11 MiB/s | -| **rANS32x32 32blk 16w 14** | 63.55 % | 13.02 clk/byte | 329.08 MiB/s | 2.66 clk/byte | 1607.26 MiB/s | -| **rANS32x32 32blk 16w 13** | 63.61 % | 12.56 clk/byte | 341.16 MiB/s | 2.71 clk/byte | 1582.28 MiB/s | -| **rANS32x32 32blk 16w 15** | 63.57 % | 13.21 clk/byte | 324.33 MiB/s | 2.76 clk/byte | 1550.93 MiB/s | -| **rANS32x32 32blk 8w 13** | 63.60 % | 15.07 clk/byte | 284.24 MiB/s | 2.98 clk/byte | 1438.01 MiB/s | -| **rANS32x32 32blk 8w 14** | 63.53 % | 15.06 clk/byte | 284.45 MiB/s | 3.00 clk/byte | 1429.24 MiB/s | -| TurboANX 2 | 64.0 % | - | 656.86 MiB/s | - | 1416.33 MiB/s | -| **rANS32x32 32blk 8w 15** | 63.51 % | 15.11 clk/byte | 283.41 MiB/s | 3.10 clk/byte | 1381.63 MiB/s | -| **rANS32x16 16w 13** | 63.61 % | 13.14 clk/byte | 325.92 MiB/s | 3.60 clk/byte | 1190.23 MiB/s | -| **rANS32x16 16w 14** | 63.55 % | 13.37 clk/byte | 320.41 MiB/s | 3.64 clk/byte | 1175.92 MiB/s | -| **rANS32x16 16w 15** | 63.57 % | 13.28 clk/byte | 322.51 MiB/s | 4.21 clk/byte | 1017.12 MiB/s | -| fse | 
63.2 % | - | 736.10 MiB/s | - | 966.58 MiB/s | -| TurboANX 1 | 66.4 % | - | 522.13 MiB/s | - | 942.43 MiB/s | -| htscodecs_rans32avx512 1 | 51.6 % | - | 168.22 MiB/s | - | 322.22 MiB/s | -| htscodecs_rans32avx2 1 | 51.6 % | - | 177.36 MiB/s | - | 319.15 MiB/s | -| FastHF | 63.6 % | - | 189.84 MiB/s | - | 151.62 MiB/s | -| FastAC | 63.2 % | - | 223.06 MiB/s | - | 84.37 MiB/s | -| htscodecs_arith_dyn 1 | 47.8 % | - | 89.60 MiB/s | - | 81.63 MiB/s | -| htscodecs_arith_dyn 0 | 62.0 % | - | 88.09 MiB/s | - | 75.05 MiB/s | - -The following benchmarks demonstrate, apart from incredibly high decompression speeds, how terrible the histogram generation currently is: +| **rANS32x64 16w 10** | 65.59 % | 12.83 clk/byte | 341.55 MiB/s | 1.43 clk/byte | 2,989.66 MiB/s | +| **rANS32x64 16w 11** | 64.33 % | 12.34 clk/byte | 347.24 MiB/s | 1.44 clk/byte | 2,973.71 MiB/s | +| **rANS32x64 16w 12** | 63.81 % | 12.51 clk/byte | 342.31 MiB/s | 1.44 clk/byte | 2,967.92 MiB/s | +| TurboANX 63 | 63.4 % | - | 981.79 MiB/s | - | 2,964.02 MiB/s | +| TurboANX 48 | 63.3 % | - | 969.72 MiB/s | - | 2,917.59 MiB/s | +| TurboANX 40 | 63.2 % | - | 964.45 MiB/s | - | 2,883.45 MiB/s | +| TurboANX 32 | 66.4 % | - | 951.53 MiB/s | - | 2,856.26 MiB/s | +| **rANS32x32 16w 11** | 64.33 % | 12.86 clk/byte | 333.03 MiB/s | 1.50 clk/byte | 2,856.20 MiB/s | +| **rANS32x32 16w 10** | 65.59 % | 12.80 clk/byte | 334.68 MiB/s | 1.51 clk/byte | 2,845.56 MiB/s | +| TurboANX 24 | 63.0 % | - | 936.12 MiB/s | - | 2,765.31 MiB/s | +| TurboANX 16 | 62.8 % | - | 902.32 MiB/s | - | 2,631.85 MiB/s | +| **rANS32x32 16w 12** | 63.81 % | 12.83 clk/byte | 343.55 MiB/s | 1.54 clk/byte | 2,784.13 MiB/s | +| fsehuf | 63.4 % | - | 1,581.32 MiB/s | - | 2,515.23 MiB/s | +| htscodecs rans32avx2 0 | 63.5 % | - | 1,041.93 MiB/s | - | 2,374.04 MiB/s | +| TurboANX 8 | 62.7 % | - | 823.76 MiB/s | - | 2,347.10 MiB/s | +| **rANS32x32 32blk 16w 12** | 63.81 % | 12.62 clk/byte | 339.50 MiB/s | 1.85 clk/byte | 2,312.10 MiB/s | +| **rANS32x32 
32blk 16w 11** | 64.33 % | 12.67 clk/byte | 338.00 MiB/s | 1.86 clk/byte | 2,299.31 MiB/s | +| **rANS32x32 32blk 16w 10** | 65.59 % | 12.91 clk/byte | 331.80 MiB/s | 1.87 clk/byte | 2,289.10 MiB/s | +| htscodecs rans32avx512 0 | 63.5 % | - | 796.70 MiB/s | - | 2,221.93 MiB/s | +| **rANS32x32 32blk 8w 11** | 64.33 % | 15.01 clk/byte | 285.45 MiB/s | 2.15 clk/byte | 1,988.10 MiB/s | +| **rANS32x32 32blk 8w 12** | 63.82 % | 15.15 clk/byte | 282.80 MiB/s | 2.16 clk/byte | 1,984.68 MiB/s | +| **rANS32x32 32blk 8w 10** | 65.60 % | 14.70 clk/byte | 291.41 MiB/s | 2.17 clk/byte | 1,977.26 MiB/s | +| htscodecs rans32sse 0 | 63.5 % | - | 732.08 MiB/s | - | 1,948.66 MiB/s | +| TurboANX 4 | 63.0 % | - | 706.92 MiB/s | - | 1,929.18 MiB/s | +| **rANS32x64 16w 13** | 63.61 % | 12.32 clk/byte | 348.13 MiB/s | 2.29 clk/byte | 1,872.44 MiB/s | +| **rANS32x64 16w 14** | 63.55 % | 12.36 clk/byte | 346.57 MiB/s | 2.28 clk/byte | 1,876.95 MiB/s | +| **rANS32x64 16w 15** | 63.57 % | 12.30 clk/byte | 350.49 MiB/s | 2.34 clk/byte | 1,828.28 MiB/s | +| **rANS32x32 16w 13** | 63.61 % | 12.55 clk/byte | 341.20 MiB/s | 2.38 clk/byte | 1,800.28 MiB/s | +| **rANS32x32 16w 14** | 63.55 % | 12.54 clk/byte | 341.70 MiB/s | 2.39 clk/byte | 1,795.66 MiB/s | +| **rANS32x16 16w 10** | 65.59 % | 13.26 clk/byte | 323.07 MiB/s | 2.54 clk/byte | 1,684.80 MiB/s | +| **rANS32x16 16w 12** | 63.81 % | 13.21 clk/byte | 324.24 MiB/s | 2.55 clk/byte | 1,681.73 MiB/s | +| **rANS32x16 16w 11** | 64.33 % | 13.25 clk/byte | 323.17 MiB/s | 2.55 clk/byte | 1,676.41 MiB/s | +| **rANS32x32 16w 15** | 63.57 % | 12.94 clk/byte | 342.60 MiB/s | 2.56 clk/byte | 1,675.11 MiB/s | +| **rANS32x32 32blk 16w 14** | 63.55 % | 13.02 clk/byte | 329.08 MiB/s | 2.66 clk/byte | 1,607.26 MiB/s | +| **rANS32x32 32blk 16w 13** | 63.61 % | 12.56 clk/byte | 341.16 MiB/s | 2.71 clk/byte | 1,582.28 MiB/s | +| **rANS32x32 32blk 16w 15** | 63.57 % | 13.21 clk/byte | 324.33 MiB/s | 2.76 clk/byte | 1,550.93 MiB/s | +| **rANS32x32 32blk 8w 13** | 
63.60 % | 15.07 clk/byte | 284.24 MiB/s | 2.98 clk/byte | 1,438.01 MiB/s | +| **rANS32x32 32blk 8w 14** | 63.53 % | 15.06 clk/byte | 284.45 MiB/s | 3.00 clk/byte | 1,429.24 MiB/s | +| TurboANX 2 | 64.0 % | - | 656.86 MiB/s | - | 1,416.33 MiB/s | +| **rANS32x32 32blk 8w 15** | 63.51 % | 15.11 clk/byte | 283.41 MiB/s | 3.10 clk/byte | 1,381.63 MiB/s | +| **rANS32x16 16w 13** | 63.61 % | 13.14 clk/byte | 325.92 MiB/s | 3.60 clk/byte | 1,190.23 MiB/s | +| **rANS32x16 16w 14** | 63.55 % | 13.37 clk/byte | 320.41 MiB/s | 3.64 clk/byte | 1,175.92 MiB/s | +| **rANS32x16 16w 15** | 63.57 % | 13.28 clk/byte | 322.51 MiB/s | 4.21 clk/byte | 1,017.12 MiB/s | +| fse | 63.2 % | - | 736.10 MiB/s | - | 966.58 MiB/s | +| TurboANX 1 | 66.4 % | - | 522.13 MiB/s | - | 942.43 MiB/s | +| htscodecs rans32avx512 1 | 51.6 % | - | 168.22 MiB/s | - | 322.22 MiB/s | +| htscodecs rans32avx2 1 | 51.6 % | - | 177.36 MiB/s | - | 319.15 MiB/s | +| FastHF | 63.6 % | - | 189.84 MiB/s | - | 151.62 MiB/s | +| FastAC | 63.2 % | - | 223.06 MiB/s | - | 84.37 MiB/s | +| htscodecs arith_dyn 1 | 47.8 % | - | 89.60 MiB/s | - | 81.63 MiB/s | +| htscodecs arith_dyn 0 | 62.0 % | - | 88.09 MiB/s | - | 75.05 MiB/s | ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus) -| Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | +| Codec Type | License | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 11** | 82.60 % | 13.75 clk/byte | 311.60 MiB/s | 1.39 clk/byte | 3079.98 MiB/s | -| **rANS32x64 16w 10** | 82.66 % | 14.03 clk/byte | 305.22 MiB/s | 1.42 clk/byte | 3026.65 MiB/s | -| TurboANX 63 | 79.6 % | - | 989.68 MiB/s | - | 2966.83 MiB/s | -| TurboANX 48 | 79.6 % | - | 979.24 MiB/s | - | 2923.90 MiB/s | -| TurboANX 40 | 79.7 % | - | 982.57 MiB/s | - | 2904.99 MiB/s | -| **rANS32x64 16w 12** | 82.57 % | 13.99 clk/byte | 306.19 MiB/s | 1.48 clk/byte | 2900.31 MiB/s | -| TurboANX 32 | 79.7 % | - | 973.82 MiB/s | - | 2860.76 MiB/s | -| **rANS32x32 16w 11** | 82.60 % | 14.31 clk/byte | 299.31 MiB/s | 1.50 clk/byte | 2851.47 MiB/s | -| **rANS32x32 16w 10** | 82.66 % | 13.82 clk/byte | 309.99 MiB/s | 1.52 clk/byte | 2822.97 MiB/s | -| TurboANX 24 | 79.8 % | - | 962.68 MiB/s | - | 2785.82 MiB/s | -| **rANS32x32 16w 12** | 82.57 % | 13.95 clk/byte | 306.97 MiB/s | 1.59 clk/byte | 2693.99 MiB/s | -| TurboANX 16 | 79.9 % | - | 937.33 MiB/s | - | 2661.07 MiB/s | -| TurboANX 8 | 80.5 % | - | 864.63 MiB/s | - | 2360.30 MiB/s | -| htscodecs_rans32avx2 0 | 80.6 % | - | 966.58 MiB/s | - | 2244.87 MiB/s | -| htscodecs_rans32avx512 0 | 80.6 % | - | 739.14 MiB/s | - | 2139.47 MiB/s | -| fsehuf | 80.0 % | - | 1395.71 MiB/s | - | 1946.34 MiB/s | -| htscodecs_rans32sse 0 | 80.6 % | - | 723.48 MiB/s | - | 1914.15 MiB/s | -| **rANS32x64 16w 13** | 82.57 % | 13.94 clk/byte | 307.28 MiB/s | 2.25 clk/byte | 1903.01 MiB/s | -| TurboANX 4 | 81.9 % | - | 677.08 MiB/s | - | 1883.40 MiB/s | -| **rANS32x64 16w 14** | 82.58 % | 14.09 clk/byte | 304.01 MiB/s | 2.29 clk/byte | 1870.17 MiB/s | -| **rANS32x32 16w 13** | 82.57 % | 13.97 clk/byte | 306.60 MiB/s | 2.31 clk/byte | 1855.99 MiB/s | -| **rANS32x64 16w 15** | 82.63 % | 13.88 clk/byte | 308.52 MiB/s | 2.39 clk/byte | 1793.13 MiB/s | -| **rANS32x32 16w 14** | 82.58 % | 13.91 clk/byte | 307.92 MiB/s | 2.45 clk/byte | 1749.16 MiB/s | -| **rANS32x32 16w 15** | 82.63 % | 14.20 
clk/byte | 301.70 MiB/s | 2.59 clk/byte | 1654.49 MiB/s | -| TurboANX 2 | 83.7 % | - | 600.46 MiB/s | - | 1292.65 MiB/s | -| fse | 80.3 % | - | 696.88 MiB/s | - | 990.39 MiB/s | -| TurboANX 1 | 85.1 % | - | 387.40 MiB/s | - | 719.84 MiB/s | -| htscodecs_rans32avx2 1 | 74.4 % | - | 114.89 MiB/s | - | 229.78 MiB/s | -| htscodecs_rans32avx512 1 | 74.4 % | - | 104.87 MiB/s | - | 220.91 MiB/s | -| FastHF | 80.0 % | - | 183.35 MiB/s | - | 144.30 MiB/s | -| FastAC | 79.7 % | - | 244.35 MiB/s | - | 77.33 MiB/s | -| htscodecs_arith_dyn 1 | 67.6 % | - | 45.13 MiB/s | - | 45.67 MiB/s | -| htscodecs_arith_dyn 0 | 79.6 % | - | 47.12 MiB/s | - | 45.40 MiB/s | +| **rANS32x64 16w 11 (raw)** | BSD-2 | 82.60 % | 311.60 MiB/s | 1.39 clk/byte | 3,079.98 MiB/s | +| **rANS32x64 16w 12** | BSD-2 | 80.17 % | 193.60 MiB/s | 1.41 clk/byte | 3,048.15 MiB/s | +| **rANS32x64 16w 12 (raw)** | BSD-2 | 82.57 % | 308.10 MiB/s | 1.41 clk/byte | 3,041.07 MiB/s | +| **rANS32x64 16w 10** | BSD-2 | 80.81 % | 193.28 MiB/s | 1.41 clk/byte | 3,040.97 MiB/s | +| **rANS32x64 16w 10 (raw)** | BSD-2 | 82.83 % | 305.96 MiB/s | 1.42 clk/byte | 3,027.01 MiB/s | +| **rANS32x64 16w 11** | BSD-2 | 80.24 % | 186.41 MiB/s | 1.42 clk/byte | 3,015.25 MiB/s | +| TurboANX 63 | - | 79.6 % | 989.68 MiB/s | - | 2,966.83 MiB/s | +| TurboANX 48 | - | 79.6 % | 979.24 MiB/s | - | 2,923.90 MiB/s | +| TurboANX 40 | - | 79.7 % | 982.57 MiB/s | - | 2,904.99 MiB/s | +| **rANS32x32 16w 11 (raw)** | BSD-2 | 82.60 % | 303.34 MiB/s | 1.48 clk/byte | 2,886.18 MiB/s | +| **rANS32x32 16w 10 (raw)** | BSD-2 | 82.83 % | 301.23 MiB/s | 1.49 clk/byte | 2,881.42 MiB/s | +| **rANS32x32 16w 12 (raw)** | BSD-2 | 82.57 % | 307.10 MiB/s | 1.49 clk/byte | 2,872.78 MiB/s | +| TurboANX 32 | - | 79.7 % | 973.82 MiB/s | - | 2,860.76 MiB/s | +| **rANS32x32 16w 10** | BSD-2 | 80.81 % | 192.99 MiB/s | 1.51 clk/byte | 2,841.71 MiB/s | +| **rANS32x32 16w 11** | BSD-2 | 80.24 % | 190.01 MiB/s | 1.51 clk/byte | 2,834.43 MiB/s | +| **rANS32x32 16w 12** | BSD-2 | 
80.53 % | 195.09 MiB/s | 1.54 clk/byte | 2,787.94 MiB/s | +| TurboANX 24 | - | 79.8 % | 962.68 MiB/s | - | 2,785.82 MiB/s | +| TurboANX 16 | - | 79.9 % | 937.33 MiB/s | - | 2,661.07 MiB/s | +| TurboANX 8 | - | 80.5 % | 864.63 MiB/s | - | 2,360.30 MiB/s | +| htscodecs rans32avx2 0 | BSD-3 | 80.6 % | 966.58 MiB/s | - | 2,244.87 MiB/s | +| htscodecs rans32avx512 0 | BSD-3 | 80.6 % | 739.14 MiB/s | - | 2,139.47 MiB/s | +| FSE Huff0 | BSD-2 | 80.0 % | 1,395.71 MiB/s | - | 1,946.34 MiB/s | +| htscodecs rans32sse 0 | BSD-3 | 80.6 % | 723.48 MiB/s | - | 1,914.15 MiB/s | +| **rANS32x64 16w 13 (raw)** | BSD-2 | 82.57 % | 305.45 MiB/s | 2.24 clk/byte | 1,910.60 MiB/s | +| **rANS32x64 16w 14 (raw)** | BSD-2 | 82.58 % | 308.96 MiB/s | 2.25 clk/byte | 1,903.66 MiB/s | +| **rANS32x64 16w 13** | BSD-2 | 79.98 % | 191.74 MiB/s | 2.26 clk/byte | 1,892.64 MiB/s | +| TurboANX 4 | - | 81.9 % | 677.08 MiB/s | - | 1,883.40 MiB/s | +| **rANS32x32 16w 13 (raw)** | BSD-2 | 82.57 % | 305.00 MiB/s | 2.29 clk/byte | 1,870.26 MiB/s | +| **rANS32x64 16w 15 (raw)** | BSD-2 | 82.63 % | 307.44 MiB/s | 2.30 clk/byte | 1,865.65 MiB/s | +| **rANS32x32 16w 14 (raw)** | BSD-2 | 82.58 % | 306.18 MiB/s | 2.30 clk/byte | 1,865.18 MiB/s | +| **rANS32x64 16w 14** | BSD-2 | 80.02 % | 192.71 MiB/s | 2.30 clk/byte | 1,861.42 MiB/s | +| **rANS32x32 16w 13** | BSD-2 | 80.01 % | 196.93 MiB/s | 2.37 clk/byte | 1,808.33 MiB/s | +| **rANS32x64 16w 15** | BSD-2 | 80.25 % | 193.85 MiB/s | 2.42 clk/byte | 1,773.42 MiB/s | +| **rANS32x32 16w 14** | BSD-2 | 80.06 % | 198.86 MiB/s | 2.42 clk/byte | 1,767.12 MiB/s | +| **rANS32x32 16w 15 (raw)** | BSD-2 | 82.63 % | 304.21 MiB/s | 2.44 clk/byte | 1,758.57 MiB/s | +| **rANS32x32 16w 15** | BSD-2 | 80.06 % | 191.91 MiB/s | 2.70 clk/byte | 1,585.77 MiB/s | +| TurboANX 2 | - | 83.7 % | 600.46 MiB/s | - | 1,292.65 MiB/s | +| FSE | BSD-2 | 80.3 % | 696.88 MiB/s | - | 990.39 MiB/s | +| TurboANX 1 | - | 85.1 % | 387.40 MiB/s | - | 719.84 MiB/s | +| htscodecs rans32avx2 1 | BSD-3 | 
74.4 % | 114.89 MiB/s | - | 229.78 MiB/s | +| htscodecs rans32avx512 1 | BSD-3 | 74.4 % | 104.87 MiB/s | - | 220.91 MiB/s | +| FastHF | Custom | 80.0 % | 183.35 MiB/s | - | 144.30 MiB/s | +| FastAC | Custom | 79.7 % | 244.35 MiB/s | - | 77.33 MiB/s | +| htscodecs arith_dyn 1 | BSD-3 | 67.6 % | 45.13 MiB/s | - | 45.67 MiB/s | +| htscodecs arith_dyn 0 | BSD-3 | 79.6 % | 47.12 MiB/s | - | 45.40 MiB/s | ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus) | Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 11** | 77.82 % | 13.84 clk/byte | 309.39 MiB/s | 1.44 clk/byte | 2978.20 MiB/s | -| **rANS32x64 16w 10** | 77.92 % | 14.16 clk/byte | 302.46 MiB/s | 1.44 clk/byte | 2968.99 MiB/s | -| TurboANX 63 | 70.1 % | - | 965.97 MiB/s | - | 2959.13 MiB/s | -| **rANS32x64 16w 12** | 77.79 % | 14.21 clk/byte | 301.44 MiB/s | 1.45 clk/byte | 2946.52 MiB/s | -| TurboANX 48 | 69.6 % | - | 954.87 MiB/s | - | 2911.55 MiB/s | -| **rANS32x32 16w 10** | 77.92 % | 13.97 clk/byte | 306.54 MiB/s | 1.49 clk/byte | 2878.05 MiB/s | -| TurboANX 40 | 69.3 % | - | 941.29 MiB/s | - | 2869.21 MiB/s | -| **rANS32x32 16w 11** | 77.82 % | 14.34 clk/byte | 298.79 MiB/s | 1.49 clk/byte | 2867.33 MiB/s | -| TurboANX 32 | 68.9 % | - | 927.04 MiB/s | - | 2815.87 MiB/s | -| **rANS32x32 16w 12** | 77.79 % | 14.25 clk/byte | 300.51 MiB/s | 1.54 clk/byte | 2782.35 MiB/s | -| TurboANX 24 | 68.4 % | - | 900.92 MiB/s | - | 2732.74 MiB/s | -| TurboANX 16 | 67.9 % | - | 854.34 MiB/s | - | 2582.05 MiB/s | -| htscodecs_rans32avx2 0 | 69.5 % | - | 1014.19 MiB/s | - | 2250.58 MiB/s | -| TurboANX 8 | 67.2 % | - | 748.14 MiB/s | - | 2183.29 MiB/s | -| htscodecs_rans32avx512 0 | 69.5 % | - | 760.33 MiB/s | - | 2115.31 MiB/s | -| fsehuf | 69.2 % | - | 1491.60 MiB/s | - | 2092.00 MiB/s | -| **rANS32x32 16w 14** | 77.79 % | 14.02 clk/byte | 305.49 MiB/s | 2.37 clk/byte | 1804.10 MiB/s | -| **rANS32x64 16w 14** | 77.79 % | 14.09 clk/byte | 303.97 MiB/s | 2.26 clk/byte | 1891.46 MiB/s | -| htscodecs_rans32sse 0 | 69.5 % | - | 724.39 MiB/s | - | 1884.40 MiB/s | -| **rANS32x64 16w 13** | 77.79 % | 13.89 clk/byte | 308.28 MiB/s | 2.27 clk/byte | 1883.91 MiB/s | -| **rANS32x64 16w 15** | 77.85 % | 13.86 clk/byte | 309.13 MiB/s | 2.31 clk/byte | 1855.74 MiB/s | -| **rANS32x32 16w 13** | 77.78 % | 14.13 clk/byte | 303.23 MiB/s | 2.37 clk/byte | 1806.03 MiB/s | -| **rANS32x32 16w 15** | 77.84 % | 14.29 clk/byte | 299.78 MiB/s | 2.46 clk/byte | 1743.60 MiB/s | -| 
TurboANX 4 | 67.3 % | - | 603.91 MiB/s | - | 1658.68 MiB/s | -| TurboANX 2 | 68.5 % | - | 556.95 MiB/s | - | 1106.06 MiB/s | -| fse | 69.3 % | - | 713.08 MiB/s | - | 973.71 MiB/s | -| TurboANX 1 | 71.6 % | - | 392.67 MiB/s | - | 677.10 MiB/s | -| htscodecs_rans32avx512 1 | 55.7 % | - | 81.02 MiB/s | - | 168.42 MiB/s | -| htscodecs_rans32avx2 1 | 55.7 % | - | 83.68 MiB/s | - | 167.19 MiB/s | -| FastHF | 71.8 % | - | 174.86 MiB/s | - | 130.78 MiB/s | -| FastAC | 70.7 % | - | 234.95 MiB/s | - | 81.01 MiB/s | -| htscodecs_arith_dyn 1 | 52.1 % | - | 62.87 MiB/s | - | 62.98 MiB/s | -| htscodecs_arith_dyn 0 | 66.4 % | - | 63.82 MiB/s | - | 59.92 MiB/s | +| **rANS32x64 16w 11** | 77.82 % | 13.84 clk/byte | 309.39 MiB/s | 1.44 clk/byte | 2,978.20 MiB/s | +| **rANS32x64 16w 10** | 77.92 % | 14.16 clk/byte | 302.46 MiB/s | 1.44 clk/byte | 2,968.99 MiB/s | +| TurboANX 63 | 70.1 % | - | 965.97 MiB/s | - | 2,959.13 MiB/s | +| **rANS32x64 16w 12** | 77.79 % | 14.21 clk/byte | 301.44 MiB/s | 1.45 clk/byte | 2,946.52 MiB/s | +| TurboANX 48 | 69.6 % | - | 954.87 MiB/s | - | 2,911.55 MiB/s | +| **rANS32x32 16w 10** | 77.92 % | 13.97 clk/byte | 306.54 MiB/s | 1.49 clk/byte | 2,878.05 MiB/s | +| TurboANX 40 | 69.3 % | - | 941.29 MiB/s | - | 2,869.21 MiB/s | +| **rANS32x32 16w 11** | 77.82 % | 14.34 clk/byte | 298.79 MiB/s | 1.49 clk/byte | 2,867.33 MiB/s | +| TurboANX 32 | 68.9 % | - | 927.04 MiB/s | - | 2,815.87 MiB/s | +| **rANS32x32 16w 12** | 77.79 % | 14.25 clk/byte | 300.51 MiB/s | 1.54 clk/byte | 2,782.35 MiB/s | +| TurboANX 24 | 68.4 % | - | 900.92 MiB/s | - | 2,732.74 MiB/s | +| TurboANX 16 | 67.9 % | - | 854.34 MiB/s | - | 2,582.05 MiB/s | +| htscodecs_rans32avx2 0 | 69.5 % | - | 1,014.19 MiB/s | - | 2,250.58 MiB/s | +| TurboANX 8 | 67.2 % | - | 748.14 MiB/s | - | 2,183.29 MiB/s | +| htscodecs_rans32avx512 0 | 69.5 % | - | 760.33 MiB/s | - | 2,115.31 MiB/s | +| fsehuf | 69.2 % | - | 1,491.60 MiB/s | - | 2,092.00 MiB/s | +| **rANS32x32 16w 14** | 77.79 % | 14.02 clk/byte | 
305.49 MiB/s | 2.37 clk/byte | 1,804.10 MiB/s | +| **rANS32x64 16w 14** | 77.79 % | 14.09 clk/byte | 303.97 MiB/s | 2.26 clk/byte | 1,891.46 MiB/s | +| htscodecs_rans32sse 0 | 69.5 % | - | 724.39 MiB/s | - | 1,884.40 MiB/s | +| **rANS32x64 16w 13** | 77.79 % | 13.89 clk/byte | 308.28 MiB/s | 2.27 clk/byte | 1,883.91 MiB/s | +| **rANS32x64 16w 15** | 77.85 % | 13.86 clk/byte | 309.13 MiB/s | 2.31 clk/byte | 1,855.74 MiB/s | +| **rANS32x32 16w 13** | 77.78 % | 14.13 clk/byte | 303.23 MiB/s | 2.37 clk/byte | 1,806.03 MiB/s | +| **rANS32x32 16w 15** | 77.84 % | 14.29 clk/byte | 299.78 MiB/s | 2.46 clk/byte | 1,743.60 MiB/s | +| TurboANX 4 | 67.3 % | - | 603.91 MiB/s | - | 1,658.68 MiB/s | +| TurboANX 2 | 68.5 % | - | 556.95 MiB/s | - | 1,106.06 MiB/s | +| fse | 69.3 % | - | 713.08 MiB/s | - | 973.71 MiB/s | +| TurboANX 1 | 71.6 % | - | 392.67 MiB/s | - | 677.10 MiB/s | +| htscodecs_rans32avx512 1 | 55.7 % | - | 81.02 MiB/s | - | 168.42 MiB/s | +| htscodecs_rans32avx2 1 | 55.7 % | - | 83.68 MiB/s | - | 167.19 MiB/s | +| FastHF | 71.8 % | - | 174.86 MiB/s | - | 130.78 MiB/s | +| FastAC | 70.7 % | - | 234.95 MiB/s | - | 81.01 MiB/s | +| htscodecs_arith_dyn 1 | 52.1 % | - | 62.87 MiB/s | - | 62.98 MiB/s | +| htscodecs_arith_dyn 0 | 66.4 % | - | 63.82 MiB/s | - | 59.92 MiB/s | -Thanks to [James Bonfield](https://github.com/jkbonfield) I also have benchmarks for `htscodecs` (MB/s converted to MiB/s) and `hypersonic-rANS` on an `Intel i7-1185G7` (Tiger Lake) via WSL1 compiled with GCC, where the AVX-512 versions of the 32x64 codecs seemed to be particularly fast: +## Easy Multithreading +hypersonic-rANS includes a variant that encodes blocks independently (at the expense of compression ratio), allowing for easy multithreading. -### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes) -| Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | +### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus) +| Codec Type | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 10** | 65.59 % | 12.40 clk/byte | 230.40 MiB/s | 0.96 clk/byte | 2976.35 MiB/s | -| **rANS32x64 16w 11** | 64.33 % | 12.32 clk/byte | 231.88 MiB/s | 0.97 clk/byte | 2947.61 MiB/s | -| **rANS32x64 16w 12** | 63.81 % | 12.22 clk/byte | 233.65 MiB/s | 0.98 clk/byte | 2924.03 MiB/s | -| htscodecs r32x16 -o4 -c 0x0404 | 63.64 % | - | 956.44 MiB/s | - | 2513.31 MiB/s | -| **rANS32x64 16w 13** | 63.61 % | 12.05 clk/byte | 236.96 MiB/s | 1.24 clk/byte | 2307.39 MiB/s | -| **rANS32x64 16w 14** | 63.55 % | 12.05 clk/byte | 236.97 MiB/s | 1.25 clk/byte | 2292.33 MiB/s | -| **rANS32x64 16w 15** | 63.57 % | 11.82 clk/byte | 241.75 MiB/s | 1.27 clk/byte | 2250.60 MiB/s | -| **rANS32x32 32blk 16w 10** | 65.59 % | 12.52 clk/byte | 228.14 MiB/s | 1.44 clk/byte | 1989.49 MiB/s | -| **rANS32x32 16w 12** | 63.81 % | 12.31 clk/byte | 232.08 MiB/s | 1.44 clk/byte | 1982.58 MiB/s | -| **rANS32x32 16w 10** | 65.59 % | 12.77 clk/byte | 223.69 MiB/s | 1.45 clk/byte | 1972.50 MiB/s | -| **rANS32x32 32blk 16w 11** | 64.33 % | 11.85 clk/byte | 240.96 MiB/s | 1.48 clk/byte | 1933.49 MiB/s | -| **rANS32x32 16w 11** | 64.33 % | 12.38 clk/byte | 230.65 MiB/s | 1.49 clk/byte | 1921.87 MiB/s | -| htscodecs r32x16 -o4 -c 0x0202 | 63.64 % | - | 820.64 MiB/s | - | 1906.11 MiB/s | -| **rANS32x32 32blk 16w 12** | 63.81 % | 12.27 clk/byte | 232.80 MiB/s | 1.50 clk/byte | 1901.61 MiB/s | -| **rANS32x32 32blk 8w 10** | 65.60 % | 14.13 clk/byte | 202.20 MiB/s | 1.84 clk/byte | 1552.18 MiB/s | -| **rANS32x32 16w 13** | 63.61 % | 11.96 clk/byte | 238.82 MiB/s | 1.89 clk/byte | 1514.47 MiB/s | -| **rANS32x32 32blk 8w 11** | 64.33 % | 14.42 clk/byte | 198.05 MiB/s | 1.91 clk/byte | 1496.66 MiB/s | -| **rANS32x32 32blk 8w 12** | 63.82 % | 14.63 clk/byte | 195.21 MiB/s | 1.93 clk/byte | 1477.22 MiB/s | -| **rANS32x32 16w 15** | 63.57 % | 12.25 clk/byte | 233.13 MiB/s | 1.94 clk/byte | 1468.86 MiB/s | -| **rANS32x32 32blk 16w 15** | 63.57 % | 
11.99 clk/byte | 238.28 MiB/s | 1.95 clk/byte | 1467.98 MiB/s | -| **rANS32x32 16w 14** | 63.55 % | 12.78 clk/byte | 223.47 MiB/s | 1.99 clk/byte | 1437.32 MiB/s | -| **rANS32x32 32blk 16w 14** | 63.55 % | 11.76 clk/byte | 242.82 MiB/s | 2.03 clk/byte | 1405.81 MiB/s | -| **rANS32x32 32blk 16w 13** | 63.61 % | 11.85 clk/byte | 241.08 MiB/s | 2.07 clk/byte | 1379.67 MiB/s | -| **rANS32x32 32blk 8w 14** | 63.53 % | 14.31 clk/byte | 199.62 MiB/s | 2.33 clk/byte | 1224.75 MiB/s | -| htscodecs r32x16 -o5 -c 0x0404 | 48.81 % | - | 533.10 MiB/s | - | 1221.94 MiB/s | -| **rANS32x32 32blk 8w 13** | 63.60 % | 14.43 clk/byte | 197.94 MiB/s | 2.36 clk/byte | 1209.93 MiB/s | -| htscodecs r32x16 -o4 -c 0x0101 | 63.64 % | - | 544.55 MiB/s | - | 1189.42 MiB/s | -| **rANS32x32 32blk 8w 15** | 63.51 % | 14.36 clk/byte | 198.92 MiB/s | 2.45 clk/byte | 1167.62 MiB/s | -| htscodecs r32x16 -o5 -c 0x0202 | 48.81 % | - | 406.07 MiB/s | - | 1020.72 MiB/s | -| **rANS32x16 16w 11** | 64.33 % | 14.21 clk/byte | 201.05 MiB/s | 2.85 clk/byte | 1001.90 MiB/s | -| **rANS32x16 16w 13** | 63.61 % | 12.81 clk/byte | 223.01 MiB/s | 2.86 clk/byte | 997.44 MiB/s | -| **rANS32x16 16w 10** | 65.59 % | 14.68 clk/byte | 194.52 MiB/s | 2.92 clk/byte | 976.98 MiB/s | -| **rANS32x16 16w 12** | 63.81 % | 12.80 clk/byte | 223.21 MiB/s | 3.02 clk/byte | 946.74 MiB/s | -| **rANS32x16 16w 14** | 63.55 % | 13.51 clk/byte | 211.37 MiB/s | 3.28 clk/byte | 870.14 MiB/s | -| htscodecs r32x16 -o4 -c 0x0000 | 63.64 % | - | 548.17 MiB/s | - | 847.63 MiB/s | -| **rANS32x16 16w 15** | 63.57 % | 12.67 clk/byte | 225.40 MiB/s | 3.55 clk/byte | 805.70 MiB/s | -| htscodecs r4x16 -o0 | 63.64 % | - | 543.40 MiB/s | - | 803.85 MiB/s | -| htscodecs r32x16 -o5 -c 0x0101 | 48.81 % | - | 294.49 MiB/s | - | 688.74 MiB/s | -| htscodecs r4x8 -o0 | 63.64 % | - | 323.87 MiB/s | - | 506.50 MiB/s | -| htscodecs r32x16 -o5 -c 0x0000 | 48.81 % | - | 290.49 MiB/s | - | 466.92 MiB/s | -| htscodecs r4x16 -o1 | 48.80 % | - | 328.54 MiB/s | - | 
449.47 MiB/s | -| htscodecs r4x8 -o1 | 49.13 % | - | 324.06 MiB/s | - | 300.22 MiB/s | +| rANS32x64 16w 10 mt | 80.23 % | 200.03 MiB/s | 0.24 clk/byte | 18,035.77 MiB/s | +| rANS32x32 16w 10 mt | 80.17 % | 194.73 MiB/s | 0.25 clk/byte | 17,834.38 MiB/s | +| rANS32x64 16w 11 mt | 80.08 % | 202.10 MiB/s | 0.26 clk/byte | 16,210.44 MiB/s | +| rANS32x32 16w 11 mt | 80.02 % | 191.90 MiB/s | 0.27 clk/byte | 15,630.58 MiB/s | +| rANS32x64 16w 12 mt | 80.05 % | 197.62 MiB/s | 0.34 clk/byte | 13,207.00 MiB/s | +| rANS32x32 16w 12 mt | 79.99 % | 197.21 MiB/s | 0.36 clk/byte | 12,358.57 MiB/s | +| rANS32x64 16w 13 mt | 80.04 % | 199.94 MiB/s | 0.37 clk/byte | 11,938.77 MiB/s | +| rANS32x32 16w 13 mt | 79.99 % | 195.00 MiB/s | 0.37 clk/byte | 11,497.36 MiB/s | +| rANS32x64 16w 14 mt | 80.05 % | 199.87 MiB/s | 0.42 clk/byte | 10,318.01 MiB/s | +| rANS32x32 16w 14 mt | 80.01 % | 190.94 MiB/s | 0.42 clk/byte | 10,134.59 MiB/s | +| rANS32x64 16w 15 mt | 80.09 % | 200.59 MiB/s | 0.59 clk/byte | 7,308.43 MiB/s | +| rANS32x32 16w 15 mt | 80.03 % | 192.28 MiB/s | 0.62 clk/byte | 7,024.69 MiB/s | ## Building ### On Linux/WSL diff --git a/docs/index.html b/docs/index.html index 47bea27..8aa291c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -693,6 +693,100 @@

enwik8 (wikipedia extract)

obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("enwik8")); }); +
+

x-ray (medical x-ray image)

+

Order 1 codecs are excluded: they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.

+ + + +
\ No newline at end of file From bcf9678a4723de0dd5f5917afc8ba0fb3bcc87fb Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 16:41:48 +0200 Subject: [PATCH 26/34] Updating mozilla benchmark --- README.md | 296 ++++++++++++++++++++++++------------------------ docs/index.html | 208 ++++++++++++++++------------------ src/main.cpp | 26 ++--- 3 files changed, 264 insertions(+), 266 deletions(-) diff --git a/README.md b/README.md index b1ed15b..5fa70cb 100644 --- a/README.md +++ b/README.md @@ -27,164 +27,170 @@ - Every best performing decoder variant requires AVX2. (The AVX-512 variants for 32x64 can be faster in rare circumstances, but they weren't in this benchmark) ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes) -| Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | +| Codec Type | Open-
Source | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 10** | 65.59 % | 12.83 clk/byte | 341.55 MiB/s | 1.43 clk/byte | 2,989.66 MiB/s | -| **rANS32x64 16w 11** | 64.33 % | 12.34 clk/byte | 347.24 MiB/s | 1.44 clk/byte | 2,973.71 MiB/s | -| **rANS32x64 16w 12** | 63.81 % | 12.51 clk/byte | 342.31 MiB/s | 1.44 clk/byte | 2,967.92 MiB/s | -| TurboANX 63 | 63.4 % | - | 981.79 MiB/s | - | 2,964.02 MiB/s | -| TurboANX 48 | 63.3 % | - | 969.72 MiB/s | - | 2,917.59 MiB/s | -| TurboANX 40 | 63.2 % | - | 964.45 MiB/s | - | 2,883.45 MiB/s | -| TurboANX 32 | 66.4 % | - | 951.53 MiB/s | - | 2,856.26 MiB/s | -| **rANS32x32 16w 11** | 64.33 % | 12.86 clk/byte | 333.03 MiB/s | 1.50 clk/byte | 2,856.20 MiB/s | -| **rANS32x32 16w 10** | 65.59 % | 12.80 clk/byte | 334.68 MiB/s | 1.51 clk/byte | 2,845.56 MiB/s | -| TurboANX 24 | 63.0 % | - | 936.12 MiB/s | - | 2,765.31 MiB/s | -| TurboANX 16 | 62.8 % | - | 902.32 MiB/s | - | 2,631.85 MiB/s | -| **rANS32x32 16w 12** | 63.81 % | 12.83 clk/byte | 343.55 MiB/s | 1.54 clk/byte | 2,784.13 MiB/s | -| fsehuf | 63.4 % | - | 1,581.32 MiB/s | - | 2,515.23 MiB/s | -| htscodecs rans32avx2 0 | 63.5 % | - | 1,041.93 MiB/s | - | 2,374.04 MiB/s | -| TurboANX 8 | 62.7 % | - | 823.76 MiB/s | - | 2,347.10 MiB/s | -| **rANS32x32 32blk 16w 12** | 63.81 % | 12.62 clk/byte | 339.50 MiB/s | 1.85 clk/byte | 2,312.10 MiB/s | -| **rANS32x32 32blk 16w 11** | 64.33 % | 12.67 clk/byte | 338.00 MiB/s | 1.86 clk/byte | 2,299.31 MiB/s | -| **rANS32x32 32blk 16w 10** | 65.59 % | 12.91 clk/byte | 331.80 MiB/s | 1.87 clk/byte | 2,289.10 MiB/s | -| htscodecs rans32avx512 0 | 63.5 % | - | 796.70 MiB/s | - | 2,221.93 MiB/s | -| **rANS32x32 32blk 8w 11** | 64.33 % | 15.01 clk/byte | 285.45 MiB/s | 2.15 clk/byte | 1,988.10 MiB/s | -| **rANS32x32 32blk 8w 12** | 63.82 % | 15.15 clk/byte | 282.80 MiB/s | 2.16 clk/byte | 1,984.68 MiB/s | -| **rANS32x32 32blk 8w 10** | 65.60 % | 14.70 clk/byte | 291.41 MiB/s | 2.17 clk/byte | 1,977.26 MiB/s | -| htscodecs rans32sse 
0 | 63.5 % | - | 732.08 MiB/s | - | 1,948.66 MiB/s | -| TurboANX 4 | 63.0 % | - | 706.92 MiB/s | - | 1,929.18 MiB/s | -| **rANS32x64 16w 13** | 63.61 % | 12.32 clk/byte | 348.13 MiB/s | 2.29 clk/byte | 1,872.44 MiB/s | -| **rANS32x64 16w 14** | 63.55 % | 12.36 clk/byte | 346.57 MiB/s | 2.28 clk/byte | 1,876.95 MiB/s | -| **rANS32x64 16w 15** | 63.57 % | 12.30 clk/byte | 350.49 MiB/s | 2.34 clk/byte | 1,828.28 MiB/s | -| **rANS32x32 16w 13** | 63.61 % | 12.55 clk/byte | 341.20 MiB/s | 2.38 clk/byte | 1,800.28 MiB/s | -| **rANS32x32 16w 14** | 63.55 % | 12.54 clk/byte | 341.70 MiB/s | 2.39 clk/byte | 1,795.66 MiB/s | -| **rANS32x16 16w 10** | 65.59 % | 13.26 clk/byte | 323.07 MiB/s | 2.54 clk/byte | 1,684.80 MiB/s | -| **rANS32x16 16w 12** | 63.81 % | 13.21 clk/byte | 324.24 MiB/s | 2.55 clk/byte | 1,681.73 MiB/s | -| **rANS32x16 16w 11** | 64.33 % | 13.25 clk/byte | 323.17 MiB/s | 2.55 clk/byte | 1,676.41 MiB/s | -| **rANS32x32 16w 15** | 63.57 % | 12.94 clk/byte | 342.60 MiB/s | 2.56 clk/byte | 1,675.11 MiB/s | -| **rANS32x32 32blk 16w 14** | 63.55 % | 13.02 clk/byte | 329.08 MiB/s | 2.66 clk/byte | 1,607.26 MiB/s | -| **rANS32x32 32blk 16w 13** | 63.61 % | 12.56 clk/byte | 341.16 MiB/s | 2.71 clk/byte | 1,582.28 MiB/s | -| **rANS32x32 32blk 16w 15** | 63.57 % | 13.21 clk/byte | 324.33 MiB/s | 2.76 clk/byte | 1,550.93 MiB/s | -| **rANS32x32 32blk 8w 13** | 63.60 % | 15.07 clk/byte | 284.24 MiB/s | 2.98 clk/byte | 1,438.01 MiB/s | -| **rANS32x32 32blk 8w 14** | 63.53 % | 15.06 clk/byte | 284.45 MiB/s | 3.00 clk/byte | 1,429.24 MiB/s | -| TurboANX 2 | 64.0 % | - | 656.86 MiB/s | - | 1,416.33 MiB/s | -| **rANS32x32 32blk 8w 15** | 63.51 % | 15.11 clk/byte | 283.41 MiB/s | 3.10 clk/byte | 1,381.63 MiB/s | -| **rANS32x16 16w 13** | 63.61 % | 13.14 clk/byte | 325.92 MiB/s | 3.60 clk/byte | 1,190.23 MiB/s | -| **rANS32x16 16w 14** | 63.55 % | 13.37 clk/byte | 320.41 MiB/s | 3.64 clk/byte | 1,175.92 MiB/s | -| **rANS32x16 16w 15** | 63.57 % | 13.28 clk/byte | 322.51 MiB/s 
| 4.21 clk/byte | 1,017.12 MiB/s | -| fse | 63.2 % | - | 736.10 MiB/s | - | 966.58 MiB/s | -| TurboANX 1 | 66.4 % | - | 522.13 MiB/s | - | 942.43 MiB/s | -| htscodecs rans32avx512 1 | 51.6 % | - | 168.22 MiB/s | - | 322.22 MiB/s | -| htscodecs rans32avx2 1 | 51.6 % | - | 177.36 MiB/s | - | 319.15 MiB/s | -| FastHF | 63.6 % | - | 189.84 MiB/s | - | 151.62 MiB/s | -| FastAC | 63.2 % | - | 223.06 MiB/s | - | 84.37 MiB/s | -| htscodecs arith_dyn 1 | 47.8 % | - | 89.60 MiB/s | - | 81.63 MiB/s | -| htscodecs arith_dyn 0 | 62.0 % | - | 88.09 MiB/s | - | 75.05 MiB/s | +| **rANS32x64 16w 11 (raw)** | ✔️ | 64.48 % | 336.81 MiB/s | 1.42 clk/byte | 3,018.02 MiB/s | +| **rANS32x64 16w 10 (raw)** | ✔️ | 65.97 % | 335.28 MiB/s | 1.42 clk/byte | 3,013.45 MiB/s | +| **rANS32x64 16w 12 (raw)** | ✔️ | 63.83 % | 347.90 MiB/s | 1.42 clk/byte | 3,009.18 MiB/s | +| TurboANX 63 | ❌ | 63.4 % | 981.79 MiB/s | - | 2,964.02 MiB/s | +| **rANS32x64 16w 10** | ✔️ | 65.56 % | 239.77 MiB/s | 1.46 clk/byte | 2,934.64 MiB/s | +| TurboANX 48 | ❌ | 63.3 % | 969.72 MiB/s | - | 2,917.59 MiB/s | +| **rANS32x64 16w 11** | ✔️ | 64.30 % | 225.35 MiB/s | 1.47 clk/byte | 2,907.73 MiB/s | +| TurboANX 40 | ❌ | 63.2 % | 964.45 MiB/s | - | 2,883.45 MiB/s | +| **rANS32x64 16w 12** | ✔️ | 63.73 % | 230.37 MiB/s | 1.50 clk/byte | 2,856.76 MiB/s | +| TurboANX 32 | ❌ | 66.4 % | 951.53 MiB/s | - | 2,856.26 MiB/s | +| **rANS32x32 16w 10 (raw)** | ✔️ | 65.97 % | 328.77 MiB/s | 1.52 clk/byte | 2,822.60 MiB/s | +| **rANS32x32 16w 11 (raw)** | ✔️ | 64.48 % | 332.10 MiB/s | 1.52 clk/byte | 2,817.60 MiB/s | +| **rANS32x32 16w 12 (raw)** | ✔️ | 63.83 % | 341.70 MiB/s | 1.53 clk/byte | 2,800.63 MiB/s | +| TurboANX 24 | ❌ | 63.0 % | 936.12 MiB/s | - | 2,765.31 MiB/s | +| **rANS32x32 16w 10** | ✔️ | 65.56 % | 237.21 MiB/s | 1.55 clk/byte | 2,765.18 MiB/s | +| **rANS32x32 16w 11** | ✔️ | 64.30 % | 238.29 MiB/s | 1.57 clk/byte | 2,735.12 MiB/s | +| **rANS32x32 16w 12** | ✔️ | 63.71 % | 243.00 MiB/s | 1.62 clk/byte | 2,642.01 MiB/s 
| +| TurboANX 16 | ❌ | 62.8 % | 902.32 MiB/s | - | 2,631.85 MiB/s | +| FSE Huff0 | ✔️ | 63.4 % | 1,581.32 MiB/s | - | 2,515.23 MiB/s | +| htscodecs rans32avx2 0 | ✔️ | 63.5 % | 1,041.93 MiB/s | - | 2,374.04 MiB/s | +| TurboANX 8 | ❌ | 62.7 % | 823.76 MiB/s | - | 2,347.10 MiB/s | +| htscodecs rans32avx512 0 | ✔️ | 63.5 % | 796.70 MiB/s | - | 2,221.93 MiB/s | +| htscodecs rans32sse 0 | ✔️ | 63.5 % | 732.08 MiB/s | - | 1,948.66 MiB/s | +| TurboANX 4 | ❌ | 63.0 % | 706.92 MiB/s | - | 1,929.18 MiB/s | +| **rANS32x64 16w 14 (raw)** | ✔️ | 63.55 % | 350.13 MiB/s | 2.22 clk/byte | 1,926.82 MiB/s | +| **rANS32x64 16w 13 (raw)** | ✔️ | 63.61 % | 345.16 MiB/s | 2.23 clk/byte | 1,924.81 MiB/s | +| **rANS32x64 16w 15 (raw)** | ✔️ | 63.57 % | 340.96 MiB/s | 2.30 clk/byte | 1,861.57 MiB/s | +| **rANS32x64 16w 13** | ✔️ | 63.53 % | 232.05 MiB/s | 2.32 clk/byte | 1,846.34 MiB/s | +| **rANS32x64 16w 14** | ✔️ | 63.47 % | 235.14 MiB/s | 2.33 clk/byte | 1,837.19 MiB/s | +| **rANS32x32 16w 13 (raw)** | ✔️ | 63.61 % | 344.26 MiB/s | 2.35 clk/byte | 1,818.86 MiB/s | +| **rANS32x32 16w 14 (raw)** | ✔️ | 63.55 % | 324.44 MiB/s | 2.37 clk/byte | 1,810.24 MiB/s | +| **rANS32x32 16w 14** | ✔️ | 63.45 % | 252.28 MiB/s | 2.42 clk/byte | 1,772.88 MiB/s | +| **rANS32x32 16w 13** | ✔️ | 63.52 % | 249.07 MiB/s | 2.42 clk/byte | 1,772.30 MiB/s | +| **rANS32x64 16w 15** | ✔️ | 63.48 % | 235.02 MiB/s | 2.46 clk/byte | 1,744.39 MiB/s | +| **rANS32x32 16w 15 (raw)** | ✔️ | 63.57 % | 336.51 MiB/s | 2.55 clk/byte | 1,679.08 MiB/s | +| **rANS32x32 16w 15** | ✔️ | 63.50 % | 250.86 MiB/s | 2.64 clk/byte | 1,622.75 MiB/s | +| TurboANX 2 | ❌ | 64.0 % | 656.86 MiB/s | - | 1,416.33 MiB/s | +| FSE | ✔️ | 63.2 % | 736.10 MiB/s | - | 966.58 MiB/s | +| TurboANX 1 | ❌ | 66.4 % | 522.13 MiB/s | - | 942.43 MiB/s | +| htscodecs rans32avx512 1 | ✔️ | 51.6 % | 168.22 MiB/s | - | 322.22 MiB/s | +| htscodecs rans32avx2 1 | ✔️ | 51.6 % | 177.36 MiB/s | - | 319.15 MiB/s | +| FastHF | ✔️ | 63.6 % | 189.84 MiB/s | - | 151.62 
MiB/s | +| FastAC | ✔️ | 63.2 % | 223.06 MiB/s | - | 84.37 MiB/s | +| htscodecs arith_dyn 1 | ✔️ | 47.8 % | 89.60 MiB/s | - | 81.63 MiB/s | +| htscodecs arith_dyn 0 | ✔️ | 62.0 % | 88.09 MiB/s | - | 75.05 MiB/s | ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus) -| Codec Type | License | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | +| Codec Type | Open-
Source | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 11 (raw)** | BSD-2 | 82.60 % | 311.60 MiB/s | 1.39 clk/byte | 3,079.98 MiB/s | -| **rANS32x64 16w 12** | BSD-2 | 80.17 % | 193.60 MiB/s | 1.41 clk/byte | 3,048.15 MiB/s | -| **rANS32x64 16w 12 (raw)** | BSD-2 | 82.57 % | 308.10 MiB/s | 1.41 clk/byte | 3,041.07 MiB/s | -| **rANS32x64 16w 10** | BSD-2 | 80.81 % | 193.28 MiB/s | 1.41 clk/byte | 3,040.97 MiB/s | -| **rANS32x64 16w 10 (raw)** | BSD-2 | 82.83 % | 305.96 MiB/s | 1.42 clk/byte | 3,027.01 MiB/s | -| **rANS32x64 16w 11** | BSD-2 | 80.24 % | 186.41 MiB/s | 1.42 clk/byte | 3,015.25 MiB/s | -| TurboANX 63 | - | 79.6 % | 989.68 MiB/s | - | 2,966.83 MiB/s | -| TurboANX 48 | - | 79.6 % | 979.24 MiB/s | - | 2,923.90 MiB/s | -| TurboANX 40 | - | 79.7 % | 982.57 MiB/s | - | 2,904.99 MiB/s | -| **rANS32x32 16w 11 (raw)** | BSD-2 | 82.60 % | 303.34 MiB/s | 1.48 clk/byte | 2,886.18 MiB/s | -| **rANS32x32 16w 10 (raw)** | BSD-2 | 82.83 % | 301.23 MiB/s | 1.49 clk/byte | 2,881.42 MiB/s | -| **rANS32x32 16w 12 (raw)** | BSD-2 | 82.57 % | 307.10 MiB/s | 1.49 clk/byte | 2,872.78 MiB/s | -| TurboANX 32 | - | 79.7 % | 973.82 MiB/s | - | 2,860.76 MiB/s | -| **rANS32x32 16w 10** | BSD-2 | 80.81 % | 192.99 MiB/s | 1.51 clk/byte | 2,841.71 MiB/s | -| **rANS32x32 16w 11** | BSD-2 | 80.24 % | 190.01 MiB/s | 1.51 clk/byte | 2,834.43 MiB/s | -| **rANS32x32 16w 12** | BSD-2 | 80.53 % | 195.09 MiB/s | 1.54 clk/byte | 2,787.94 MiB/s | -| TurboANX 24 | - | 79.8 % | 962.68 MiB/s | - | 2,785.82 MiB/s | -| TurboANX 16 | - | 79.9 % | 937.33 MiB/s | - | 2,661.07 MiB/s | -| TurboANX 8 | - | 80.5 % | 864.63 MiB/s | - | 2,360.30 MiB/s | -| htscodecs rans32avx2 0 | BSD-3 | 80.6 % | 966.58 MiB/s | - | 2,244.87 MiB/s | -| htscodecs rans32avx512 0 | BSD-3 | 80.6 % | 739.14 MiB/s | - | 2,139.47 MiB/s | -| FSE Huff0 | BSD-2 | 80.0 % | 1,395.71 MiB/s | - | 1,946.34 MiB/s | -| htscodecs rans32sse 0 | BSD-3 | 80.6 % | 723.48 MiB/s | - | 1,914.15 MiB/s | -| **rANS32x64 16w 13 (raw)** | 
BSD-2 | 82.57 % | 305.45 MiB/s | 2.24 clk/byte | 1,910.60 MiB/s | -| **rANS32x64 16w 14 (raw)** | BSD-2 | 82.58 % | 308.96 MiB/s | 2.25 clk/byte | 1,903.66 MiB/s | -| **rANS32x64 16w 13** | BSD-2 | 79.98 % | 191.74 MiB/s | 2.26 clk/byte | 1,892.64 MiB/s | -| TurboANX 4 | - | 81.9 % | 677.08 MiB/s | - | 1,883.40 MiB/s | -| **rANS32x32 16w 13 (raw)** | BSD-2 | 82.57 % | 305.00 MiB/s | 2.29 clk/byte | 1,870.26 MiB/s | -| **rANS32x64 16w 15 (raw)** | BSD-2 | 82.63 % | 307.44 MiB/s | 2.30 clk/byte | 1,865.65 MiB/s | -| **rANS32x32 16w 14 (raw)** | BSD-2 | 82.58 % | 306.18 MiB/s | 2.30 clk/byte | 1,865.18 MiB/s | -| **rANS32x64 16w 14** | BSD-2 | 80.02 % | 192.71 MiB/s | 2.30 clk/byte | 1,861.42 MiB/s | -| **rANS32x32 16w 13** | BSD-2 | 80.01 % | 196.93 MiB/s | 2.37 clk/byte | 1,808.33 MiB/s | -| **rANS32x64 16w 15** | BSD-2 | 80.25 % | 193.85 MiB/s | 2.42 clk/byte | 1,773.42 MiB/s | -| **rANS32x32 16w 14** | BSD-2 | 80.06 % | 198.86 MiB/s | 2.42 clk/byte | 1,767.12 MiB/s | -| **rANS32x32 16w 15 (raw)** | BSD-2 | 82.63 % | 304.21 MiB/s | 2.44 clk/byte | 1,758.57 MiB/s | -| **rANS32x32 16w 15** | BSD-2 | 80.06 % | 191.91 MiB/s | 2.70 clk/byte | 1,585.77 MiB/s | -| TurboANX 2 | - | 83.7 % | 600.46 MiB/s | - | 1,292.65 MiB/s | -| FSE | BSD-2 | 80.3 % | 696.88 MiB/s | - | 990.39 MiB/s | -| TurboANX 1 | - | 85.1 % | 387.40 MiB/s | - | 719.84 MiB/s | -| htscodecs rans32avx2 1 | BSD-3 | 74.4 % | 114.89 MiB/s | - | 229.78 MiB/s | -| htscodecs rans32avx512 1 | BSD-3 | 74.4 % | 104.87 MiB/s | - | 220.91 MiB/s | -| FastHF | Custom | 80.0 % | 183.35 MiB/s | - | 144.30 MiB/s | -| FastAC | Custom | 79.7 % | 244.35 MiB/s | - | 77.33 MiB/s | -| htscodecs arith_dyn 1 | BSD-3 | 67.6 % | 45.13 MiB/s | - | 45.67 MiB/s | -| htscodecs arith_dyn 0 | BSD-3 | 79.6 % | 47.12 MiB/s | - | 45.40 MiB/s | +| **rANS32x64 16w 11 (raw)** | ✔️ | 82.60 % | 311.60 MiB/s | 1.39 clk/byte | 3,079.98 MiB/s | +| **rANS32x64 16w 12** | ✔️ | 80.17 % | 193.60 MiB/s | 1.41 clk/byte | 3,048.15 MiB/s | +| **rANS32x64 
16w 12 (raw)** | ✔️ | 82.57 % | 308.10 MiB/s | 1.41 clk/byte | 3,041.07 MiB/s | +| **rANS32x64 16w 10** | ✔️ | 80.81 % | 193.28 MiB/s | 1.41 clk/byte | 3,040.97 MiB/s | +| **rANS32x64 16w 10 (raw)** | ✔️ | 82.83 % | 305.96 MiB/s | 1.42 clk/byte | 3,027.01 MiB/s | +| **rANS32x64 16w 11** | ✔️ | 80.24 % | 186.41 MiB/s | 1.42 clk/byte | 3,015.25 MiB/s | +| TurboANX 63 | ❌ | 79.6 % | 989.68 MiB/s | - | 2,966.83 MiB/s | +| TurboANX 48 | ❌ | 79.6 % | 979.24 MiB/s | - | 2,923.90 MiB/s | +| TurboANX 40 | ❌ | 79.7 % | 982.57 MiB/s | - | 2,904.99 MiB/s | +| **rANS32x32 16w 11 (raw)** | ✔️ | 82.60 % | 303.34 MiB/s | 1.48 clk/byte | 2,886.18 MiB/s | +| **rANS32x32 16w 10 (raw)** | ✔️ | 82.83 % | 301.23 MiB/s | 1.49 clk/byte | 2,881.42 MiB/s | +| **rANS32x32 16w 12 (raw)** | ✔️ | 82.57 % | 307.10 MiB/s | 1.49 clk/byte | 2,872.78 MiB/s | +| TurboANX 32 | ❌ | 79.7 % | 973.82 MiB/s | - | 2,860.76 MiB/s | +| **rANS32x32 16w 10** | ✔️ | 80.81 % | 192.99 MiB/s | 1.51 clk/byte | 2,841.71 MiB/s | +| **rANS32x32 16w 11** | ✔️ | 80.24 % | 190.01 MiB/s | 1.51 clk/byte | 2,834.43 MiB/s | +| **rANS32x32 16w 12** | ✔️ | 80.53 % | 195.09 MiB/s | 1.54 clk/byte | 2,787.94 MiB/s | +| TurboANX 24 | ❌ | 79.8 % | 962.68 MiB/s | - | 2,785.82 MiB/s | +| TurboANX 16 | ❌ | 79.9 % | 937.33 MiB/s | - | 2,661.07 MiB/s | +| TurboANX 8 | ❌ | 80.5 % | 864.63 MiB/s | - | 2,360.30 MiB/s | +| htscodecs rans32avx2 0 | ✔️ | 80.6 % | 966.58 MiB/s | - | 2,244.87 MiB/s | +| htscodecs rans32avx512 0 | ✔️ | 80.6 % | 739.14 MiB/s | - | 2,139.47 MiB/s | +| FSE Huff0 | ✔️ | 80.0 % | 1,395.71 MiB/s | - | 1,946.34 MiB/s | +| htscodecs rans32sse 0 | ✔️ | 80.6 % | 723.48 MiB/s | - | 1,914.15 MiB/s | +| **rANS32x64 16w 13 (raw)** | ✔️ | 82.57 % | 305.45 MiB/s | 2.24 clk/byte | 1,910.60 MiB/s | +| **rANS32x64 16w 14 (raw)** | ✔️ | 82.58 % | 308.96 MiB/s | 2.25 clk/byte | 1,903.66 MiB/s | +| **rANS32x64 16w 13** | ✔️ | 79.98 % | 191.74 MiB/s | 2.26 clk/byte | 1,892.64 MiB/s | +| TurboANX 4 | ❌ | 81.9 % | 677.08 MiB/s | - | 
1,883.40 MiB/s | +| **rANS32x32 16w 13 (raw)** | ✔️ | 82.57 % | 305.00 MiB/s | 2.29 clk/byte | 1,870.26 MiB/s | +| **rANS32x64 16w 15 (raw)** | ✔️ | 82.63 % | 307.44 MiB/s | 2.30 clk/byte | 1,865.65 MiB/s | +| **rANS32x32 16w 14 (raw)** | ✔️ | 82.58 % | 306.18 MiB/s | 2.30 clk/byte | 1,865.18 MiB/s | +| **rANS32x64 16w 14** | ✔️ | 80.02 % | 192.71 MiB/s | 2.30 clk/byte | 1,861.42 MiB/s | +| **rANS32x32 16w 13** | ✔️ | 80.01 % | 196.93 MiB/s | 2.37 clk/byte | 1,808.33 MiB/s | +| **rANS32x64 16w 15** | ✔️ | 80.25 % | 193.85 MiB/s | 2.42 clk/byte | 1,773.42 MiB/s | +| **rANS32x32 16w 14** | ✔️ | 80.06 % | 198.86 MiB/s | 2.42 clk/byte | 1,767.12 MiB/s | +| **rANS32x32 16w 15 (raw)** | ✔️ | 82.63 % | 304.21 MiB/s | 2.44 clk/byte | 1,758.57 MiB/s | +| **rANS32x32 16w 15** | ✔️ | 80.06 % | 191.91 MiB/s | 2.70 clk/byte | 1,585.77 MiB/s | +| TurboANX 2 | ❌ | 83.7 % | 600.46 MiB/s | - | 1,292.65 MiB/s | +| FSE | ✔️ | 80.3 % | 696.88 MiB/s | - | 990.39 MiB/s | +| TurboANX 1 | ❌ | 85.1 % | 387.40 MiB/s | - | 719.84 MiB/s | +| htscodecs rans32avx2 1 | ✔️ | 74.4 % | 114.89 MiB/s | - | 229.78 MiB/s | +| htscodecs rans32avx512 1 | ✔️ | 74.4 % | 104.87 MiB/s | - | 220.91 MiB/s | +| FastHF | ✔️ | 80.0 % | 183.35 MiB/s | - | 144.30 MiB/s | +| FastAC | ✔️ | 79.7 % | 244.35 MiB/s | - | 77.33 MiB/s | +| htscodecs arith_dyn 1 | ✔️ | 67.6 % | 45.13 MiB/s | - | 45.67 MiB/s | +| htscodecs arith_dyn 0 | ✔️ | 79.6 % | 47.12 MiB/s | - | 45.40 MiB/s | ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus) -| Codec Type | Ratio | Encoder
Clocks/Byte | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | +| Codec Type | Open-
Source | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | | -- | --: | --: | --: | --: | --: | -| **rANS32x64 16w 11** | 77.82 % | 13.84 clk/byte | 309.39 MiB/s | 1.44 clk/byte | 2,978.20 MiB/s | -| **rANS32x64 16w 10** | 77.92 % | 14.16 clk/byte | 302.46 MiB/s | 1.44 clk/byte | 2,968.99 MiB/s | -| TurboANX 63 | 70.1 % | - | 965.97 MiB/s | - | 2,959.13 MiB/s | -| **rANS32x64 16w 12** | 77.79 % | 14.21 clk/byte | 301.44 MiB/s | 1.45 clk/byte | 2,946.52 MiB/s | -| TurboANX 48 | 69.6 % | - | 954.87 MiB/s | - | 2,911.55 MiB/s | -| **rANS32x32 16w 10** | 77.92 % | 13.97 clk/byte | 306.54 MiB/s | 1.49 clk/byte | 2,878.05 MiB/s | -| TurboANX 40 | 69.3 % | - | 941.29 MiB/s | - | 2,869.21 MiB/s | -| **rANS32x32 16w 11** | 77.82 % | 14.34 clk/byte | 298.79 MiB/s | 1.49 clk/byte | 2,867.33 MiB/s | -| TurboANX 32 | 68.9 % | - | 927.04 MiB/s | - | 2,815.87 MiB/s | -| **rANS32x32 16w 12** | 77.79 % | 14.25 clk/byte | 300.51 MiB/s | 1.54 clk/byte | 2,782.35 MiB/s | -| TurboANX 24 | 68.4 % | - | 900.92 MiB/s | - | 2,732.74 MiB/s | -| TurboANX 16 | 67.9 % | - | 854.34 MiB/s | - | 2,582.05 MiB/s | -| htscodecs_rans32avx2 0 | 69.5 % | - | 1,014.19 MiB/s | - | 2,250.58 MiB/s | -| TurboANX 8 | 67.2 % | - | 748.14 MiB/s | - | 2,183.29 MiB/s | -| htscodecs_rans32avx512 0 | 69.5 % | - | 760.33 MiB/s | - | 2,115.31 MiB/s | -| fsehuf | 69.2 % | - | 1,491.60 MiB/s | - | 2,092.00 MiB/s | -| **rANS32x32 16w 14** | 77.79 % | 14.02 clk/byte | 305.49 MiB/s | 2.37 clk/byte | 1,804.10 MiB/s | -| **rANS32x64 16w 14** | 77.79 % | 14.09 clk/byte | 303.97 MiB/s | 2.26 clk/byte | 1,891.46 MiB/s | -| htscodecs_rans32sse 0 | 69.5 % | - | 724.39 MiB/s | - | 1,884.40 MiB/s | -| **rANS32x64 16w 13** | 77.79 % | 13.89 clk/byte | 308.28 MiB/s | 2.27 clk/byte | 1,883.91 MiB/s | -| **rANS32x64 16w 15** | 77.85 % | 13.86 clk/byte | 309.13 MiB/s | 2.31 clk/byte | 1,855.74 MiB/s | -| **rANS32x32 16w 13** | 77.78 % | 14.13 clk/byte | 303.23 MiB/s | 2.37 clk/byte | 1,806.03 MiB/s | -| **rANS32x32 16w 15** | 77.84 % | 14.29 clk/byte | 299.78 MiB/s | 2.46 
clk/byte | 1,743.60 MiB/s | -| TurboANX 4 | 67.3 % | - | 603.91 MiB/s | - | 1,658.68 MiB/s | -| TurboANX 2 | 68.5 % | - | 556.95 MiB/s | - | 1,106.06 MiB/s | -| fse | 69.3 % | - | 713.08 MiB/s | - | 973.71 MiB/s | -| TurboANX 1 | 71.6 % | - | 392.67 MiB/s | - | 677.10 MiB/s | -| htscodecs_rans32avx512 1 | 55.7 % | - | 81.02 MiB/s | - | 168.42 MiB/s | -| htscodecs_rans32avx2 1 | 55.7 % | - | 83.68 MiB/s | - | 167.19 MiB/s | -| FastHF | 71.8 % | - | 174.86 MiB/s | - | 130.78 MiB/s | -| FastAC | 70.7 % | - | 234.95 MiB/s | - | 81.01 MiB/s | -| htscodecs_arith_dyn 1 | 52.1 % | - | 62.87 MiB/s | - | 62.98 MiB/s | -| htscodecs_arith_dyn 0 | 66.4 % | - | 63.82 MiB/s | - | 59.92 MiB/s | +| **rANS32x64 16w 11 (raw)** | ✔️ | 77.82 % | 309.39 MiB/s | 1.44 clk/byte | 2,978.20 MiB/s | +| TurboANX 63 | ❌ | 70.1 % | 965.97 MiB/s | - | 2,959.13 MiB/s | +| **rANS32x64 16w 12 (raw)** | ✔️ | 77.79 % | 308.29 MiB/s | 1.45 clk/byte | 2,946.52 MiB/s | +| **rANS32x64 16w 10** | ✔️ | 76.51 % | 206.82 MiB/s | 1.46 clk/byte | 2,927.30 MiB/s | +| **rANS32x64 16w 10 (raw)** | ✔️ | 77.93 % | 302.67 MiB/s | 1.47 clk/byte | 2,916.71 MiB/s | +| TurboANX 48 | ❌ | 69.6 % | 954.87 MiB/s | - | 2,911.55 MiB/s | +| **rANS32x64 16w 11** | ✔️ | 75.36 % | 201.83 MiB/s | 1.48 clk/byte | 2,894.53 MiB/s | +| TurboANX 40 | ❌ | 69.3 % | 941.29 MiB/s | - | 2,869.21 MiB/s | +| **rANS32x32 16w 11 (raw)** | ✔️ | 77.82 % | 298.79 MiB/s | 1.49 clk/byte | 2,867.33 MiB/s | +| **rANS32x32 16w 10 (raw)** | ✔️ | 77.93 % | 300.28 MiB/s | 1.52 clk/byte | 2,826.16 MiB/s | +| TurboANX 32 | ❌ | 68.9 % | 927.04 MiB/s | - | 2,815.87 MiB/s | +| **rANS32x32 16w 12 (raw)** | ✔️ | 77.79 % | 305.16 MiB/s | 1.54 clk/byte | 2,782.36 MiB/s | +| **rANS32x32 16w 10** | ✔️ | 76.51 % | 205.76 MiB/s | 1.55 clk/byte | 2,757.67 MiB/s | +| **rANS32x32 16w 11** | ✔️ | 75.36 % | 205.29 MiB/s | 1.57 clk/byte | 2,733.71 MiB/s | +| TurboANX 24 | ❌ | 68.4 % | 900.92 MiB/s | - | 2,732.74 MiB/s | +| **rANS32x64 16w 12** | ✔️ | 72.16 % | 198.68 MiB/s | 
1.63 clk/byte | 2,631.17 MiB/s | +| TurboANX 16 | ❌ | 67.9 % | 854.34 MiB/s | - | 2,582.05 MiB/s | +| **rANS32x32 16w 12** | ✔️ | 71.21 % | 202.45 MiB/s | 1.85 clk/byte | 2,319.35 MiB/s | +| htscodecs rans32avx2 0 | ✔️ | 69.5 % | 1,014.19 MiB/s | - | 2,250.58 MiB/s | +| TurboANX 8 | ❌ | 67.2 % | 748.14 MiB/s | - | 2,183.29 MiB/s | +| htscodecs rans32avx512 0 | ✔️ | 69.5 % | 760.33 MiB/s | - | 2,115.31 MiB/s | +| FSE Huff0 | ✔️ | 69.2 % | 1,491.60 MiB/s | - | 2,092.00 MiB/s | +| **rANS32x64 16w 14 (raw)** | ✔️ | 77.79 % | 307.05 MiB/s | 2.26 clk/byte | 1,891.46 MiB/s | +| htscodecs rans32sse 0 | ✔️ | 69.5 % | 724.39 MiB/s | - | 1,884.40 MiB/s | +| **rANS32x64 16w 13 (raw)** | ✔️ | 77.79 % | 308.28 MiB/s | 2.27 clk/byte | 1,883.91 MiB/s | +| **rANS32x64 16w 15 (raw)** | ✔️ | 77.85 % | 309.13 MiB/s | 2.31 clk/byte | 1,855.74 MiB/s | +| **rANS32x32 16w 13 (raw)** | ✔️ | 77.78 % | 306.95 MiB/s | 2.35 clk/byte | 1,824.85 MiB/s | +| **rANS32x32 16w 14 (raw)** | ✔️ | 77.79 % | 302.09 MiB/s | 2.35 clk/byte | 1,818.82 MiB/s | +| **rANS32x64 16w 13** | ✔️ | 73.22 % | 199.90 MiB/s | 2.43 clk/byte | 1,763.01 MiB/s | +| **rANS32x32 16w 15 (raw)** | ✔️ | 77.84 % | 301.06 MiB/s | 2.44 clk/byte | 1,758.41 MiB/s | +| **rANS32x32 16w 13** | ✔️ | 73.24 % | 204.45 MiB/s | 2.54 clk/byte | 1,688.64 MiB/s | +| **rANS32x64 16w 14** | ✔️ | 73.23 % | 199.48 MiB/s | 2.56 clk/byte | 1,672.95 MiB/s | +| TurboANX 4 | ❌ | 67.3 % | 603.91 MiB/s | - | 1,658.68 MiB/s | +| **rANS32x32 16w 14** | ✔️ | 73.27 % | 204.91 MiB/s | 2.66 clk/byte | 1,611.11 MiB/s | +| **rANS32x32 16w 15** | ✔️ | 74.38 % | 204.20 MiB/s | 2.78 clk/byte | 1,543.54 MiB/s | +| **rANS32x64 16w 15** | ✔️ | 72.21 % | 198.42 MiB/s | 3.18 clk/byte | 1,345.59 MiB/s | +| TurboANX 2 | ❌ | 68.5 % | 556.95 MiB/s | - | 1,106.06 MiB/s | +| FSE | ✔️ | 69.3 % | 713.08 MiB/s | - | 973.71 MiB/s | +| TurboANX 1 | ❌ | 71.6 % | 392.67 MiB/s | - | 677.10 MiB/s | +| htscodecs rans32avx512 1 | ✔️ | 55.7 % | 81.02 MiB/s | - | 168.42 MiB/s | +| 
htscodecs rans32avx2 1 | ✔️ | 55.7 % | 83.68 MiB/s | - | 167.19 MiB/s | +| FastHF | ✔️ | 71.8 % | 174.86 MiB/s | - | 130.78 MiB/s | +| FastAC | ✔️ | 70.7 % | 234.95 MiB/s | - | 81.01 MiB/s | +| htscodecs arith_dyn 1 | ✔️ | 52.1 % | 62.87 MiB/s | - | 62.98 MiB/s | +| htscodecs arith_dyn 0 | ✔️ | 66.4 % | 63.82 MiB/s | - | 59.92 MiB/s | ## Easy Multithreading hypersonic-rANS includes a variant that's encodes blocks independently (at the expense of compression ratio) allowing for easy multithreading. ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus) | Codec Type | Ratio | Encoder
Throughput | Decoder
Clocks/Byte | Decoder
Throughput | -| -- | --: | --: | --: | --: | --: | -| rANS32x64 16w 10 mt | 80.23 % | 200.03 MiB/s | 0.24 clk/byte | 18,035.77 MiB/s | -| rANS32x32 16w 10 mt | 80.17 % | 194.73 MiB/s | 0.25 clk/byte | 17,834.38 MiB/s | -| rANS32x64 16w 11 mt | 80.08 % | 202.10 MiB/s | 0.26 clk/byte | 16,210.44 MiB/s | -| rANS32x32 16w 11 mt | 80.02 % | 191.90 MiB/s | 0.27 clk/byte | 15,630.58 MiB/s | -| rANS32x64 16w 12 mt | 80.05 % | 197.62 MiB/s | 0.34 clk/byte | 13,207.00 MiB/s | -| rANS32x32 16w 12 mt | 79.99 % | 197.21 MiB/s | 0.36 clk/byte | 12,358.57 MiB/s | -| rANS32x64 16w 13 mt | 80.04 % | 199.94 MiB/s | 0.37 clk/byte | 11,938.77 MiB/s | -| rANS32x32 16w 13 mt | 79.99 % | 195.00 MiB/s | 0.37 clk/byte | 11,497.36 MiB/s | -| rANS32x64 16w 14 mt | 80.05 % | 199.87 MiB/s | 0.42 clk/byte | 10,318.01 MiB/s | -| rANS32x32 16w 14 mt | 80.01 % | 190.94 MiB/s | 0.42 clk/byte | 10,134.59 MiB/s | -| rANS32x64 16w 15 mt | 80.09 % | 200.59 MiB/s | 0.59 clk/byte | 7,308.43 MiB/s | -| rANS32x32 16w 15 mt | 80.03 % | 192.28 MiB/s | 0.62 clk/byte | 7,024.69 MiB/s | +| -- | --: | --: | --: | --: | +| rANS32x64 16w 10 mt | 80.23 % | 200.03 MiB/s | 0.24 clk/byte | **18,035.77 MiB/s** | +| rANS32x32 16w 10 mt | 80.17 % | 194.73 MiB/s | 0.25 clk/byte | **17,834.38 MiB/s** | +| rANS32x64 16w 11 mt | 80.08 % | 202.10 MiB/s | 0.26 clk/byte | **16,210.44 MiB/s** | +| rANS32x32 16w 11 mt | 80.02 % | 191.90 MiB/s | 0.27 clk/byte | **15,630.58 MiB/s** | +| rANS32x64 16w 12 mt | 80.05 % | 197.62 MiB/s | 0.34 clk/byte | **13,207.00 MiB/s** | +| rANS32x32 16w 12 mt | 79.99 % | 197.21 MiB/s | 0.36 clk/byte | **12,358.57 MiB/s** | +| rANS32x64 16w 13 mt | 80.04 % | 199.94 MiB/s | 0.37 clk/byte | **11,938.77 MiB/s** | +| rANS32x32 16w 13 mt | 79.99 % | 195.00 MiB/s | 0.37 clk/byte | **11,497.36 MiB/s** | +| rANS32x64 16w 14 mt | 80.05 % | 199.87 MiB/s | 0.42 clk/byte | **10,318.01 MiB/s** | +| rANS32x32 16w 14 mt | 80.01 % | 190.94 MiB/s | 0.42 clk/byte | **10,134.59 MiB/s** | +| rANS32x64 16w 15 mt | 80.09 
% | 200.59 MiB/s | 0.59 clk/byte | **7,308.43 MiB/s** | +| rANS32x32 16w 15 mt | 80.03 % | 192.28 MiB/s | 0.62 clk/byte | **7,024.69 MiB/s** | ## Building ### On Linux/WSL diff --git a/docs/index.html b/docs/index.html index 8aa291c..ba30a47 100644 --- a/docs/index.html +++ b/docs/index.html @@ -596,103 +596,6 @@

some of the fastest decoding range-based asymetric numeral systems (rANS) co

benchmarks

-
-

enwik8 (wikipedia extract)

-

Order 1 codecs excluded, they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.

- - - -

x-ray (medical x-ray image)

Order 1 codecs excluded, they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.

@@ -742,17 +645,6 @@

x-ray (medical x-ray image)

add_point_no_entropy(obj, 3015.25, 80.24, 186.41, "rANS32x64 16w 11" ); add_point_no_entropy(obj, 3040.97, 80.81, 193.28, "rANS32x64 16w 10" ); - add_point_no_entropy(obj, 2966.83, 79.6, 989.68, "TurboANX 63 (Native Windows Build)", true); - add_point_no_entropy(obj, 2923.90, 79.6, 979.24, "TurboANX 48 (Native Windows Build)", true); - add_point_no_entropy(obj, 2904.99, 79.7, 982.57, "TurboANX 40 (Native Windows Build)", true); - add_point_no_entropy(obj, 2860.76, 79.7, 973.82, "TurboANX 32 (Native Windows Build)", true); - add_point_no_entropy(obj, 2785.82, 79.8, 962.68, "TurboANX 24 (Native Windows Build)", true); - add_point_no_entropy(obj, 2661.07, 79.9, 937.33, "TurboANX 16 (Native Windows Build)", true); - add_point_no_entropy(obj, 2360.30, 80.5, 864.63, "TurboANX 8 (Native Windows Build)", true); - add_point_no_entropy(obj, 1883.40, 81.9, 677.08, "TurboANX 4 (Native Windows Build)", true); - add_point_no_entropy(obj, 1292.65, 83.7, 600.46, "TurboANX 2 (Native Windows Build)", true); - add_point_no_entropy(obj, 719.84, 85.1, 387.40, "TurboANX 1 (Native Windows Build)", true); - add_point_no_entropy(obj, 1914.15, 80.6, 723.48, "htscodecs rans32sse 0" , true); add_point_no_entropy(obj, 2244.87, 80.6, 966.58, "htscodecs rans32avx2 0" , true); add_point_no_entropy(obj, 2139.47, 80.6, 739.14, "htscodecs rans32avx512 0" , true); @@ -768,6 +660,17 @@

x-ray (medical x-ray image)

add_point_no_entropy(obj, 144.30, 80.0, 183.35, "FastHF" , true); add_point_no_entropy(obj, 77.33, 79.7, 244.35, "FastAC" , true); + add_point_no_entropy(obj, 2966.83, 79.6, 989.68, "TurboANX 63 (Native Windows Build)", true); + add_point_no_entropy(obj, 2923.90, 79.6, 979.24, "TurboANX 48 (Native Windows Build)", true); + add_point_no_entropy(obj, 2904.99, 79.7, 982.57, "TurboANX 40 (Native Windows Build)", true); + add_point_no_entropy(obj, 2860.76, 79.7, 973.82, "TurboANX 32 (Native Windows Build)", true); + add_point_no_entropy(obj, 2785.82, 79.8, 962.68, "TurboANX 24 (Native Windows Build)", true); + add_point_no_entropy(obj, 2661.07, 79.9, 937.33, "TurboANX 16 (Native Windows Build)", true); + add_point_no_entropy(obj, 2360.30, 80.5, 864.63, "TurboANX 8 (Native Windows Build)", true); + add_point_no_entropy(obj, 1883.40, 81.9, 677.08, "TurboANX 4 (Native Windows Build)", true); + add_point_no_entropy(obj, 1292.65, 83.7, 600.46, "TurboANX 2 (Native Windows Build)", true); + add_point_no_entropy(obj, 719.84, 85.1, 387.40, "TurboANX 1 (Native Windows Build)", true); + add_line(obj, ["rANS32x32 16w 15", "rANS32x32 16w 14", "rANS32x32 16w 13", "rANS32x32 16w 12", "rANS32x32 16w 11", "rANS32x32 16w 10"], getColor()); add_line(obj, ["rANS32x64 16w 15", "rANS32x64 16w 14", "rANS32x64 16w 13", "rANS32x64 16w 12", "rANS32x64 16w 11", "rANS32x64 16w 10"], getColor()); @@ -787,6 +690,95 @@

x-ray (medical x-ray image)

obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("xray")); });
+
+

enwik8 (wikipedia extract)

+

Order 1 codecs excluded, they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.

+ + + +
\ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index c51e684..7d28257 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -198,19 +198,19 @@ static codec_info_t _Codecs[] = { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper, true }, {}}}, - { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, - { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, 
{}}}, - { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, - { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", 
rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, - { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm 
shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, 
true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, - { "rANS32x32 16w (raw)", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 
(xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, - - { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { 
"dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}}, - { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", 
rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}}, - { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, 
true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}}, - { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp 
gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, - { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm 
shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, - { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", 
rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, 
true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, + { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, + { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, + { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13 }, { "dec avx2 (ymm perm, sym dep gthr)", 
rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}}, + { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 
}, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, + { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", 
rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, + { "rANS32x32 16w (raw)", 10, {{ "enc scalar", 
rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref gthr)", 
rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, + + { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec 
avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}}, + { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}}, + { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}}, + { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", 
rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (xmm shfl2, 1x ymm 
gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}}, + { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep 
gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}}, + { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x 
gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}}, { "rANS32x16 16w (raw)", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", 
rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}}, { "rANS32x16 16w (raw)", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}}, From 199816c669afb9ec076136372be592f5996087d7 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:13:41 +0200 Subject: [PATCH 27/34] Trying to support clang-cl, but that's currently not possible apparently --- premake5.lua | 4 ++-- project.lua | 13 ++++++++++--- src/block_codec32.h | 5 +++-- src/block_codec64.h | 9 +++++---- src/simd_platform.h | 2 +- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/premake5.lua b/premake5.lua index bd29293..8fe4c7c 100644 --- a/premake5.lua +++ b/premake5.lua @@ -7,13 +7,13 @@ solution "hsrans" configurations { "Release", "Debug", "ReleaseClang", "DebugClang" } linkgroups "On" filter { "configurations:*Clang" } - toolset "clang" + toolset "clang" filter { } elseif os.target() == "macosx" then configurations { "Release", "Debug" } toolset "clang" else - configurations { "Debug", "Release" } + configurations { "Release", "Debug", "ReleaseClang", "DebugClang" } end dofile "project.lua" diff --git a/project.lua b/project.lua index fca25fe..63eb160 100644 --- a/project.lua +++ b/project.lua @@ -7,15 +7,22 @@ project(ProjectName) staticruntime "On" filter { "system:windows" } - buildoptions { '/Gm-' } - buildoptions { '/MP' } ignoredefaultlibraries { "msvcrt" } - buildoptions { '/std:c++20' } filter { "system:linux" } buildoptions { "-mxsave" } linkoptions { "-pthread" } cppdialect "C++20" filter { } + + filter { "system:windows", "configurations:not *Clang" } + 
buildoptions { '/std:c++20' } + buildoptions { '/Gm-' } + buildoptions { '/MP' } + + filter { "system:windows", "configurations:*Clang" } + toolset("clang") + cppdialect "C++17" + defines { "__llvm__" } filter { "configurations:Release" } flags { "LinkTimeOptimization" } diff --git a/src/block_codec32.h b/src/block_codec32.h index 1366b5e..71159b4 100644 --- a/src/block_codec32.h +++ b/src/block_codec32.h @@ -2,6 +2,7 @@ #define block_codec32_h__ #include "hist.h" +#include "simd_platform.h" #include @@ -205,7 +206,7 @@ struct rans32x32_16w_decoder -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) __attribute__((target("avx2"))) #endif static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) @@ -441,7 +442,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) __attribute__((target("avx2"))) #endif static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) diff --git a/src/block_codec64.h b/src/block_codec64.h index 4ea95f3..42aaa32 100644 --- a/src/block_codec64.h +++ b/src/block_codec64.h @@ -2,6 +2,7 @@ #define block_codec64_h__ #include "hist.h" +#include "simd_platform.h" #include @@ -216,7 +217,7 @@ struct rans32x64_16w_decoder -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) __attribute__((target("avx2"))) #endif static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) @@ -601,7 +602,7 @@ static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) __attribute__((target("avx2"))) #endif static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t> 
*pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) @@ -976,7 +977,7 @@ static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) #ifdef __llvm__ __attribute__((target("avx512bw"))) #else @@ -1330,7 +1331,7 @@ static size_t _block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t } template -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__llvm__) #ifdef __llvm__ __attribute__((target("avx512bw"))) #else diff --git a/src/simd_platform.h b/src/simd_platform.h index 905eba1..d1efac0 100644 --- a/src/simd_platform.h +++ b/src/simd_platform.h @@ -4,7 +4,7 @@ #include #include -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__llvm__) #include #define __builtin_popcount __popcnt #else From e7c14bc62cd890c67b269d7d8db1fc6e8317bb6b Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:35:14 +0200 Subject: [PATCH 28/34] Adding tests, simd restrictions --- .github/workflows/clang.yml | 61 +++++++++ .github/workflows/gcc.yml | 61 +++++++++ .github/workflows/msbuild.yml | 62 +++++++++ src/main.cpp | 247 ++++++++++++++++++++++++++++++---- 4 files changed, 406 insertions(+), 25 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index c287fc6..40c9951 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -24,3 +24,64 @@ jobs: with: name: build-clang64 path: builds/bin/hsrans + + test: + runs-on: ubuntu-latest + needs: build + + steps: + - uses: actions/download-artifact@v3 + with: + name: build-clang64 + + - name: Download Samples + working-directory: ${{env.GITHUB_WORKSPACE}} + run: git clone https://github.com/rainerzufalldererste/rle_samples.git + + - name: Test sample0 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample0 AVX2 + working-directory: 
${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample0 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample0 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample1 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample1 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample1 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample1 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample2 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample2 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample2 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample2 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none \ No newline at end of file diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index bc38be7..01d3de9 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -25,3 +25,64 @@ jobs: with: name: 
build-gcc64 path: builds/bin/hsrans + + test: + runs-on: ubuntu-latest + needs: build + + steps: + - uses: actions/download-artifact@v3 + with: + name: build-gcc64 + + - name: Download Samples + working-directory: ${{env.GITHUB_WORKSPACE}} + run: git clone https://github.com/rainerzufalldererste/rle_samples.git + + - name: Test sample0 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample0 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample0 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample0 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample1 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample1 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample1 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample1 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample2 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample2 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample2 
SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample2 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml index be996b5..4b58ad4 100644 --- a/.github/workflows/msbuild.yml +++ b/.github/workflows/msbuild.yml @@ -36,3 +36,65 @@ jobs: name: build-win64 path: builds/bin/hsrans.exe + test: + runs-on: windows-latest + needs: build + + steps: + - uses: actions/download-artifact@v3 + with: + name: build-win64 + + - name: Download Samples + working-directory: ${{env.GITHUB_WORKSPACE}} + run: git clone https://github.com/rainerzufalldererste/rle_samples.git + + - name: Test sample0 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample0 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample0 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample0 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample1 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample1 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample1 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - 
name: Test sample1 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + + - name: Test sample2 AVX512BW + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + + - name: Test sample2 AVX2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + + - name: Test sample2 SSE4.2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + + - name: Test sample2 No-SIMD + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none + + diff --git a/src/main.cpp b/src/main.cpp index 7d28257..e2e67f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -58,6 +58,7 @@ static bool _ExcludeBlock = false; static bool _Exclude32x16 = false; static bool _Exclude32x32 = false; static bool _Exclude32x64 = false; +static bool _IsTest = false; static size_t _RunCount = 8; static size_t _EncodeRunCount = 2; static size_t _DecodeRunCount = 16; @@ -168,7 +169,7 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe return func(pInData, inLength, pOutData, outCapacity, _pGlobalThreadPool); } -static codec_info_t _Codecs[] = +static const codec_info_t _Codecs[] = { { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}}, { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}}, @@ -251,6 +252,18 @@ const char ArgumentCpuCore[] = "--cpu-core"; const char ArgumentRuns[] = "--runs"; const char ArgumentRunsEncode[] = "--runs-enc"; const char ArgumentRunsDecode[] = "--runs-dec"; +const char 
ArgumentTest[] = "--test"; +const char ArgumentMaxSimd[] = "--max-simd"; +const char ArgumentMaxSimdAVX512BW[] = "avx512bw"; +const char ArgumentMaxSimdAVX512F[] = "avx512f"; +const char ArgumentMaxSimdAVX2[] = "avx2"; +const char ArgumentMaxSimdAVX[] = "avx"; +const char ArgumentMaxSimdSSE42[] = "sse4.2"; +const char ArgumentMaxSimdSSE41[] = "sse4.1"; +const char ArgumentMaxSimdSSSE3[] = "ssse3"; +const char ArgumentMaxSimdSSE3[] = "sse3"; +const char ArgumentMaxSimdSSE2[] = "sse2"; +const char ArgumentMaxSimdNone[] = "none"; ////////////////////////////////////////////////////////////////////////// @@ -273,7 +286,9 @@ int32_t main(const int32_t argc, char **pArgv) printf("\t%s \t\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode); printf("\t%s \tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode); printf("\t%s \tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode); - printf("\t%s \tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep); + printf("\t%s\t\tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep); + printf("\t%s\t\t\tRun as test scenario, fail on error, call codecs\n", ArgumentTest); + printf("\t%s <%s / %s / %s / %s / %s / %s / %s / %s / %s / %s>\n\t\t\t\tRestrict SIMD functions to specific instruction set\n", ArgumentMaxSimd, ArgumentMaxSimdAVX512BW, ArgumentMaxSimdAVX512F, ArgumentMaxSimdAVX2, ArgumentMaxSimdAVX, ArgumentMaxSimdSSE42, ArgumentMaxSimdSSE41, ArgumentMaxSimdSSSE3, ArgumentMaxSimdSSE3, ArgumentMaxSimdSSE2, ArgumentMaxSimdNone); return 1; } @@ -341,6 +356,19 @@ int32_t main(const int32_t argc, char **pArgv) argsRemaining--; _DisableSleep = true; } + else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentTest, sizeof(ArgumentTest)) == 0) + { + 
argIndex++; + argsRemaining--; + _IsTest = true; + _DisableSleep = true; + _EncodeRunCount = 1; + _DecodeRunCount = 1; + _Include32Block = true; + _IncludeRaw = true; + _IncludeMT = true; + _OnlyRelevantCodecs = false; + } else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRuns, sizeof(ArgumentRuns)) == 0) { _RunCount = strtoull(pArgv[argIndex + 1], nullptr, 10); @@ -432,6 +460,162 @@ int32_t main(const int32_t argc, char **pArgv) pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); #endif } + else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentMaxSimd, sizeof(ArgumentMaxSimd)) == 0) + { + _DetectCPUFeatures(); + + do + { + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX512BW, sizeof(ArgumentMaxSimdAVX512BW)) == 0) + { + if (!avx512BWSupported) + { + puts("AVX512BW is not supported by this platform. Aborting."); + return 1; + } + + // In future versions with other simd flavours better than avx512 supported, disable them here. + + break; + } + + avx512PFSupported = false; + avx512ERSupported = false; + avx512CDSupported = false; + avx512BWSupported = false; + avx512DQSupported = false; + avx512VLSupported = false; + avx512IFMASupported = false; + avx512VBMISupported = false; + avx512VNNISupported = false; + avx512VBMI2Supported = false; + avx512POPCNTDQSupported = false; + avx512BITALGSupported = false; + avx5124VNNIWSupported = false; + avx5124FMAPSSupported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX512F, sizeof(ArgumentMaxSimdAVX512F)) == 0) + { + if (!avx512FSupported) + { + puts("AVX512F is not supported by this platform. Aborting."); + return 1; + } + + // In future versions with other simd flavours better than avx512 supported, disable them here. + + break; + } + + avx512FSupported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX2, sizeof(ArgumentMaxSimdAVX2)) == 0) + { + if (!avx2Supported) + { + puts("AVX2 is not supported by this platform. 
Aborting."); + return 1; + } + + break; + } + + avx2Supported = false; + fma3Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX, sizeof(ArgumentMaxSimdAVX)) == 0) + { + if (!avxSupported) + { + puts("AVX is not supported by this platform. Aborting."); + return 1; + } + + break; + } + + avxSupported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE42, sizeof(ArgumentMaxSimdSSE42)) == 0) + { + if (!sse42Supported) + { + puts("SSE4.2 is not supported by this platform. Aborting."); + return 1; + } + + break; + } + + sse42Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE41, sizeof(ArgumentMaxSimdSSE41)) == 0) + { + if (!sse41Supported) + { + puts("SSE4.1 is not supported by this platform. Aborting."); + return 1; + } + + break; + } + + sse41Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSSE3, sizeof(ArgumentMaxSimdSSSE3)) == 0) + { + if (!ssse3Supported) + { + puts("SSSE3 is not supported by this platform. Aborting."); + return 1; + } + + break; + } + + ssse3Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE3, sizeof(ArgumentMaxSimdSSE3)) == 0) + { + if (!sse3Supported) + { + puts("SSE3 is not supported by this platform. Aborting."); + return 1; + } + + break; + } + + sse3Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE2, sizeof(ArgumentMaxSimdSSE2)) == 0) + { + if (!sse2Supported) + { + puts("SSE2 is not supported by this platform. 
Aborting."); + return 1; + } + + break; + } + + sse2Supported = false; + + if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdNone, sizeof(ArgumentMaxSimdNone)) == 0) + { + printf("%s %s is only intended for testing purposes and will only restrict some codecs to no SIMD\n", ArgumentMaxSimd, ArgumentMaxSimdNone); + + break; + } + + printf("Invalid SIMD Variant '%s' specified.", pArgv[argIndex + 1]); + return 1; + + } while (false); + + argIndex += 2; + argsRemaining -= 2; + } else { printf("Invalid Parameter '%s'. Aborting.", pArgv[argIndex]); @@ -580,22 +764,22 @@ int32_t main(const int32_t argc, char **pArgv) size_t encodedSize = 0; _RunCount = _EncodeRunCount; - for (size_t i = 0; i < MaxEncoderCount; i++) + for (size_t codecFuncIndex = 0; codecFuncIndex < MaxEncoderCount; codecFuncIndex++) { - if (_Codecs[codecId].encoders[i].name == nullptr) + if (_Codecs[codecId].encoders[codecFuncIndex].name == nullptr) break; - if (_OnlyRelevantCodecs && !_Codecs[codecId].encoders[i].candidateForFastest) + if (_OnlyRelevantCodecs && !_Codecs[codecId].encoders[codecFuncIndex].candidateForFastest) continue; - if (strstr(_Codecs[codecId].encoders[i].name, " avx2 ") != nullptr && !avx2Supported) + if (strstr(_Codecs[codecId].encoders[codecFuncIndex].name, " avx2 ") != nullptr && !avx2Supported) { - printf(" %-38s | | (Skipped; No AVX2 available)\n", _Codecs[codecId].encoders[i].name); + printf(" %-38s | | (Skipped; No AVX2 available)\n", _Codecs[codecId].encoders[codecFuncIndex].name); continue; } - else if (strstr(_Codecs[codecId].encoders[i].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported)) + else if (strstr(_Codecs[codecId].encoders[codecFuncIndex].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported)) { - printf(" %-38s | | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].encoders[i].name); + printf(" %-38s | | (Skipped, No AVX-512 F/DQ/BW available)\n", 
_Codecs[codecId].encoders[codecFuncIndex].name); continue; } @@ -604,7 +788,7 @@ int32_t main(const int32_t argc, char **pArgv) if (_RunCount > 1) { printf("\r (dry run)"); - encodedSize = _Codecs[codecId].encoders[i].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist); + encodedSize = _Codecs[codecId].encoders[codecFuncIndex].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist); } SleepNs(2500ULL * 1000 * 1000); @@ -613,7 +797,7 @@ int32_t main(const int32_t argc, char **pArgv) { const uint64_t startTick = GetCurrentTimeTicks(); const uint64_t startClock = __rdtsc(); - encodedSize = _Codecs[codecId].encoders[i].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist); + encodedSize = _Codecs[codecId].encoders[codecFuncIndex].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist); const uint64_t endClock = __rdtsc(); const uint64_t endTick = GetCurrentTimeTicks(); @@ -622,12 +806,12 @@ int32_t main(const int32_t argc, char **pArgv) _NsPerRun[run] = TicksToNs(endTick - startTick); _ClocksPerRun[run] = endClock - startClock; - printf("\r %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); + printf("\r %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[codecFuncIndex].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000)); } - printf("\r %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0); + printf("\r %-38s | %6.2f %% ", 
_Codecs[codecId].encoders[codecFuncIndex].name, encodedSize / (double)fileSize * 100.0); print_perf_info(fileSize); if (_Codecs[codecId].decoders[0].func != nullptr) @@ -635,33 +819,41 @@ int32_t main(const int32_t argc, char **pArgv) const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize); if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize)) + { puts("Failed to validate."); + + if (_IsTest) + return 1; + } } else { puts("Unable to validate, no decoder available."); + + if (_IsTest) + return 2; } } size_t decodedSize = 0; _RunCount = _DecodeRunCount; - for (size_t i = 0; i < MaxDecoderCount; i++) + for (size_t codecFuncIndex = 0; codecFuncIndex < MaxDecoderCount; codecFuncIndex++) { - if (_Codecs[codecId].decoders[i].name == nullptr) + if (_Codecs[codecId].decoders[codecFuncIndex].name == nullptr) break; - if (_OnlyRelevantCodecs && !_Codecs[codecId].decoders[i].candidateForFastest) - continue; + if (_OnlyRelevantCodecs && !_Codecs[codecId].decoders[codecFuncIndex].candidateForFastest) + continue; - if (strstr(_Codecs[codecId].decoders[i].name, " avx2 ") != nullptr && !avx2Supported) + if (strstr(_Codecs[codecId].decoders[codecFuncIndex].name, " avx2 ") != nullptr && !avx2Supported) { - printf(" %-38s | | (Skipped; No AVX2 available)\n", _Codecs[codecId].decoders[i].name); + printf(" %-38s | | (Skipped; No AVX2 available)\n", _Codecs[codecId].decoders[codecFuncIndex].name); continue; } - else if (strstr(_Codecs[codecId].decoders[i].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported)) + else if (strstr(_Codecs[codecId].decoders[codecFuncIndex].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported)) { - printf(" %-38s | | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].decoders[i].name); + printf(" %-38s | | (Skipped, No AVX-512 F/DQ/BW available)\n", 
_Codecs[codecId].decoders[codecFuncIndex].name); continue; } @@ -670,7 +862,7 @@ int32_t main(const int32_t argc, char **pArgv) if (_RunCount > 1) { printf("\r(dry run)"); - decodedSize = _Codecs[codecId].decoders[i].func(pCompressedData, encodedSize, pDecompressedData, fileSize); + decodedSize = _Codecs[codecId].decoders[codecFuncIndex].func(pCompressedData, encodedSize, pDecompressedData, fileSize); } SleepNs(2500ULL * 1000 * 1000); @@ -679,7 +871,7 @@ int32_t main(const int32_t argc, char **pArgv) { const uint64_t startTick = GetCurrentTimeTicks(); const uint64_t startClock = __rdtsc(); - decodedSize = _Codecs[codecId].decoders[i].func(pCompressedData, encodedSize, pDecompressedData, fileSize); + decodedSize = _Codecs[codecId].decoders[codecFuncIndex].func(pCompressedData, encodedSize, pDecompressedData, fileSize); const uint64_t endClock = __rdtsc(); const uint64_t endTick = GetCurrentTimeTicks(); @@ -688,16 +880,21 @@ int32_t main(const int32_t argc, char **pArgv) _NsPerRun[run] = TicksToNs(endTick - startTick); _ClocksPerRun[run] = endClock - startClock; - printf("\r %-38s | | decompressed to %" PRIu64 " bytes. (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].decoders[i].name, decodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); + printf("\r %-38s | | decompressed to %" PRIu64 " bytes. 
(%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].decoders[codecFuncIndex].name, decodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9)); SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000)); } - printf("\r %-38s | ", _Codecs[codecId].decoders[i].name); + printf("\r %-38s | ", _Codecs[codecId].decoders[codecFuncIndex].name); print_perf_info(fileSize); if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize)) + { puts("\nFailed to validate."); + + if (_IsTest) + return 1; + } } } From 290150d7b2a1d68f0e5a6154888fca2ba2d67d47 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:37:52 +0200 Subject: [PATCH 29/34] whoops, not executable again --- .github/workflows/clang.yml | 4 ++++ .github/workflows/gcc.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index 40c9951..f48b2ec 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -33,6 +33,10 @@ jobs: - uses: actions/download-artifact@v3 with: name: build-clang64 + + - name: Mark as Executable + working-directory: ${{env.GITHUB_WORKSPACE}} + run: chmod +x hsrans - name: Download Samples working-directory: ${{env.GITHUB_WORKSPACE}} diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 01d3de9..dd534c7 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -34,6 +34,10 @@ jobs: - uses: actions/download-artifact@v3 with: name: build-gcc64 + + - name: Mark as Executable + working-directory: ${{env.GITHUB_WORKSPACE}} + run: chmod +x hsrans - name: Download Samples working-directory: ${{env.GITHUB_WORKSPACE}} From 4674302c01baed1ac92717232496b0b530194ce5 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:40:17 +0200 Subject: [PATCH 30/34] yeah, that doesn't work, it's a different application... 
--- .github/workflows/clang.yml | 24 ++++++++++++------------ .github/workflows/gcc.yml | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index f48b2ec..f83692c 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -44,48 +44,48 @@ jobs: - name: Test sample0 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2 - name: Test sample0 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2 - name: Test sample0 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample0.bin --test --max-simd none - name: Test sample1 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2 - name: Test sample1 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2 - name: Test sample1 No-SIMD working-directory: 
${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample1.bin --test --max-simd none - name: Test sample2 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2 - name: Test sample2 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2 - name: Test sample2 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none \ No newline at end of file + run: ./hsrans rle_samples/sample2.bin --test --max-simd none \ No newline at end of file diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index dd534c7..1d8aa74 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -45,48 +45,48 @@ jobs: - name: Test sample0 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2 - name: Test sample0 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2 - name: 
Test sample0 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample0.bin --test --max-simd none - name: Test sample1 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2 - name: Test sample1 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2 - name: Test sample1 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample1.bin --test --max-simd none - name: Test sample2 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2 - name: Test sample2 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2 - name: Test sample2 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample2.bin --test 
--max-simd none From 14c4a70f463838cd422dc2a3b769d22eaa9138d5 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:48:01 +0200 Subject: [PATCH 31/34] duh... --- .github/workflows/msbuild.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml index 4b58ad4..6a0bb17 100644 --- a/.github/workflows/msbuild.yml +++ b/.github/workflows/msbuild.yml @@ -51,50 +51,50 @@ jobs: - name: Test sample0 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2 - name: Test sample0 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2 - name: Test sample0 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample0.bin --test --max-simd none - name: Test sample1 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2 - name: Test sample1 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd 
sse4.2 + run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2 - name: Test sample1 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample1.bin --test --max-simd none - name: Test sample2 AVX512BW working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2 - name: Test sample2 SSE4.2 working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2 + run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2 - name: Test sample2 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none + run: ./hsrans rle_samples/sample2.bin --test --max-simd none From ff42f64868f25636b79d77c5c1ed111f24261944 Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:51:24 +0200 Subject: [PATCH 32/34] excluding avx-512 bw, as that's not supported on the build servers (sometimes?) 
--- .github/workflows/clang.yml | 20 ++++++++++---------- .github/workflows/gcc.yml | 18 +++++++++--------- .github/workflows/msbuild.yml | 20 +++++++++----------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index f83692c..251f38a 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -42,9 +42,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: git clone https://github.com/rainerzufalldererste/rle_samples.git - - name: Test sample0 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw + #- name: Test sample0 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -58,9 +58,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample0.bin --test --max-simd none - - name: Test sample1 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw + #- name: Test sample1 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -74,9 +74,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample1.bin --test --max-simd none - - name: Test sample2 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw + #- name: Test sample2 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -88,4 +88,4 @@ jobs: - name: Test sample2 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} - run: 
./hsrans rle_samples/sample2.bin --test --max-simd none \ No newline at end of file + run: ./hsrans rle_samples/sample2.bin --test --max-simd none diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 1d8aa74..633f12b 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -43,9 +43,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: git clone https://github.com/rainerzufalldererste/rle_samples.git - - name: Test sample0 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw + #- name: Test sample0 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -59,9 +59,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample0.bin --test --max-simd none - - name: Test sample1 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw + #- name: Test sample1 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -75,9 +75,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample1.bin --test --max-simd none - - name: Test sample2 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw + #- name: Test sample2 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml index 6a0bb17..7aaeb68 100644 --- a/.github/workflows/msbuild.yml +++ b/.github/workflows/msbuild.yml @@ -49,9 
+49,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: git clone https://github.com/rainerzufalldererste/rle_samples.git - - name: Test sample0 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw + #- name: Test sample0 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -65,9 +65,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample0.bin --test --max-simd none - - name: Test sample1 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw + #- name: Test sample1 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -81,9 +81,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample1.bin --test --max-simd none - - name: Test sample2 AVX512BW - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw + #- name: Test sample2 AVX512BW + # working-directory: ${{env.GITHUB_WORKSPACE}} + # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -96,5 +96,3 @@ jobs: - name: Test sample2 No-SIMD working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample2.bin --test --max-simd none - - From 62a93571eb43d40c29703c4b558c74a5b3b83c7a Mon Sep 17 00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 17:57:58 +0200 Subject: [PATCH 33/34] Removing GCC tests because they're apparently just hanging??? 
adding native variant in case AVX-512 BW _is_ available --- .github/workflows/clang.yml | 18 ++--- .github/workflows/gcc.yml | 128 +++++++++++++++++----------------- .github/workflows/msbuild.yml | 18 ++--- 3 files changed, 82 insertions(+), 82 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index 251f38a..4bca8f3 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -42,9 +42,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: git clone https://github.com/rainerzufalldererste/rle_samples.git - #- name: Test sample0 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw + - name: Test sample0 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -58,9 +58,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample0.bin --test --max-simd none - #- name: Test sample1 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw + - name: Test sample1 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -74,9 +74,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample1.bin --test --max-simd none - #- name: Test sample2 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw + - name: Test sample2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 633f12b..29f1eba 100644 --- a/.github/workflows/gcc.yml +++
b/.github/workflows/gcc.yml @@ -26,67 +26,67 @@ jobs: name: build-gcc64 path: builds/bin/hsrans - test: - runs-on: ubuntu-latest - needs: build - - steps: - - uses: actions/download-artifact@v3 - with: - name: build-gcc64 - - - name: Mark as Executable - working-directory: ${{env.GITHUB_WORKSPACE}} - run: chmod +x hsrans - - - name: Download Samples - working-directory: ${{env.GITHUB_WORKSPACE}} - run: git clone https://github.com/rainerzufalldererste/rle_samples.git - - #- name: Test sample0 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw - - - name: Test sample0 AVX2 - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2 - - - name: Test sample0 SSE4.2 - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2 - - - name: Test sample0 No-SIMD - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample0.bin --test --max-simd none - - #- name: Test sample1 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw - - - name: Test sample1 AVX2 - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2 - - - name: Test sample1 SSE4.2 - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2 - - - name: Test sample1 No-SIMD - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample1.bin --test --max-simd none - - #- name: Test sample2 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw - - - name: Test sample2 AVX2 - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2 - - - name: Test sample2 SSE4.2 - working-directory: 
${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2 - - - name: Test sample2 No-SIMD - working-directory: ${{env.GITHUB_WORKSPACE}} - run: ./hsrans rle_samples/sample2.bin --test --max-simd none +# test: +# runs-on: ubuntu-latest +# needs: build +# +# steps: +# - uses: actions/download-artifact@v3 +# with: +# name: build-gcc64 +# +# - name: Mark as Executable +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: chmod +x hsrans +# +# - name: Download Samples +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: git clone https://github.com/rainerzufalldererste/rle_samples.git +# +# #- name: Test sample0 +# # working-directory: ${{env.GITHUB_WORKSPACE}} +# # run: ./hsrans rle_samples/sample0.bin --test +# +# - name: Test sample0 AVX2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2 +# +# - name: Test sample0 SSE4.2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2 +# +# - name: Test sample0 No-SIMD +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample0.bin --test --max-simd none +# +# #- name: Test sample1 +# # working-directory: ${{env.GITHUB_WORKSPACE}} +# # run: ./hsrans rle_samples/sample1.bin --test +# +# - name: Test sample1 AVX2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2 +# +# - name: Test sample1 SSE4.2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2 +# +# - name: Test sample1 No-SIMD +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample1.bin --test --max-simd none +# +# #- name: Test sample2 +# # working-directory: ${{env.GITHUB_WORKSPACE}} +# # run: ./hsrans rle_samples/sample2.bin --test +# +# - name: Test sample2 AVX2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans 
rle_samples/sample2.bin --test --max-simd avx2 +# +# - name: Test sample2 SSE4.2 +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2 +# +# - name: Test sample2 No-SIMD +# working-directory: ${{env.GITHUB_WORKSPACE}} +# run: ./hsrans rle_samples/sample2.bin --test --max-simd none diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml index 7aaeb68..3bd60fa 100644 --- a/.github/workflows/msbuild.yml +++ b/.github/workflows/msbuild.yml @@ -49,9 +49,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: git clone https://github.com/rainerzufalldererste/rle_samples.git - #- name: Test sample0 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw + - name: Test sample0 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample0.bin --test - name: Test sample0 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -65,9 +65,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample0.bin --test --max-simd none - #- name: Test sample1 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw + - name: Test sample1 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample1.bin --test - name: Test sample1 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} @@ -81,9 +81,9 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} run: ./hsrans rle_samples/sample1.bin --test --max-simd none - #- name: Test sample2 AVX512BW - # working-directory: ${{env.GITHUB_WORKSPACE}} - # run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw + - name: Test sample2 + working-directory: ${{env.GITHUB_WORKSPACE}} + run: ./hsrans rle_samples/sample2.bin --test - name: Test sample2 AVX2 working-directory: ${{env.GITHUB_WORKSPACE}} From a6a054f5f46e742722a14e2ebc94af3a00386b7c Mon Sep 17 
00:00:00 2001 From: Christoph Stiller Date: Sat, 8 Jul 2023 18:07:39 +0200 Subject: [PATCH 34/34] Center aligning checkmarks --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5fa70cb..29ebdcc 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes) | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput | -| -- | --: | --: | --: | --: | --: | +| -- | :-: | --: | --: | --: | --: | | **rANS32x64 16w 11 (raw)** | ✔️ | 64.48 % | 336.81 MiB/s | 1.42 clk/byte | 3,018.02 MiB/s | | **rANS32x64 16w 10 (raw)** | ✔️ | 65.97 % | 335.28 MiB/s | 1.42 clk/byte | 3,013.45 MiB/s | | **rANS32x64 16w 12 (raw)** | ✔️ | 63.83 % | 347.90 MiB/s | 1.42 clk/byte | 3,009.18 MiB/s | @@ -77,7 +77,7 @@ ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus) | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput | -| -- | --: | --: | --: | --: | --: | +| -- | :-: | --: | --: | --: | --: | | **rANS32x64 16w 11 (raw)** | ✔️ | 82.60 % | 311.60 MiB/s | 1.39 clk/byte | 3,079.98 MiB/s | | **rANS32x64 16w 12** | ✔️ | 80.17 % | 193.60 MiB/s | 1.41 clk/byte | 3,048.15 MiB/s | | **rANS32x64 16w 12 (raw)** | ✔️ | 82.57 % | 308.10 MiB/s | 1.41 clk/byte | 3,041.07 MiB/s | @@ -126,7 +126,7 @@ ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus) | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput | -| -- | --: | --: | --: | --: | --: | +| -- | :-: | --: | --: | --: | --: | | **rANS32x64 16w 11 (raw)** | ✔️ | 77.82 % | 309.39 MiB/s | 1.44 clk/byte | 2,978.20 MiB/s | | TurboANX 63 | ❌ | 70.1 % | 965.97 MiB/s | - | 2,959.13 MiB/s | | **rANS32x64 16w 12 (raw)** | ✔️ | 77.79 % | 308.29 MiB/s | 1.45 clk/byte | 2,946.52 MiB/s |