From 049b99bed605e80944dd4a7b0a45020c89da6726 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Mon, 3 Jul 2023 22:58:36 +0200
Subject: [PATCH 01/34] Investigating block sizes

---
 src/block_rANS32x32_16w.cpp | 299 ++++++++++++++++++++++++++++++++++++
 src/block_rANS32x32_16w.h   |  22 +++
 src/hist.cpp                |  35 +++--
 src/hist.h                  |   4 +
 src/main.cpp                | 113 +++++++++-----
 5 files changed, 420 insertions(+), 53 deletions(-)
 create mode 100644 src/block_rANS32x32_16w.cpp
 create mode 100644 src/block_rANS32x32_16w.h

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
new file mode 100644
index 0000000..76d80fb
--- /dev/null
+++ b/src/block_rANS32x32_16w.cpp
@@ -0,0 +1,299 @@
+#include "block_rANS32x32_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+#include <stdio.h>
+#include <inttypes.h>
+
+constexpr size_t StateCount = 32; // Needs to be a power of two.
+constexpr bool EncodeNoBranch = false; // true selects the encoder's `if constexpr (EncodeNoBranch)` word-emit path instead of the branching one.
+//constexpr bool DecodeNoBranch = false;
+constexpr size_t SafeHistBitMax = 0; // Encoders with TotalSymbolCountBits >= this bump every zero symbol count to 1 before normalizing.
+constexpr size_t MinBlockSize = 1 << 15; // Number of input bytes the encoder observes each time it reconsiders the histogram.
+
+template <size_t TotalSymbolCountBits>
+struct HistReplaceMul
+{
+  constexpr static size_t GetValue(); // Only declared here; the explicit specializations below provide the value.
+};
+// The encoder replaces its histogram once the summed absolute count difference reaches ((1 << TotalSymbolCountBits) * GetValue()) >> 10.
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 200; } };
+
+size_t block_rANS32x32_16w_capacity(const size_t inputSize)
+{
+  constexpr size_t histogramBytes = 256 * sizeof(uint16_t); // one serialized histogram (256 x u16)
+  const size_t fixedAndPayload = 2 * sizeof(uint64_t) + histogramBytes + inputSize + StateCount * sizeof(uint32_t); // two u64 header fields, one histogram, input-sized payload, one u32 per state
+  const size_t maxBlockCount = (inputSize + MinBlockSize) / MinBlockSize + 1; // rounded up generously on purpose
+
+  return fixedAndPayload + maxBlockCount * (sizeof(uint64_t) + histogramBytes); // every block may add a u64 size field plus a histogram
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Encodes `length` bytes of `pInData` back to front with StateCount interleaved 32-bit rANS states emitting 16-bit words, reconsidering the histogram every MinBlockSize bytes. Returns the bytes written to `pOutData`, or 0 if `length` is 0 or `outCapacity` is below block_rANS32x32_16w_capacity(length).
+template <uint32_t TotalSymbolCountBits>
+size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (length == 0 || outCapacity < block_rANS32x32_16w_capacity(length)) // an empty input would underflow `length - 1` below and read in front of pInData.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  // Emitted 16-bit words are written backwards from the end of pOutData (pStart walks down) and are moved behind the header once encoding is done.
+  uint32_t states[StateCount];
+  uint16_t *pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
+  uint16_t *pStart = pEnd;
+  uint16_t *pBlockBack = pStart;
+  size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
+  size_t blockLowCmp = blockLowI + StateCount;
+  // Statistics that only feed the diagnostic printf at the end; they do not influence the encoded output.
+  size_t histCount = 1;
+  size_t histPotentialCount = 1;
+  size_t histDiff = 0;
+  size_t histPotentialDiff = 0;
+  size_t histRejectedDiff = 0;
+  // The first histogram also observes one extra MinBlockSize window below the last block if there is room; blockLowCmp keeps the unshifted boundary.
+  if (blockLowI > MinBlockSize)
+    blockLowI -= MinBlockSize;
+
+  uint32_t symCount[256];
+  observe_hist(symCount, pInData + blockLowI, length - blockLowI);
+
+  if constexpr (IsSafeHist)
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] == 0)
+        symCount[j] = 1;
+
+  hist_t hist;
+  normalize_hist(&hist, symCount, length - blockLowI, TotalSymbolCountBits);
+
+  // Init States.
+  for (size_t i = 0; i < StateCount; i++)
+    states[i] = DecodeConsumePoint16;
+
+  const uint8_t idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
+  static_assert(sizeof(idx2idx) == StateCount); // idx2idx[j] is the input offset inside a StateCount-wide row that state j encodes.
+  // Round the position up to the next multiple of StateCount; the first pass below skips offsets that lie beyond the input.
+  int64_t i = length - 1;
+  i &= ~(size_t)(StateCount - 1);
+  i += StateCount;
+
+  for (int64_t j = StateCount - 1; j >= 0; j--)
+  {
+    const uint8_t index = idx2idx[j];
+
+    if (i - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    {
+      const uint8_t in = pInData[i - StateCount + index];
+      const uint32_t symbolCount = hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = states[stateIndex];
+
+      if constexpr (EncodeNoBranch)
+      {
+        const bool write = state >= max;
+        *pStart = (uint16_t)(state & 0xFFFF);
+        pStart -= (size_t)write; // move the write pointer (not the stored word) and only when a word was actually emitted.
+        state = write ? state >> 16 : state;
+      }
+      else
+      {
+        if (state >= max)
+        {
+          *pStart = (uint16_t)(state & 0xFFFF);
+          pStart--;
+          state >>= 16;
+        }
+      }
+
+      states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount);
+    }
+  }
+
+  i -= StateCount;
+  // Main loop: encode full StateCount-wide rows down to the current block boundary, then reconsider the histogram.
+  while (true)
+  {
+    for (; i >= (int64_t)blockLowCmp; i -= StateCount)
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
+
+        const size_t stateIndex = j;
+
+        uint32_t state = states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          const bool write = state >= max;
+          *pStart = (uint16_t)(state & 0xFFFF);
+          pStart -= (size_t)write; // move the write pointer (not the stored word) and only when a word was actually emitted.
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pStart = (uint16_t)(state & 0xFFFF);
+            pStart--;
+            state >>= 16;
+          }
+        }
+
+        states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount);
+      }
+    }
+
+    if (i == 0)
+      break;
+
+    // Potentially replace histogram.
+    {
+      blockLowI = i - MinBlockSize;
+      blockLowCmp = blockLowI + StateCount;
+
+      memset(symCount, 0, sizeof(symCount));
+      observe_hist(symCount, pInData + blockLowI, MinBlockSize);
+
+      bool mustReplaceHist = false;
+
+      if constexpr (!IsSafeHist)
+      {
+        for (size_t j = 0; j < 256; j++)
+        {
+          if (symCount[j] > 0 && hist.symbolCount[j] == 0)
+          {
+            mustReplaceHist = true;
+            normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits);
+            break;
+          }
+        }
+      }
+
+      if (!mustReplaceHist)
+      {
+        if constexpr (IsSafeHist)
+          for (size_t j = 0; j < 256; j++)
+            if (symCount[j] == 0)
+              symCount[j] = 1;
+
+        hist_t newHist;
+
+        if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize)
+        {
+          for (size_t j = 0; j < 256; j++)
+            newHist.symbolCount[j] = (uint16_t)symCount[j];
+        }
+        else
+        {
+          normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits);
+        }
+
+        size_t accumAbsDiff = 0;
+
+        for (size_t j = 0; j < 256; j++)
+          accumAbsDiff += (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]);
+
+        histPotentialCount++;
+        histPotentialDiff += accumAbsDiff;
+
+        constexpr size_t histReplacePoint = ((1 << TotalSymbolCountBits) * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 10;
+
+        if (accumAbsDiff >= histReplacePoint)
+        {
+          histDiff += accumAbsDiff;
+          mustReplaceHist = true;
+          hist = newHist;
+        }
+        else
+        {
+          histRejectedDiff += accumAbsDiff;
+        }
+      }
+
+      if (mustReplaceHist)
+      {
+        const uint64_t blockSize = pBlockBack - pStart; // 16-bit words emitted since the previous marker.
+
+        pStart++;
+        pStart -= 256;
+        memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount));
+        pStart -= sizeof(uint64_t) / sizeof(uint16_t); // pStart counts uint16_t elements: step back exactly the 8 bytes written next.
+        memcpy(pStart, &blockSize, sizeof(blockSize));
+
+        pStart--;
+        pBlockBack = pStart;
+
+        histCount++;
+      }
+    }
+  }
+  // Output layout: u64 input length, u64 total output size, 256 x u16 final histogram, StateCount x u32 final states, then the emitted stream.
+  uint8_t *pWrite = pOutData;
+  size_t outIndex = 0;
+
+  *reinterpret_cast<uint64_t *>(pWrite + outIndex) = (uint64_t)length;
+  outIndex += sizeof(uint64_t);
+
+  // compressed expected length.
+  outIndex += sizeof(uint64_t);
+
+  for (size_t j = 0; j < 256; j++)
+  {
+    *reinterpret_cast<uint16_t *>(pWrite + outIndex) = hist.symbolCount[j];
+    outIndex += sizeof(uint16_t);
+  }
+
+  for (size_t j = 0; j < StateCount; j++)
+  {
+    *reinterpret_cast<uint32_t *>(pWrite + outIndex) = states[j];
+    outIndex += sizeof(uint32_t);
+  }
+
+  const size_t size = (pEnd - pStart) * sizeof(uint16_t);
+
+  memmove(pWrite + outIndex, pStart + 1, size);
+  outIndex += size;
+
+  *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
+
+  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3fk, total: %5.3fk, rejected: %5.3fk\n", (uint64_t)histCount, (uint64_t)histPotentialCount, (length / 1024.0) / histCount, (histDiff / 1024.0) / histCount, (histPotentialDiff / 1024.0) / histPotentialCount, (histRejectedDiff / 1024.0) / (histPotentialCount - histCount));
+
+  return outIndex;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////
+// Exported encoders: one non-template wrapper per instantiated TotalSymbolCountBits value (15 down to 10).
+size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<14>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<13>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<12>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<11>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<10>(pInData, length, pOutData, outCapacity); }
+// Decoder placeholders: every argument is ignored and 0 is returned.
+size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
+size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
+size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
+size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
+size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
+size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
diff --git a/src/block_rANS32x32_16w.h b/src/block_rANS32x32_16w.h
new file mode 100644
index 0000000..47b6bb3
--- /dev/null
+++ b/src/block_rANS32x32_16w.h
@@ -0,0 +1,22 @@
+#ifndef block_rANS32x32_16w_h__
+#define block_rANS32x32_16w_h__
+
+#include "hist.h"
+
+size_t block_rANS32x32_16w_capacity(const size_t inputSize); // Output buffer size the encoders below require for `inputSize` input bytes.
+// Encoders: the numeric suffix is the TotalSymbolCountBits the encoder is instantiated with; each returns the number of bytes written to `pOutData`, or 0 if `outCapacity` is below block_rANS32x32_16w_capacity(length).
+size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+// Decoders: same suffix convention. NOTE(review): the definitions added alongside this header are stubs that return 0.
+size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+
+#endif // block_rANS32x32_16w_h__
diff --git a/src/hist.cpp b/src/hist.cpp
index 3aab176..4124ef7 100644
--- a/src/hist.cpp
+++ b/src/hist.cpp
@@ -4,21 +4,19 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits)
+void observe_hist(uint32_t hist[256], const uint8_t *pData, const size_t size) // Counts how often each byte value occurs in pData[0, size).
 {
-  uint32_t hist[256];
-  memset(hist, 0, sizeof(hist));
-
-  const uint32_t totalSymbolCount = ((uint32_t)1 << totalSymbolCountBits);
+  memset(hist, 0, sizeof(uint32_t) * 256); // `hist` is a pointer parameter here, so sizeof(hist) would not be the array size.
 
   for (size_t i = 0; i < size; i++)
     hist[pData[i]]++;
+}
 
-  uint32_t counter = 0;
-
-  for (size_t i = 0; i < 256; i++)
-    counter += hist[i];
+void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBytes, const size_t totalSymbolCountBits)
+{
+  const uint32_t totalSymbolCount = ((uint32_t)1 << totalSymbolCountBits);
 
+  size_t counter = dataBytes;
   uint16_t capped[256];
   size_t cappedSum = 0;
 
@@ -73,7 +71,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
   }
   else
   {
-    const uint32_t div = counter / totalSymbolCount;
+    const uint32_t div = (uint32_t)(counter / (size_t)totalSymbolCount);
 
     if (div)
     {
@@ -91,7 +89,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
     }
     else
     {
-      const uint32_t mul = totalSymbolCount / counter;
+      const uint32_t mul = (uint32_t)((size_t)totalSymbolCount / counter);
 
       for (size_t i = 0; i < 256; i++)
       {
@@ -109,13 +107,13 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
 
       while (true)
       {
-        size_t found = totalSymbolCount;
+        size_t found = totalSymbolCount + 1;
 
         for (size_t i = 0; i < 256; i++)
           if (capped[i] > target && capped[i] < found)
             found = capped[i];
 
-        if (found == totalSymbolCount)
+        if (found == totalSymbolCount + 1)
           break;
 
         for (size_t i = 0; i < 256; i++)
@@ -136,7 +134,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
 
     while (cappedSum < totalSymbolCount) // Start a charity.
     {
-      size_t target = totalSymbolCount;
+      size_t target = totalSymbolCount + 1;
 
       while (true)
       {
@@ -156,7 +154,7 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
             capped[i]++;
             cappedSum++;
 
-            if (cappedSum == totalSymbolCount)
+            if (cappedSum == totalSymbolCount + 1)
               goto hist_ready;
           }
         }
@@ -177,6 +175,13 @@ void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const siz
   }
 }
 
+void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits)
+{
+  uint32_t rawCounts[256]; // zeroed and filled by observe_hist
+  observe_hist(rawCounts, pData, size);
+  normalize_hist(pHist, rawCounts, size, totalSymbolCountBits); // `size` is the total of all raw counts
+}
+
 void make_enc_hist(hist_enc_t *pHistEnc, const hist_t *pHist)
 {
   for (size_t i = 0; i < 256; i++)
diff --git a/src/hist.h b/src/hist.h
index 1a1c449..78d7074 100644
--- a/src/hist.h
+++ b/src/hist.h
@@ -51,6 +51,10 @@ struct hist_dec_pack_t
 
 //////////////////////////////////////////////////////////////////////////
 
+void observe_hist(uint32_t hist[256], const uint8_t *pData, const size_t size); // Zeroes `hist`, then counts each byte value of pData[0, size) into it.
+// `hist` holds raw counts and `dataBytes` is taken as their total (it is not recomputed); the target total is 1 << `totalSymbolCountBits`.
+void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBytes, const size_t totalSymbolCountBits);
+
 // `totalSymbolCountBits` should be <= 15
 void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits);
 
diff --git a/src/main.cpp b/src/main.cpp
index 44d7e9a..15a9075 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,6 +10,7 @@
 #include "rANS32x32_16w.h"
 #include "rANS32x16_16w.h"
 #include "rANS32x64_16w.h"
+#include "block_rANS32x32_16w.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -48,6 +49,7 @@ static bool _OnlyRelevantCodecs = true;
 static size_t _HistMax = 15;
 static size_t _HistMin = 10;
 static bool _Include32Block = false;
+static bool _IncludeRaw = false;
 static size_t _RunCount = 8;
 
 constexpr size_t MaxRunCount = 256;
@@ -139,42 +141,55 @@ struct codec_info_t
   func_info_t<decodeFunc> decoders[MaxDecoderCount];
 };
 
+template <size_t (*EncodeFunc)(const uint8_t *, const size_t, uint8_t *, const size_t)> // Adapts a four-argument encoder to a signature with a trailing `const hist_t *`.
+size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity, const hist_t * /* unused */)
+{
+  return (*EncodeFunc)(pInData, length, pOutData, outCapacity); // The histogram argument is dropped; the wrapped encoder does not take one.
+}
+
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
-  { "rANS32x32 16w", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
-  { "rANS32x32 16w", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
-  { "rANS32x32 16w", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref gthr)", 
rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
-  { "rANS32x32 16w", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref gthr)", 
rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
-  { "rANS32x32 16w", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref gthr)", 
rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
-
-  { "rANS32x64 16w", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}},
-  { "rANS32x64 16w", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}},
-  { "rANS32x64 16w", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}},
-  { "rANS32x64 16w", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
-  { "rANS32x64 16w", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
-  { "rANS32x64 16w", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
-
-  { "rANS32x32 32blk 16w", 15, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_15, true }, {}}},
-  { "rANS32x32 32blk 16w", 14, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_14, true }, {}}},
-  { "rANS32x32 32blk 16w", 13, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_13, true }, {}}},
-  { "rANS32x32 32blk 16w", 12, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_12, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_12, true }, {}}},
-  { "rANS32x32 32blk 16w", 11, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_11, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_11, true }, {}}},
-  { "rANS32x32 32blk 16w", 10, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_10, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_10, true }, {}}},
-
-  { "rANS32x32 32blk 8w", 15, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_15, true }, {}}},
-  { "rANS32x32 32blk 8w", 14, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_14, true }, {}}},
-  { "rANS32x32 32blk 8w", 13, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_13, true }, {}}},
-  { "rANS32x32 32blk 8w", 12, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_12 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_12, true }, {}}},
-  { "rANS32x32 32blk 8w", 11, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_11 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_11, true }, {}}},
-  { "rANS32x32 32blk 8w", 10, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_10 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_10, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  
+  { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
+  { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
+  { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
+  { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
+  { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
+  { "rANS32x32 16w (raw)", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
+
+  { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}},
+  { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}},
+  { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}},
+  { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
+  { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
+  { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
   
-  { "rANS32x16 16w", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
-  { "rANS32x16 16w", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
-  { "rANS32x16 16w", 13, {{ "enc scalar", rANS32x16_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
-  { "rANS32x16 16w", 12, {{ "enc scalar", rANS32x16_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_12 }, {}}},
-  { "rANS32x16 16w", 11, {{ "enc scalar", rANS32x16_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_11 }, {}}},
-  { "rANS32x16 16w", 10, {{ "enc scalar", rANS32x16_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_10 }, {}}},
+  { "rANS32x16 16w (raw)", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
+  { "rANS32x16 16w (raw)", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
+  { "rANS32x16 16w (raw)", 13, {{ "enc scalar", rANS32x16_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
+  { "rANS32x16 16w (raw)", 12, {{ "enc scalar", rANS32x16_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_12 }, {}}},
+  { "rANS32x16 16w (raw)", 11, {{ "enc scalar", rANS32x16_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_11 }, {}}},
+  { "rANS32x16 16w (raw)", 10, {{ "enc scalar", rANS32x16_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varC_10 }, {}}},
+
+  { "rANS32x32 32blk 16w (raw)", 15, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_15, true }, {}}},
+  { "rANS32x32 32blk 16w (raw)", 14, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_14, true }, {}}},
+  { "rANS32x32 32blk 16w (raw)", 13, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_13, true }, {}}},
+  { "rANS32x32 32blk 16w (raw)", 12, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_12, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_12, true }, {}}},
+  { "rANS32x32 32blk 16w (raw)", 11, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_11, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_11, true }, {}}},
+  { "rANS32x32 32blk 16w (raw)", 10, {{ "enc scalar", rANS32x32_32blk_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_16w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_16w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varA2_10, }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_16w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_16w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_16w_decode_avx2_varC2_10, true }, {}}},
+
+  { "rANS32x32 32blk 8w (raw)", 15, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_15 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_15 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_15, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_15 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_15, true }, {}}},
+  { "rANS32x32 32blk 8w (raw)", 14, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_14 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_14 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_14, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_14 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_14, true }, {}}},
+  { "rANS32x32 32blk 8w (raw)", 13, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_13 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_13 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_13, true }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_13 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_13, true }, {}}},
+  { "rANS32x32 32blk 8w (raw)", 12, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_12 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_12 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_12 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_12 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_12 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_12 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_12, true }, {}}},
+  { "rANS32x32 32blk 8w (raw)", 11, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_11 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_11 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_11 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_11 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_11 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_11 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_11, true }, {}}},
+  { "rANS32x32 32blk 8w (raw)", 10, {{ "enc scalar", rANS32x32_32blk_8w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_32blk_8w_decode_scalar_10 }, { "dec avx2 (sym dep gthr)", rANS32x32_32blk_8w_decode_avx2_varA_10 }, { "dec avx2 (sym dep gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varA2_10 }, { "dec avx2 (sym idp gthr)", rANS32x32_32blk_8w_decode_avx2_varB_10 }, { "dec avx2 (sym idp gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varB2_10 }, { "dec avx2 (1x gthr)", rANS32x32_32blk_8w_decode_avx2_varC_10 }, { "dec avx2 (1x gthr 2x)", rANS32x32_32blk_8w_decode_avx2_varC2_10, true }, {}}},
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -183,6 +198,7 @@ const char ArgumentAllVariants[] = "--all";
 const char ArgumentHistMin[] = "--hist-min";
 const char ArgumentHistMax[] = "--hist-max";
 const char ArgumentInclude32Blk[] = "--include-32blk";
+const char ArgumentIncludeRaw[] = "--include-raw";
 const char ArgumentNoSleep[] = "--no-sleep";
 const char ArgumentCpuCore[] = "--cpu-core";
 const char ArgumentRuns[] = "--runs";
@@ -199,7 +215,8 @@ int32_t main(const int32_t argc, char **pArgv)
     printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMax);
     printf("\t%s \tRun all implementations of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants);
     printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore);
-    printf("\t%s \tInclude 32 block variants (which are generally quite slow)\n", ArgumentInclude32Blk);
+    printf("\t%s \tInclude RAW variants with only one histogram for the entire file\n", ArgumentIncludeRaw);
+    printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw);
     printf("\t%s <uint>\tRun the benchmark for a specified amount of times (default: 8)\n", ArgumentNoSleep);
     return 1;
   }
@@ -220,6 +237,12 @@ int32_t main(const int32_t argc, char **pArgv)
         argsRemaining--;
         _OnlyRelevantCodecs = false;
       }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeRaw, sizeof(ArgumentIncludeRaw)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _IncludeRaw = true;
+      }
       else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentInclude32Blk, sizeof(ArgumentInclude32Blk)) == 0)
       {
         argIndex++;
@@ -318,7 +341,13 @@ int32_t main(const int32_t argc, char **pArgv)
     pUncompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize);
     pDecompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize);
 
-    compressedDataCapacity = rans_max(rans_max(rans_max(rANS32x64_16w_capacity(fileSize), rANS32x16_16w_capacity(fileSize)), rANS32x32_16w_capacity(fileSize)), rans_max(rANS32x32_32blk_16w_capacity(fileSize), rANS32x32_32blk_8w_capacity(fileSize)));
+    compressedDataCapacity = rANS32x64_16w_capacity(fileSize);
+    compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x16_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize));
+
     pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity);
 
     if (pUncompressedData == nullptr || pDecompressedData == nullptr || pCompressedData == nullptr)
@@ -405,6 +434,7 @@ int32_t main(const int32_t argc, char **pArgv)
     make_hist(&hist, pUncompressedData, fileSize, _Codecs[codecId].totalSymbolCountBits);
     bool skipCodec = false;
 
+    skipCodec |= (!_IncludeRaw && strstr(_Codecs[codecId].name, " (raw)") != nullptr);
     skipCodec |= (!_Include32Block && strstr(_Codecs[codecId].name, " 32blk ") != nullptr);
     skipCodec |= _Codecs[codecId].totalSymbolCountBits > _HistMax;
     skipCodec |= _Codecs[codecId].totalSymbolCountBits < _HistMin;
@@ -466,10 +496,17 @@ int32_t main(const int32_t argc, char **pArgv)
       printf("\r  %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
       print_perf_info(fileSize);
 
-      const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
+      if (_Codecs[codecId].decoders[0].func != nullptr)
+      {
+        const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
 
-      if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize))
-        puts("Failed to validate.");
+        if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize))
+          puts("Failed to validate.");
+      }
+      else
+      {
+        puts("Unable to validate, no decoder available.");
+      }
     }
 
     size_t decodedSize = 0;

From ffb8e634048a25aa0143c11ed747e46a9f724bea Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Tue, 4 Jul 2023 00:29:26 +0200
Subject: [PATCH 02/34] making things worse-better

---
 src/block_rANS32x32_16w.cpp | 19 ++++++++++---------
 src/main.cpp                | 12 ++++++------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
index 76d80fb..cb34b82 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w.cpp
@@ -21,12 +21,12 @@ struct HistReplaceMul
   constexpr static size_t GetValue();
 };
 
-template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 200; } };
-template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 200; } };
-template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 200; } };
-template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 200; } };
-template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 200; } };
-template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 200; } };
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 52450; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 33915; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 16800; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 8140; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 3865; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 2898; } };
 
 size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 {
@@ -210,7 +210,10 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
         size_t accumAbsDiff = 0;
 
         for (size_t j = 0; j < 256; j++)
-          accumAbsDiff += (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]);
+        {
+          const size_t diff = (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]);
+          accumAbsDiff += diff * diff;
+        }
 
         histPotentialCount++;
         histPotentialDiff += accumAbsDiff;
@@ -280,8 +283,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
   return outIndex;
 }
 
-
-
 //////////////////////////////////////////////////////////////////////////
 
 size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); }
diff --git a/src/main.cpp b/src/main.cpp
index 15a9075..99c5ca1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8
 
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},

From 1bfd2df6b8c0986356eb975be4cbeba548bd0328 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Tue, 4 Jul 2023 07:18:18 +0200
Subject: [PATCH 03/34] block stuff

---
 src/block_rANS32x32_16w.cpp | 81 +++++++++++++++++++++++++++----------
 src/main.cpp                |  6 +--
 2 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
index cb34b82..5895fcd 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w.cpp
@@ -21,12 +21,12 @@ struct HistReplaceMul
   constexpr static size_t GetValue();
 };
 
-template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 52450; } };
-template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 33915; } };
-template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 16800; } };
-template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 8140; } };
-template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 3865; } };
-template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 2898; } };
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 110; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 110; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 110; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 110; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 110; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 90; } };
 
 size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 {
@@ -57,11 +57,14 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
   size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
   size_t blockLowCmp = blockLowI + StateCount;
 
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 10;
+
   size_t histCount = 1;
   size_t histPotentialCount = 1;
-  size_t histDiff = 0;
-  size_t histPotentialDiff = 0;
-  size_t histRejectedDiff = 0;
+  int64_t histDiff = 0;
+  int64_t histPotentialDiff = 0;
+  int64_t histRejectedDiff = 0;
 
   if (blockLowI > MinBlockSize)
     blockLowI -= MinBlockSize;
@@ -167,6 +170,8 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
     // Potentially replace histogram.
     {
+      histPotentialCount++;
+
       blockLowI = i - MinBlockSize;
       blockLowCmp = blockLowI + StateCount;
 
@@ -192,8 +197,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
       {
         if constexpr (IsSafeHist)
           for (size_t j = 0; j < 256; j++)
-            if (symCount[j] == 0)
-              symCount[j] = 1;
+            symCount[j]++;
 
         hist_t newHist;
 
@@ -201,34 +205,67 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
         {
           for (size_t j = 0; j < 256; j++)
             newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+          size_t counter = 0;
+
+          for (size_t j = 0; j < 256; j++)
+          {
+            newHist.cumul[j] = (uint16_t)counter;
+            counter += newHist.symbolCount[j];
+          }
         }
         else
         {
           normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits);
         }
 
-        size_t accumAbsDiff = 0;
+        double costBefore = 0;
+        double costAfter = 0;
 
-        for (size_t j = 0; j < 256; j++)
+        if constexpr (IsSafeHist)
         {
-          const size_t diff = (size_t)llabs(hist.symbolCount[j] - newHist.symbolCount[j]);
-          accumAbsDiff += diff * diff;
+          for (size_t j = 0; j < 256; j++)
+          {
+            if (symCount[j] == 0)
+              continue;
+
+            const double before = (symCount[j] - 1) * log2(hist.symbolCount[j] / (double)totalSymbolCount);
+            const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+
+            costBefore -= before;
+            costAfter -= after;
+          }
+        }
+        else
+        {
+          for (size_t j = 0; j < 256; j++)
+          {
+            if (symCount[j] == 0)
+              continue;
+
+            const double before = symCount[j] * log2(hist.symbolCount[j] / (double)totalSymbolCount);
+            const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+
+            costBefore -= before;
+            costAfter -= after;
+          }
         }
 
-        histPotentialCount++;
-        histPotentialDiff += accumAbsDiff;
+        const double accumDiff = costBefore - costAfter;
+        
+        //printf("Block %" PRIu64": %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", histPotentialCount, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), accumDiff * 100.0 / histReplacePoint, accumDiff >= histReplacePoint ? "Accepted" : "Rejected");
 
-        constexpr size_t histReplacePoint = ((1 << TotalSymbolCountBits) * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 10;
+        histPotentialDiff += (int64_t)accumDiff;
 
-        if (accumAbsDiff >= histReplacePoint)
+        if (accumDiff >= histReplacePoint)
         {
-          histDiff += accumAbsDiff;
+          histDiff += (int64_t)accumDiff;
           mustReplaceHist = true;
           hist = newHist;
         }
         else
         {
-          histRejectedDiff += accumAbsDiff;
+          histRejectedDiff += (int64_t)accumDiff;
         }
       }
 
@@ -278,7 +315,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
   *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
 
-  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3fk, total: %5.3fk, rejected: %5.3fk\n", histCount, histPotentialCount, (length / 1024.0) / histCount, (histDiff / 1024.0) / histCount, (histPotentialDiff / 1024.0) / histPotentialCount, (histRejectedDiff / 1024.0) / (histPotentialCount - histCount));
+  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3f, total: %5.3f, rejected: %5.3f\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount, (histDiff * 100.0 / histReplacePoint) / histCount, (histPotentialDiff * 100.0 / histReplacePoint) / histPotentialCount, (histRejectedDiff * 100.0 / histReplacePoint) / (histPotentialCount - histCount));
 
   return outIndex;
 }
diff --git a/src/main.cpp b/src/main.cpp
index 99c5ca1..72ecd2e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -488,12 +488,12 @@ int32_t main(const int32_t argc, char **pArgv)
         _NsPerRun[run] = TicksToNs(endTick - startTick);
         _ClocksPerRun[run] = endClock - startClock;
 
-        printf("\r  %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
+        printf("\r  %-38s | %7.3f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
 
         SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000));
       }
 
-      printf("\r  %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
+      printf("\r  %-38s | %7.3f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
       print_perf_info(fileSize);
 
       if (_Codecs[codecId].decoders[0].func != nullptr)
@@ -505,7 +505,7 @@ int32_t main(const int32_t argc, char **pArgv)
       }
       else
       {
-        puts("Unable to validate, no decoder available.");
+        //puts("Unable to validate, no decoder available.");
       }
     }
 

From c9d4a8531bba2324a2a4265430f0dbd6e2436cb8 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Tue, 4 Jul 2023 21:56:45 +0200
Subject: [PATCH 04/34] slow block pre-calc

---
 src/block_rANS32x32_16w.cpp | 280 +++++++++++++++++++++---------------
 1 file changed, 161 insertions(+), 119 deletions(-)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
index 5895fcd..93e7a9e 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w.cpp
@@ -21,12 +21,12 @@ struct HistReplaceMul
   constexpr static size_t GetValue();
 };
 
-template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 110; } };
-template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 110; } };
-template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 110; } };
-template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 110; } };
-template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 110; } };
-template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 90; } };
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } };
 
 size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 {
@@ -39,6 +39,97 @@ size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 
 //////////////////////////////////////////////////////////////////////////
 
+template <uint32_t TotalSymbolCountBits>
+static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
+{
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+
+  memset(symCount, 0, sizeof(uint32_t) * 256);
+  observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize);
+
+  // Do we include a symbol that hasn't been included before?
+  if constexpr (!IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0)
+        return false;
+  }
+
+  hist_t newHist;
+
+  if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize)
+  {
+    for (size_t j = 0; j < 256; j++)
+      newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+    size_t counter = 0;
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      newHist.cumul[j] = (uint16_t)counter;
+      counter += newHist.symbolCount[j];
+    }
+  }
+  else
+  {
+    if constexpr (IsSafeHist)
+    {
+      for (size_t j = 0; j < 256; j++)
+        symCount[j]++;
+
+      normalize_hist(&newHist, symCount, MinBlockSize + 256, TotalSymbolCountBits);
+    }
+    else
+    {
+      normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits);
+    }
+  }
+
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12;
+
+  // this comparison isn't fair or fast, but should be a good starting point hopefully.
+  double costBefore = 0;
+  double costAfter = 0;
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const double before = (symCount[j] - 1) * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount);
+      const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const double before = symCount[j] * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount);
+      const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+
+  const double diff = costBefore - costAfter;
+
+  //printf("[%8" PRIX64 " ~ %8" PRIX64 "]  %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", nextBlockStartOffset, nextBlockStartOffset + nextBlockSize, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), diff * 100.0 / histReplacePoint, diff >= histReplacePoint ? "Accepted" : "Rejected");
+
+  return (diff < histReplacePoint);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
 template <uint32_t TotalSymbolCountBits>
 size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
 {
@@ -53,24 +144,20 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
   uint32_t states[StateCount];
   uint16_t *pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
   uint16_t *pStart = pEnd;
-  uint16_t *pBlockBack = pStart;
+  
   size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
-  size_t blockLowCmp = blockLowI + StateCount;
 
-  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
-  constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 10;
+  if (blockLowI > MinBlockSize)
+    blockLowI -= MinBlockSize;
+
+  size_t blockLowCmp = blockLowI + StateCount;
+  size_t blockBackPoint = length;
 
   size_t histCount = 1;
   size_t histPotentialCount = 1;
-  int64_t histDiff = 0;
-  int64_t histPotentialDiff = 0;
-  int64_t histRejectedDiff = 0;
-
-  if (blockLowI > MinBlockSize)
-    blockLowI -= MinBlockSize;
 
   uint32_t symCount[256];
-  observe_hist(symCount, pInData + blockLowI, length - blockLowI);
+  observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
 
   if constexpr (IsSafeHist)
     for (size_t j = 0; j < 256; j++)
@@ -78,7 +165,28 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
         symCount[j] = 1;
 
   hist_t hist;
-  normalize_hist(&hist, symCount, length - blockLowI, TotalSymbolCountBits);
+  normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
+
+  while (blockLowI > 0)
+  {
+    histPotentialCount++;
+
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount))
+    {
+      blockLowI -= MinBlockSize;
+      blockLowCmp -= MinBlockSize;
+    }
+    else
+    {
+      break;
+    }
+  }
+
+  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+  observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
+  normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
+  //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]\n", blockLowI, blockBackPoint);
+  blockBackPoint = blockLowI;
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
@@ -164,126 +272,60 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
         states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount);
       }
     }
+    
+    // Write hist.
+    {
+      const uint64_t blockSize = blockBackPoint - blockLowI;
+
+      pStart++;
+      pStart -= 256;
+      memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount));
+
+      pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(pStart, &blockSize, sizeof(blockSize));
+
+      pStart--;
+
+      histCount++;
+    }
 
     if (i == 0)
       break;
 
     // Potentially replace histogram.
     {
-      histPotentialCount++;
-
-      blockLowI = i - MinBlockSize;
-      blockLowCmp = blockLowI + StateCount;
+      blockLowI -= MinBlockSize;
+      blockLowCmp -= MinBlockSize;
 
-      memset(symCount, 0, sizeof(symCount));
       observe_hist(symCount, pInData + blockLowI, MinBlockSize);
 
-      bool mustReplaceHist = false;
-
-      if constexpr (!IsSafeHist)
-      {
+      if constexpr (IsSafeHist)
         for (size_t j = 0; j < 256; j++)
-        {
-          if (symCount[j] > 0 && hist.symbolCount[j] == 0)
-          {
-            mustReplaceHist = true;
-            normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits);
-            break;
-          }
-        }
-      }
-
-      if (!mustReplaceHist)
-      {
-        if constexpr (IsSafeHist)
-          for (size_t j = 0; j < 256; j++)
-            symCount[j]++;
-
-        hist_t newHist;
-
-        if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize)
-        {
-          for (size_t j = 0; j < 256; j++)
-            newHist.symbolCount[j] = (uint16_t)symCount[j];
-
-          size_t counter = 0;
-
-          for (size_t j = 0; j < 256; j++)
-          {
-            newHist.cumul[j] = (uint16_t)counter;
-            counter += newHist.symbolCount[j];
-          }
-        }
-        else
-        {
-          normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits);
-        }
-
-        double costBefore = 0;
-        double costAfter = 0;
-
-        if constexpr (IsSafeHist)
-        {
-          for (size_t j = 0; j < 256; j++)
-          {
-            if (symCount[j] == 0)
-              continue;
-
-            const double before = (symCount[j] - 1) * log2(hist.symbolCount[j] / (double)totalSymbolCount);
-            const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
-
-            costBefore -= before;
-            costAfter -= after;
-          }
-        }
-        else
-        {
-          for (size_t j = 0; j < 256; j++)
-          {
-            if (symCount[j] == 0)
-              continue;
-
-            const double before = symCount[j] * log2(hist.symbolCount[j] / (double)totalSymbolCount);
-            const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+          if (symCount[j] == 0)
+            symCount[j] = 1;
 
-            costBefore -= before;
-            costAfter -= after;
-          }
-        }
+      normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits);
 
-        const double accumDiff = costBefore - costAfter;
-        
-        //printf("Block %" PRIu64": %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", histPotentialCount, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), accumDiff * 100.0 / histReplacePoint, accumDiff >= histReplacePoint ? "Accepted" : "Rejected");
-
-        histPotentialDiff += (int64_t)accumDiff;
+      while (blockLowI > 0)
+      {
+        histPotentialCount++;
 
-        if (accumDiff >= histReplacePoint)
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount))
         {
-          histDiff += (int64_t)accumDiff;
-          mustReplaceHist = true;
-          hist = newHist;
+          blockLowI -= MinBlockSize;
+          blockLowCmp -= MinBlockSize;
         }
         else
         {
-          histRejectedDiff += (int64_t)accumDiff;
+          break;
         }
       }
 
-      if (mustReplaceHist)
-      {
-        const uint64_t blockSize = pBlockBack - pStart;
-
-        pStart++;
-        pStart -= 256;
-        memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount));
-        pStart -= sizeof(uint64_t);
-        memcpy(pStart, &blockSize, sizeof(blockSize));
-
-        pStart--;
-        pBlockBack = pStart;
-
-        histCount++;
-      }
+      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+      observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
+      normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
+      //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]: i: %" PRIX64 "\n", blockLowI, blockBackPoint, i);
+      blockBackPoint = blockLowI;
     }
   }
 
@@ -315,7 +357,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
   *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
 
-  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB. avg diff selected: %5.3f, total: %5.3f, rejected: %5.3f\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount, (histDiff * 100.0 / histReplacePoint) / histCount, (histPotentialDiff * 100.0 / histReplacePoint) / histPotentialCount, (histRejectedDiff * 100.0 / histReplacePoint) / (histPotentialCount - histCount));
+  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB.\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount);
 
   return outIndex;
 }

From 79fc1db9a70a817eb1643ef0f9760a75af39d32f Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Tue, 4 Jul 2023 21:56:53 +0200
Subject: [PATCH 05/34] cleanup

---
 src/block_rANS32x32_16w.cpp | 25 ++++++++-----------------
 src/main.cpp                | 18 +++++++++---------
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
index 93e7a9e..8a2d0c0 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w.cpp
@@ -6,9 +6,6 @@
 #include <string.h>
 #include <math.h>
 
-#include <stdio.h>
-#include <inttypes.h>
-
 constexpr size_t StateCount = 32; // Needs to be a power of two.
 constexpr bool EncodeNoBranch = false;
 //constexpr bool DecodeNoBranch = false;
@@ -86,11 +83,11 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
   }
 
   constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
-  constexpr int64_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12;
+  constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12;
 
   // this comparison isn't fair or fast, but should be a good starting point hopefully.
-  double costBefore = 0;
-  double costAfter = 0;
+  float costBefore = 0;
+  float costAfter = 0;
 
   if constexpr (IsSafeHist)
   {
@@ -99,8 +96,8 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
       if (symCount[j] == 0)
         continue;
 
-      const double before = (symCount[j] - 1) * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount);
-      const double after = (symCount[j] - 1) * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+      const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = (symCount[j] - 1) * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
 
       costBefore -= before;
       costAfter -= after;
@@ -113,17 +110,15 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
       if (symCount[j] == 0)
         continue;
 
-      const double before = symCount[j] * log2(pOldHist->symbolCount[j] / (double)totalSymbolCount);
-      const double after = symCount[j] * log2(newHist.symbolCount[j] / (double)totalSymbolCount);
+      const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
 
       costBefore -= before;
       costAfter -= after;
     }
   }
 
-  const double diff = costBefore - costAfter;
-
-  //printf("[%8" PRIX64 " ~ %8" PRIX64 "]  %7.5f before, %7.5f after => %7.5f diff: (%5.3f %% => %s)\n", nextBlockStartOffset, nextBlockStartOffset + nextBlockSize, costBefore / (double)(MinBlockSize * TotalSymbolCountBits), costAfter / (double)(MinBlockSize * TotalSymbolCountBits), (costAfter - costBefore) / (double)(MinBlockSize * TotalSymbolCountBits), diff * 100.0 / histReplacePoint, diff >= histReplacePoint ? "Accepted" : "Rejected");
+  const float diff = costBefore - costAfter;
 
   return (diff < histReplacePoint);
 }
@@ -185,7 +180,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
   // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
   observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
   normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
-  //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]\n", blockLowI, blockBackPoint);
   blockBackPoint = blockLowI;
 
   // Init States.
@@ -324,7 +318,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
       // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
       observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
       normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
-      //printf(">> USING HIST FOR [%8" PRIX64 " ~ %" PRIX64 "]: i: %" PRIX64 "\n", blockLowI, blockBackPoint, i);
       blockBackPoint = blockLowI;
     }
   }
@@ -357,8 +350,6 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
   *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
 
-  printf("\t>>>>> %" PRIu64 " / %" PRIu64 " (%5.3f %%) histograms used. approx block size: %6.3f KiB.\n", histCount, histPotentialCount, histCount * 100.0 / histPotentialCount, (length / 1024.0) / histCount);
-
   return outIndex;
 }
 
diff --git a/src/main.cpp b/src/main.cpp
index 72ecd2e..15a9075 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8
 
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{}, { "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
@@ -488,12 +488,12 @@ int32_t main(const int32_t argc, char **pArgv)
         _NsPerRun[run] = TicksToNs(endTick - startTick);
         _ClocksPerRun[run] = endClock - startClock;
 
-        printf("\r  %-38s | %7.3f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
+        printf("\r  %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
 
         SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000));
       }
 
-      printf("\r  %-38s | %7.3f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
+      printf("\r  %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
       print_perf_info(fileSize);
 
       if (_Codecs[codecId].decoders[0].func != nullptr)
@@ -505,7 +505,7 @@ int32_t main(const int32_t argc, char **pArgv)
       }
       else
       {
-        //puts("Unable to validate, no decoder available.");
+        puts("Unable to validate, no decoder available.");
       }
     }
 

From b6d0dec9f8c838b9986b5ac05af614ec3f60d5cb Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 02:36:32 +0200
Subject: [PATCH 06/34] Further cleanup

---
 src/block_rANS32x32_16w.cpp | 239 +++++++++++++++++-------------------
 1 file changed, 111 insertions(+), 128 deletions(-)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w.cpp
index 8a2d0c0..07be88b 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w.cpp
@@ -10,7 +10,8 @@ constexpr size_t StateCount = 32; // Needs to be a power of two.
 constexpr bool EncodeNoBranch = false;
 //constexpr bool DecodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
-constexpr size_t MinBlockSize = 1 << 15;
+constexpr size_t MinBlockSizeBits = 15;
+constexpr size_t MinBlockSize = 1 << MinBlockSizeBits;
 
 template <size_t TotalSymbolCountBits>
 struct HistReplaceMul
@@ -31,11 +32,16 @@ size_t block_rANS32x32_16w_capacity(const size_t inputSize)
   const size_t blockCount = (inputSize + MinBlockSize) / MinBlockSize + 1;
   const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t);
 
-  return baseSize + blockCount * perBlockExtraSize; // i hope this covers all of our bases.
+  return baseSize + blockCount * perBlockExtraSize; // I hope this covers all of our bases.
 }
 
 //////////////////////////////////////////////////////////////////////////
 
+static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
+static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+
 template <uint32_t TotalSymbolCountBits>
 static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
 {
@@ -54,7 +60,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
 
   hist_t newHist;
 
-  if constexpr (!IsSafeHist && (1 << TotalSymbolCountBits) == MinBlockSize)
+  if constexpr (!IsSafeHist && TotalSymbolCountBits == MinBlockSizeBits)
   {
     for (size_t j = 0; j < 256; j++)
       newHist.symbolCount[j] = (uint16_t)symCount[j];
@@ -125,6 +131,56 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
 
 //////////////////////////////////////////////////////////////////////////
 
+struct _rans_encode_state_t
+{
+  uint32_t states[StateCount];
+  hist_t hist;
+  uint16_t *pEnd, *pStart; // both compressed.
+};
+
+template <uint32_t TotalSymbolCountBits>
+static void rans32x32_16w_encode_internal_scalar(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+{
+  int64_t targetCmp = targetIndex + StateCount;
+
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+  {
+    for (int64_t j = StateCount - 1; j >= 0; j--)
+    {
+      const uint8_t index = _Rans32x32_idx2idx[j];
+
+      const uint8_t in = pInData[i - StateCount + index];
+      const uint32_t symbolCount = pState->hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = pState->states[stateIndex];
+
+      if constexpr (EncodeNoBranch)
+      {
+        const bool write = state >= max;
+        *pState->pStart = (uint16_t)(state & 0xFFFF);
+        *pState->pStart -= (size_t)write;
+        state = write ? state >> 16 : state;
+      }
+      else
+      {
+        if (state >= max)
+        {
+          *pState->pStart = (uint16_t)(state & 0xFFFF);
+          pState->pStart--;
+          state >>= 16;
+        }
+      }
+
+      pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+    }
+  }
+}
+
 template <uint32_t TotalSymbolCountBits>
 size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
 {
@@ -136,189 +192,122 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
   constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
 
-  uint32_t states[StateCount];
-  uint16_t *pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
-  uint16_t *pStart = pEnd;
+  _rans_encode_state_t encodeState;
+  encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
+  encodeState.pStart = encodeState.pEnd;
   
-  size_t blockLowI = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
 
-  if (blockLowI > MinBlockSize)
-    blockLowI -= MinBlockSize;
+  if (inputBlockTargetIndex > MinBlockSize)
+    inputBlockTargetIndex -= MinBlockSize;
 
-  size_t blockLowCmp = blockLowI + StateCount;
   size_t blockBackPoint = length;
 
-  size_t histCount = 1;
-  size_t histPotentialCount = 1;
-
   uint32_t symCount[256];
-  observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
   if constexpr (IsSafeHist)
     for (size_t j = 0; j < 256; j++)
       if (symCount[j] == 0)
         symCount[j] = 1;
 
-  hist_t hist;
-  normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
 
-  while (blockLowI > 0)
+  while (inputBlockTargetIndex > 0)
   {
-    histPotentialCount++;
-
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount))
-    {
-      blockLowI -= MinBlockSize;
-      blockLowCmp -= MinBlockSize;
-    }
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSize;
     else
-    {
       break;
-    }
   }
 
   // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-  observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
-  normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
-  blockBackPoint = blockLowI;
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  blockBackPoint = inputBlockTargetIndex;
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
-    states[i] = DecodeConsumePoint16;
-
-  const uint8_t idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
-  static_assert(sizeof(idx2idx) == StateCount);
+    encodeState.states[i] = DecodeConsumePoint16;
 
-  int64_t i = length - 1;
-  i &= ~(size_t)(StateCount - 1);
-  i += StateCount;
+  int64_t inputIndex = length - 1;
+  inputIndex &= ~(size_t)(StateCount - 1);
+  inputIndex += StateCount;
 
   for (int64_t j = StateCount - 1; j >= 0; j--)
   {
-    const uint8_t index = idx2idx[j];
+    const uint8_t index = _Rans32x32_idx2idx[j];
 
-    if (i - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
     {
-      const uint8_t in = pInData[i - StateCount + index];
-      const uint32_t symbolCount = hist.symbolCount[in];
+      const uint8_t in = pInData[inputIndex - StateCount + index];
+      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
       const uint32_t max = EncodeEmitPoint * symbolCount;
 
       const size_t stateIndex = j;
 
-      uint32_t state = states[stateIndex];
+      uint32_t state = encodeState.states[stateIndex];
 
-      if constexpr (EncodeNoBranch)
-      {
-        const bool write = state >= max;
-        *pStart = (uint16_t)(state & 0xFFFF);
-        *pStart -= (size_t)write;
-        state = write ? state >> 16 : state;
-      }
-      else
+      if (state >= max)
       {
-        if (state >= max)
-        {
-          *pStart = (uint16_t)(state & 0xFFFF);
-          pStart--;
-          state >>= 16;
-        }
+        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+        encodeState.pStart--;
+        state >>= 16;
       }
 
-      states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount);
+      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
     }
   }
 
-  i -= StateCount;
+  inputIndex -= StateCount;
 
   while (true)
   {
-    for (; i >= (int64_t)blockLowCmp; i -= StateCount)
-    {
-      for (int64_t j = StateCount - 1; j >= 0; j--)
-      {
-        const uint8_t index = idx2idx[j];
+    rans32x32_16w_encode_internal_scalar<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    inputIndex = inputBlockTargetIndex;
 
-        const uint8_t in = pInData[i - StateCount + index];
-        const uint32_t symbolCount = hist.symbolCount[in];
-        const uint32_t max = EncodeEmitPoint * symbolCount;
-
-        const size_t stateIndex = j;
-
-        uint32_t state = states[stateIndex];
-
-        if constexpr (EncodeNoBranch)
-        {
-          const bool write = state >= max;
-          *pStart = (uint16_t)(state & 0xFFFF);
-          *pStart -= (size_t)write;
-          state = write ? state >> 16 : state;
-        }
-        else
-        {
-          if (state >= max)
-          {
-            *pStart = (uint16_t)(state & 0xFFFF);
-            pStart--;
-            state >>= 16;
-          }
-        }
-
-        states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)hist.cumul[in] + (state % symbolCount);
-      }
-    }
-    
     // Write hist.
     {
-      const uint64_t blockSize = blockBackPoint - blockLowI;
-
-      pStart++;
-      pStart -= 256;
-      memcpy(pStart, hist.symbolCount, sizeof(hist.symbolCount));
+      const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
 
-      pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(pStart, &blockSize, sizeof(blockSize));
+      encodeState.pStart++;
+      encodeState.pStart -= 256;
+      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
 
-      pStart--;
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
 
-      histCount++;
+      encodeState.pStart--;
     }
 
-    if (i == 0)
+    if (inputIndex == 0)
       break;
 
-    // Potentially replace histogram.
+    // Determine new histogram.
     {
-      blockLowI -= MinBlockSize;
-      blockLowCmp -= MinBlockSize;
+      inputBlockTargetIndex -= MinBlockSize;
 
-      observe_hist(symCount, pInData + blockLowI, MinBlockSize);
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSize);
 
       if constexpr (IsSafeHist)
         for (size_t j = 0; j < 256; j++)
           if (symCount[j] == 0)
             symCount[j] = 1;
 
-      normalize_hist(&hist, symCount, MinBlockSize, TotalSymbolCountBits);
+      normalize_hist(&encodeState.hist, symCount, MinBlockSize, TotalSymbolCountBits);
 
-      while (blockLowI > 0)
+      while (inputBlockTargetIndex > 0)
       {
-        histPotentialCount++;
-
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, blockLowI - MinBlockSize, MinBlockSize, &hist, symCount))
-        {
-          blockLowI -= MinBlockSize;
-          blockLowCmp -= MinBlockSize;
-        }
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSize;
         else
-        {
           break;
-        }
       }
 
       // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-      observe_hist(symCount, pInData + blockLowI, blockBackPoint - blockLowI);
-      normalize_hist(&hist, symCount, blockBackPoint - blockLowI, TotalSymbolCountBits);
-      blockBackPoint = blockLowI;
+      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+      blockBackPoint = inputBlockTargetIndex;
     }
   }
 
@@ -331,21 +320,15 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
   // compressed expected length.
   outIndex += sizeof(uint64_t);
 
-  for (size_t j = 0; j < 256; j++)
-  {
-    *reinterpret_cast<uint16_t *>(pWrite + outIndex) = hist.symbolCount[j];
-    outIndex += sizeof(uint16_t);
-  }
-
   for (size_t j = 0; j < StateCount; j++)
   {
-    *reinterpret_cast<uint32_t *>(pWrite + outIndex) = states[j];
+    *reinterpret_cast<uint32_t *>(pWrite + outIndex) = encodeState.states[j];
     outIndex += sizeof(uint32_t);
   }
 
-  const size_t size = (pEnd - pStart) * sizeof(uint16_t);
+  const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t);
 
-  memmove(pWrite + outIndex, pStart + 1, size);
+  memmove(pWrite + outIndex, encodeState.pStart + 1, size);
   outIndex += size;
 
   *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.

From 4c5006184b0c118beb441b9bfe92f8e06b38f35a Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 02:46:46 +0200
Subject: [PATCH 07/34] Even more cleanup

---
 ...16w.cpp => block_rANS32x32_16w_encode.cpp} | 137 ++++++++++--------
 1 file changed, 77 insertions(+), 60 deletions(-)
 rename src/{block_rANS32x32_16w.cpp => block_rANS32x32_16w_encode.cpp} (81%)

diff --git a/src/block_rANS32x32_16w.cpp b/src/block_rANS32x32_16w_encode.cpp
similarity index 81%
rename from src/block_rANS32x32_16w.cpp
rename to src/block_rANS32x32_16w_encode.cpp
index 07be88b..62cda4f 100644
--- a/src/block_rANS32x32_16w.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -8,7 +8,6 @@
 
 constexpr size_t StateCount = 32; // Needs to be a power of two.
 constexpr bool EncodeNoBranch = false;
-//constexpr bool DecodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
 constexpr size_t MinBlockSizeBits = 15;
 constexpr size_t MinBlockSize = 1 << MinBlockSizeBits;
@@ -42,6 +41,74 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
 
 //////////////////////////////////////////////////////////////////////////
 
+struct _rans_encode_state_t
+{
+  uint32_t states[StateCount];
+  hist_t hist;
+  uint16_t *pEnd, *pStart; // both compressed.
+};
+
+enum rans32x32_encoder_type_t
+{
+  r32x32_et_scalar,
+};
+
+template <rans32x32_encoder_type_t type>
+struct rans32x32_16w_encoder
+{
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+};
+
+template <>
+struct rans32x32_16w_encoder<r32x32_et_scalar>
+{
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  {
+    int64_t targetCmp = targetIndex + StateCount;
+
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = _Rans32x32_idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF);
+          *pState->pStart -= (size_t)write;
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
 template <uint32_t TotalSymbolCountBits>
 static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
 {
@@ -131,58 +198,8 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
 
 //////////////////////////////////////////////////////////////////////////
 
-struct _rans_encode_state_t
-{
-  uint32_t states[StateCount];
-  hist_t hist;
-  uint16_t *pEnd, *pStart; // both compressed.
-};
-
-template <uint32_t TotalSymbolCountBits>
-static void rans32x32_16w_encode_internal_scalar(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
-{
-  int64_t targetCmp = targetIndex + StateCount;
-
-  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
-
-  for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
-  {
-    for (int64_t j = StateCount - 1; j >= 0; j--)
-    {
-      const uint8_t index = _Rans32x32_idx2idx[j];
-
-      const uint8_t in = pInData[i - StateCount + index];
-      const uint32_t symbolCount = pState->hist.symbolCount[in];
-      const uint32_t max = EncodeEmitPoint * symbolCount;
-
-      const size_t stateIndex = j;
-
-      uint32_t state = pState->states[stateIndex];
-
-      if constexpr (EncodeNoBranch)
-      {
-        const bool write = state >= max;
-        *pState->pStart = (uint16_t)(state & 0xFFFF);
-        *pState->pStart -= (size_t)write;
-        state = write ? state >> 16 : state;
-      }
-      else
-      {
-        if (state >= max)
-        {
-          *pState->pStart = (uint16_t)(state & 0xFFFF);
-          pState->pStart--;
-          state >>= 16;
-        }
-      }
-
-      pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
-    }
-  }
-}
-
-template <uint32_t TotalSymbolCountBits>
-size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+template <uint32_t TotalSymbolCountBits, rans32x32_encoder_type_t Impl>
+size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
 {
   if (outCapacity < block_rANS32x32_16w_capacity(length))
     return 0;
@@ -263,7 +280,7 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
   while (true)
   {
-    rans32x32_16w_encode_internal_scalar<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
     inputIndex = inputBlockTargetIndex;
 
     // Write hist.
@@ -338,12 +355,12 @@ size_t block_rANS32x32_16w_encode_scalar(const uint8_t *pInData, const size_t le
 
 //////////////////////////////////////////////////////////////////////////
 
-size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<15>(pInData, length, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<14>(pInData, length, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<13>(pInData, length, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<12>(pInData, length, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<11>(pInData, length, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode_scalar<10>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<15, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<14, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<13, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
 
 size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
 size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }

From 0b60d3210d125979d7145c5160f722e288ba8cd8 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 07:16:09 +0200
Subject: [PATCH 08/34] End not working yet, but generally getting close

---
 src/block_rANS32x32_16w_decode.cpp | 215 +++++++++++++++++++++++++++++
 src/block_rANS32x32_16w_encode.cpp |  11 +-
 2 files changed, 217 insertions(+), 9 deletions(-)
 create mode 100644 src/block_rANS32x32_16w_decode.cpp

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
new file mode 100644
index 0000000..fe2eef1
--- /dev/null
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -0,0 +1,215 @@
+#include "block_rANS32x32_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t StateCount = 32; // Needs to be a power of two.
+constexpr bool DecodeNoBranch = false; // When true, the decoders in this file take the `if constexpr` branch-free refill path instead of the plain `if`.
+
+//////////////////////////////////////////////////////////////////////////
+
+static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; // Entry j is the byte offset, within each group of StateCount output bytes, that state j writes to.
+static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+
+template <typename hist_type>
+struct _rans_decode_state_t
+{
+  uint32_t states[StateCount];
+  hist_type hist;
+};
+
+enum rans32x32_decoder_type_t
+{
+  r32x32_dt_scalar,
+};
+
+template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
+struct rans32x32_16w_decoder
+{
+  static const uint16_t *decode_section(_rans_decode_state_t<hist_type> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
+};
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
+{
+  static const uint16_t *decode_section(_rans_decode_state_t<hist_dec_t<TotalSymbolCountBits>> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+    size_t i = startIndex; // Scalar decoder: walks the output from startIndex towards endIndex, one group of StateCount bytes per outer iteration.
+
+    for (; i < endIndex; i += StateCount) // Every iteration writes a full group, so the caller must guarantee StateCount writable bytes at the last i below endIndex.
+    {
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        const uint8_t index = _Rans32x32_idx2idx[j]; // Byte offset inside the current group that state j produces.
+        uint32_t state = pState->states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1); // The low TotalSymbolCountBits bits of the state select the symbol.
+        const uint8_t symbol = pState->hist.cumulInv[slot]; // cumulInv maps a slot directly to its symbol byte.
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; // Undoes the encoder step `((state / count) << bits) + cumul + state % count`.
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *pReadHead;
+          state = read ? newState : state;
+          pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *pReadHead;
+            pReadHead++;
+          }
+        }
+
+        pState->states[j] = state; // Write the updated state back into the shared decode state.
+      }
+    }
+
+    return pReadHead;
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits) // Seeds *pDecHist from *pIncompleteHist via memcpy and returns the result of inplace_make_hist_dec; the runtime totalSymbolCountBits argument is unused. NOTE(review): confirm hist_dec_t and hist_t agree on the layout of the copied bytes.
+{
+  (void)totalSymbolCountBits;
+
+  memcpy(pDecHist, pIncompleteHist, sizeof(hist_t));
+
+  return inplace_make_hist_dec(pDecHist);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type> // Decodes a block_rANS32x32_16w stream from pInData into pOutData using decoder `Impl`; returns the decoded byte count, or 0 when the input is rejected.
+size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) // Reject inputs too short for two uint64 fields, StateCount uint32 states and 256 uint16 histogram entries.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  size_t inputIndex = 0;
+  const uint64_t expectedOutputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // First header field: number of bytes to decode.
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity) // The destination buffer must hold the whole decoded output.
+    return 0;
+
+  const uint64_t expectedInputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // Second header field: input length the stream claims to need.
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength) // NOTE(review): only this claimed length is validated; later reads through the read head are not bounds-checked against inLength.
+    return 0;
+
+  _rans_decode_state_t<hist_type> decodeState;
+
+  for (size_t i = 0; i < StateCount; i++) // The initial states follow the two length fields as StateCount raw uint32 values.
+  {
+    decodeState.states[i] = *reinterpret_cast<const uint32_t *>(pInData + inputIndex);
+    inputIndex += sizeof(uint32_t);
+  }
+
+  const uint16_t *pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+  const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
+  size_t i = 0;
+  hist_t hist;
+
+  do
+  {
+    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(pReadHead);
+    pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      hist.symbolCount[j] = *pReadHead;
+      pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return 0;
+
+    uint64_t blockEndInStates = (i + blockSize); // Output index where this block's histogram stops applying. NOTE(review): i + blockSize is not checked for overflow, and outLengthInStates wraps when expectedOutputLength < StateCount - 1.
+
+    if (blockEndInStates > outLengthInStates)
+      blockEndInStates = outLengthInStates; // Clamp so decode_section never starts a StateCount-byte group that would run past the output.
+    else if ((blockEndInStates & (StateCount - 1)) != 0)
+      return 0;
+
+    pReadHead = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pReadHead, pOutData, i, blockEndInStates);
+
+    i = blockEndInStates;
+
+    if (i + StateCount > outLengthInStates)
+      break;
+  }
+  while (i < outLengthInStates);
+
+  if (i < expectedOutputLength)
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec, &hist, sizeof(hist));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return 0;
+
+    for (size_t j = 0; j < StateCount; j++) // Tail pass: at most one more symbol per state, decoded with the last histogram read.
+    {
+      const uint8_t index = _Rans32x32_idx2idx[j];
+
+      if (i + index < expectedOutputLength) // Skip states whose byte in this final group lies beyond the output.
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol]; // Same state update as the scalar decode_section.
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *pReadHead;
+          state = read ? newState : state;
+          pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *pReadHead;
+            pReadHead++;
+          }
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  return expectedOutputLength; // Success: report the decoded length taken from the header.
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<15, r32x32_dt_scalar, hist_dec_t<15>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 15, scalar decoder.
+size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<14, r32x32_dt_scalar, hist_dec_t<14>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 14, scalar decoder.
+size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<13, r32x32_dt_scalar, hist_dec_t<13>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 13, scalar decoder.
+size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<12, r32x32_dt_scalar, hist_dec_t<12>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 12, scalar decoder.
+size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<11, r32x32_dt_scalar, hist_dec_t<11>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 11, scalar decoder.
+size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<10, r32x32_dt_scalar, hist_dec_t<10>>(pInData, inLength, pOutData, outCapacity); } // Public entry point: TotalSymbolCountBits = 10, scalar decoder.
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index 62cda4f..911b847 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -241,7 +241,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
   // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
   normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
-  blockBackPoint = inputBlockTargetIndex;
+  blockBackPoint = length;
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
@@ -324,7 +324,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
       // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
       observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
       normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
-      blockBackPoint = inputBlockTargetIndex;
+      blockBackPoint = inputIndex;
     }
   }
 
@@ -361,10 +361,3 @@ size_t block_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length
 size_t block_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
 size_t block_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
 size_t block_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
-
-size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
-size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
-size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
-size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
-size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }
-size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { (void)pInData; (void)inLength; (void)pOutData; (void)outCapacity; return 0; }

From 6ba49d9ca5d0d372ebaba97b5f0277811ce438b9 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 18:04:06 +0200
Subject: [PATCH 09/34] decoder fixed

---
 src/block_rANS32x32_16w_decode.cpp | 52 ++++++++++++++++--------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index fe2eef1..94c25ca 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -21,6 +21,7 @@ struct _rans_decode_state_t
 {
   uint32_t states[StateCount];
   hist_type hist;
+  const uint16_t *pReadHead;
 };
 
 enum rans32x32_decoder_type_t
@@ -31,13 +32,13 @@ enum rans32x32_decoder_type_t
 template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
 struct rans32x32_16w_decoder
 {
-  static const uint16_t *decode_section(_rans_decode_state_t<hist_type> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
+  static size_t decode_section(_rans_decode_state_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
 };
 
 template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
 {
-  static const uint16_t *decode_section(_rans_decode_state_t<hist_dec_t<TotalSymbolCountBits>> *pState, const uint16_t *pReadHead, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
 
@@ -59,16 +60,16 @@ struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<
         if constexpr (DecodeNoBranch)
         {
           const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *pReadHead;
+          const uint32_t newState = state << 16 | *pState->pReadHead;
           state = read ? newState : state;
-          pReadHead += (size_t)read;
+          pState->pReadHead += (size_t)read;
         }
         else
         {
           if (state < DecodeConsumePoint16)
           {
-            state = state << 16 | *pReadHead;
-            pReadHead++;
+            state = state << 16 | *pState->pReadHead;
+            pState->pReadHead++;
           }
         }
 
@@ -76,7 +77,7 @@ struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<
       }
     }
 
-    return pReadHead;
+    return i;
   }
 };
 
@@ -87,7 +88,7 @@ static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *
 {
   (void)totalSymbolCountBits;
 
-  memcpy(pDecHist, pIncompleteHist, sizeof(hist_t));
+  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
 
   return inplace_make_hist_dec(pDecHist);
 }
@@ -124,20 +125,20 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
     inputIndex += sizeof(uint32_t);
   }
 
-  const uint16_t *pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
   hist_t hist;
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(pReadHead);
-    pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
     for (size_t j = 0; j < 256; j++)
     {
-      hist.symbolCount[j] = *pReadHead;
-      pReadHead++;
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
     }
 
     if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
@@ -150,19 +151,22 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
     else if ((blockEndInStates & (StateCount - 1)) != 0)
       return 0;
 
-    pReadHead = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pReadHead, pOutData, i, blockEndInStates);
+    i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
 
-    i = blockEndInStates;
+    if (i > outLengthInStates)
+    {
+      if (i >= expectedOutputLength)
+        return expectedOutputLength;
+      else
+        break;
+    }
 
-    if (i + StateCount > outLengthInStates)
-      break;
-  }
-  while (i < outLengthInStates);
+  } while (i < outLengthInStates);
 
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec, &hist, sizeof(hist));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
@@ -184,16 +188,16 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
         if constexpr (DecodeNoBranch)
         {
           const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *pReadHead;
+          const uint32_t newState = state << 16 | *decodeState.pReadHead;
           state = read ? newState : state;
-          pReadHead += (size_t)read;
+          decodeState.pReadHead += (size_t)read;
         }
         else
         {
           if (state < DecodeConsumePoint16)
           {
-            state = state << 16 | *pReadHead;
-            pReadHead++;
+            state = state << 16 | *decodeState.pReadHead;
+            decodeState.pReadHead++;
           }
         }
 

From b7c38a7dd5a2b8c868ae48e6d81da129fbed4b77 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 18:32:23 +0200
Subject: [PATCH 10/34] SIMD for 13-15

---
 src/block_rANS32x32_16w_decode.cpp | 313 ++++++++++++++++++++++++++++-
 src/main.cpp                       |  12 +-
 2 files changed, 313 insertions(+), 12 deletions(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index 94c25ca..5ce9784 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -16,6 +16,12 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
 
 //////////////////////////////////////////////////////////////////////////
 
+extern const uint8_t _ShuffleLutShfl32[256 * 8];
+extern const uint8_t _ShuffleLutPerm32[256 * 8];
+extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
+
+//////////////////////////////////////////////////////////////////////////
+
 template <typename hist_type>
 struct _rans_decode_state_t
 {
@@ -27,6 +33,7 @@ struct _rans_decode_state_t
 enum rans32x32_decoder_type_t
 {
   r32x32_dt_scalar,
+  r32x32_dt_avx2_large_cache_15_to_13,
 };
 
 template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
@@ -81,6 +88,249 @@ struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<
   }
 };
 
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
+      return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_loadu_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const simd_t _1 = _mm256_set1_epi32(1);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // const uint8_t symbol = pHist->cumulInv[slot];
+    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
+    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
+    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
+    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
+
+    // since they were int32_t turn into uint8_t
+    symbol0 = _mm256_and_si256(symbol0, lower8);
+    symbol1 = _mm256_and_si256(symbol1, lower8);
+    symbol2 = _mm256_and_si256(symbol2, lower8);
+    symbol3 = _mm256_and_si256(symbol3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
+
+    // freq, cumul.
+    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
+    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
+    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
+    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0, 1.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_storeu_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // NOTE(review): WriteAligned is not used in the body below.
+  static size_t decode_section(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { // Forwards the section [startIndex, endIndex) to the AVX2 varA section decoder and returns its result.
+    return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); // `true` selects the ShuffleMask16 code path.
+  }
+};
+
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits>
@@ -93,6 +343,17 @@ static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *
   return inplace_make_hist_dec(pDecHist);
 }
 
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{ // Completes `pIncompleteHist` in place, then builds the `hist_dec2_t` decode histogram from it.
+  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
+    return false; // completion failed: `pDecHist` is left untouched by this function.
+
+  make_dec2_hist(pDecHist, pIncompleteHist); // fills `pDecHist` from the completed histogram.
+
+  return true;
+}
+
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
@@ -211,9 +472,49 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
 
 //////////////////////////////////////////////////////////////////////////
 
-size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<15, r32x32_dt_scalar, hist_dec_t<15>>(pInData, inLength, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<14, r32x32_dt_scalar, hist_dec_t<14>>(pInData, inLength, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<13, r32x32_dt_scalar, hist_dec_t<13>>(pInData, inLength, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<12, r32x32_dt_scalar, hist_dec_t<12>>(pInData, inLength, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<11, r32x32_dt_scalar, hist_dec_t<11>>(pInData, inLength, pOutData, outCapacity); }
-size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x32_16w_decode<10, r32x32_dt_scalar, hist_dec_t<10>>(pInData, inLength, pOutData, outCapacity); }
+template <uint32_t TotalSymbolCountBits>
+static size_t block_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Runtime dispatch between the AVX2 and the scalar block decoder; returns whatever the selected decoder returns.
+  _DetectCPUFeatures(); // presumably populates `avx2Supported` (defined outside this view) - TODO confirm.
+
+  if (avx2Supported) // only take the AVX2 path when the CPU reports support for it.
+  {
+    if constexpr (TotalSymbolCountBits >= 13)
+      return block_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+  }
+
+  // Fallback.
+  return block_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_scalar, hist_dec_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+size_t block_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 15.
+  return block_rANS32x32_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 14.
+  return block_rANS32x32_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 13.
+  return block_rANS32x32_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 12.
+  return block_rANS32x32_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 11.
+  return block_rANS32x32_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t block_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ // Non-template entry point: fixed instantiation of the decode wrapper with TotalSymbolCountBits = 10.
+  return block_rANS32x32_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity);
+}
diff --git a/src/main.cpp b/src/main.cpp
index 15a9075..d076555 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -149,12 +149,12 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8
 
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w (adaptive blocks)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  { "rANS32x32 16w (adaptive blocks)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},

From 8e958b6dc6678922cf6f1bd5da90f00a0c533e48 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 18:46:09 +0200
Subject: [PATCH 11/34] improving clang compat

---
 src/block_rANS32x32_16w_decode.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index 5ce9784..f97e1c3 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -34,6 +34,9 @@ enum rans32x32_decoder_type_t
 {
   r32x32_dt_scalar,
   r32x32_dt_avx2_large_cache_15_to_13,
+  r32x32_dt_avx2_small_cache_15_to_13,
+  r32x32_dt_avx2_large_cache_12_to_10,
+  r32x32_dt_avx2_small_cache_12_to_10,
 };
 
 template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
@@ -89,6 +92,9 @@ struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<
 };
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
 static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
 {
   if constexpr (!WriteAligned32)
@@ -110,7 +116,6 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<his
   const simd_t lower8 = _mm256_set1_epi32(0xFF);
   const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
   const simd_t _16 = _mm256_set1_epi32(16);
-  const simd_t _1 = _mm256_set1_epi32(1);
   const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
   const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
 
@@ -331,6 +336,16 @@ struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCou
   }
 };
 
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{ // Same as the `large_cache_15_to_13` specialization, but instantiates varA with ShuffleMask16 = false.
+  template <bool WriteAligned = false> // NOTE(review): WriteAligned is not used in the body below.
+  static size_t decode_section(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { // Forwards the section [startIndex, endIndex) to the AVX2 varA section decoder and returns its result.
+    return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); // `false`: ShuffleMask16 disabled.
+  }
+};
+
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits>

From 04d52d9fc147ca468bbbd94c75474a139ab8f0fe Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 19:05:34 +0200
Subject: [PATCH 12/34] adding 10-12

---
 src/block_rANS32x32_16w_decode.cpp | 264 +++++++++++++++++++++++++++++
 src/main.cpp                       |  49 +++++-
 2 files changed, 312 insertions(+), 1 deletion(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index f97e1c3..fe5a8be 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -102,6 +102,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<his
       return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
 
   constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
 
   typedef __m256i simd_t;
   simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
@@ -326,6 +327,236 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<his
   return i;
 }
 
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{ // AVX2 section decoder for the packed histogram: decodes from `startIndex` in steps of `StateCount` symbols until `i >= endIndex`, returns the final `i`.
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0) // output pointer is aligned to `StateCount` bytes: re-dispatch to the streaming-store instantiation.
+      return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  static_assert(TotalSymbolCountBits <= 12); // `cumul` is unpacked from a 12 bit field (bits 8..19) of the pack below.
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) // load all decoder states into registers.
+    statesX8[i] = _mm256_loadu_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // retrieve pack: one 32 bit entry per slot, unpacked below as symbol (bits 0..7), cumul (bits 8..19), freq (bits 20..31).
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot3, sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
+    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
+    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
+    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); // NOTE(review): needs `pOutData + i` 32 byte aligned, i.e. presumably `startIndex` is a multiple of 32 - confirm at the call site.
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for block 0 (blocks 1, 2, 3 are read after the read head has been advanced).
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks (aligned loads: assumes `_DoubleShuffleLutShfl32` is 16 byte aligned - TODO confirm).
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16 (the shift count is 16 in lanes where cmp is all ones and 0 elsewhere).
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups: duplicate each of the low 8 LUT bytes into both bytes of a 16 bit lane, then set bit 0 of the upper byte.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (the shift count is 16 in lanes where cmp is all ones and 0 elsewhere).
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) // write the register-held states back to `pState->states`.
+    _mm256_storeu_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
 template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
 {
@@ -346,6 +577,26 @@ struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCou
   }
 };
 
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{ // Decoder specialization for the packed histogram (`hist_dec_pack_t`); instantiates varC with ShuffleMask16 = true.
+  template <bool WriteAligned = false> // NOTE(review): WriteAligned is not used in the body below.
+  static size_t decode_section(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { // Forwards the section [startIndex, endIndex) to the AVX2 varC section decoder and returns its result.
+    return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); // `true` selects the ShuffleMask16 code path.
+  }
+};
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{ // Decoder specialization for the packed histogram (`hist_dec_pack_t`); instantiates varC with ShuffleMask16 = false.
+  template <bool WriteAligned = false> // NOTE(review): WriteAligned is not used in the body below.
+  static size_t decode_section(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { // Forwards the section [startIndex, endIndex) to the AVX2 varC section decoder and returns its result.
+    return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); // `false`: ShuffleMask16 disabled.
+  }
+};
+
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits>
@@ -369,6 +620,17 @@ static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t
   return true;
 }
 
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{ // Completes `pIncompleteHist` in place, then builds the packed decode histogram (`hist_dec_pack_t`) from it.
+  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
+    return false; // completion failed: `pDecHist` is left untouched by this function.
+
+  make_dec_pack_hist(pDecHist, pIncompleteHist); // fills `pDecHist` from the completed histogram.
+
+  return true; // reached only when histogram completion succeeded.
+}
+
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
@@ -496,6 +758,8 @@ static size_t block_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_
   {
     if constexpr (TotalSymbolCountBits >= 13)
       return block_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+    else
+      return block_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
   }
 
   // Fallback.
diff --git a/src/main.cpp b/src/main.cpp
index d076555..80650d0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -51,6 +51,8 @@ static size_t _HistMin = 10;
 static bool _Include32Block = false;
 static bool _IncludeRaw = false;
 static size_t _RunCount = 8;
+static size_t _EncodeRunCount = 2;
+static size_t _DecodeRunCount = 16;
 
 constexpr size_t MaxRunCount = 256;
 static uint64_t _ClocksPerRun[MaxRunCount];
@@ -202,6 +204,8 @@ const char ArgumentIncludeRaw[] = "--include-raw";
 const char ArgumentNoSleep[] = "--no-sleep";
 const char ArgumentCpuCore[] = "--cpu-core";
 const char ArgumentRuns[] = "--runs";
+const char ArgumentRunsEncode[] = "--runs-enc";
+const char ArgumentRunsDecode[] = "--runs-dec";
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -217,7 +221,10 @@ int32_t main(const int32_t argc, char **pArgv)
     printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore);
     printf("\t%s \tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw);
     printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw);
-    printf("\t%s <uint>\tRun the benchmark for a specified amount of times (default: 8)\n", ArgumentNoSleep);
+    printf("\t%s <uint>\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode);
+    printf("\t%s <uint>\tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode);
+    printf("\t%s <uint>\tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode);
+    printf("\t%s \tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep);
     return 1;
   }
 
@@ -270,6 +277,44 @@ int32_t main(const int32_t argc, char **pArgv)
           _DisableSleep = true;
         }
 
+        _EncodeRunCount = _DecodeRunCount = _RunCount;
+
+        argIndex += 2;
+        argsRemaining -= 2;
+      }
+      else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRunsEncode, sizeof(ArgumentRunsEncode)) == 0)
+      {
+        _EncodeRunCount = strtoull(pArgv[argIndex + 1], nullptr, 10);
+
+        if (_EncodeRunCount > MaxRunCount)
+        {
+          puts("Invalid Parameter.");
+          return 1;
+        }
+        else if (_EncodeRunCount == 0)
+        {
+          _EncodeRunCount = 1;
+          _DisableSleep = true;
+        }
+
+        argIndex += 2;
+        argsRemaining -= 2;
+      }
+      else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRunsDecode, sizeof(ArgumentRunsDecode)) == 0)
+      {
+        _DecodeRunCount = strtoull(pArgv[argIndex + 1], nullptr, 10);
+
+        if (_DecodeRunCount > MaxRunCount)
+        {
+          puts("Invalid Parameter.");
+          return 1;
+        }
+        else if (_DecodeRunCount == 0)
+        {
+          _DecodeRunCount = 1;
+          _DisableSleep = true;
+        }
+
         argIndex += 2;
         argsRemaining -= 2;
       }
@@ -445,6 +490,7 @@ int32_t main(const int32_t argc, char **pArgv)
     printf("%-37s %2" PRIu32 " | -------- | ---------------- | ------------------------------------ | -------------- | ------------------------------------\n", _Codecs[codecId].name, _Codecs[codecId].totalSymbolCountBits);
 
     size_t encodedSize = 0;
+    _RunCount = _EncodeRunCount;
 
     for (size_t i = 0; i < MaxEncoderCount; i++)
     {
@@ -510,6 +556,7 @@ int32_t main(const int32_t argc, char **pArgv)
     }
 
     size_t decodedSize = 0;
+    _RunCount = _DecodeRunCount;
 
     for (size_t i = 0; i < MaxDecoderCount; i++)
     {

From 9c11b44924db9bb87657979d4541df75b3cce91f Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 22:20:29 +0200
Subject: [PATCH 13/34] Trying to choose good cutoff values

---
 src/block_rANS32x32_16w_decode.cpp |  6 +++
 src/block_rANS32x32_16w_encode.cpp | 63 ++++++++++++++++++------------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index fe5a8be..ab59d85 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -25,7 +25,13 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
 template <typename hist_type>
 struct _rans_decode_state_t
 {
+#ifdef _MSC_VER
+  __declspec(align(32))
+#else
+  __attribute__((aligned(32)))
+#endif
   uint32_t states[StateCount];
+
   hist_type hist;
   const uint16_t *pReadHead;
 };
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index 911b847..04e8306 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -9,8 +9,9 @@
 constexpr size_t StateCount = 32; // Needs to be a power of two.
 constexpr bool EncodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
-constexpr size_t MinBlockSizeBits = 15;
-constexpr size_t MinBlockSize = 1 << MinBlockSizeBits;
+
+constexpr size_t MinMinBlockSizeBits = 15;
+constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits;
 
 template <size_t TotalSymbolCountBits>
 struct HistReplaceMul
@@ -25,10 +26,29 @@ template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { ret
 template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } };
 template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } };
 
+template <size_t TotalSymbolCountBits>
+struct MinBlockSizeBits
+{
+  constexpr static size_t GetValue();
+};
+
+template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } };
+template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } };
+template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } };
+template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 20; } };
+
+template <uint32_t TotalSymbolCountBits>
+constexpr size_t MinBlockSize()
+{
+  return (size_t)1 << MinBlockSizeBits<TotalSymbolCountBits>::GetValue();
+}
+
 size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 {
   const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t);
-  const size_t blockCount = (inputSize + MinBlockSize) / MinBlockSize + 1;
+  const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1;
   const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t);
 
   return baseSize + blockCount * perBlockExtraSize; // inputIndex hope this covers all of our bases.
@@ -127,7 +147,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
 
   hist_t newHist;
 
-  if constexpr (!IsSafeHist && TotalSymbolCountBits == MinBlockSizeBits)
+  if constexpr (TotalSymbolCountBits == MinBlockSizeBits<TotalSymbolCountBits>::GetValue())
   {
     for (size_t j = 0; j < 256; j++)
       newHist.symbolCount[j] = (uint16_t)symCount[j];
@@ -142,17 +162,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
   }
   else
   {
-    if constexpr (IsSafeHist)
-    {
-      for (size_t j = 0; j < 256; j++)
-        symCount[j]++;
-
-      normalize_hist(&newHist, symCount, MinBlockSize + 256, TotalSymbolCountBits);
-    }
-    else
-    {
-      normalize_hist(&newHist, symCount, MinBlockSize, TotalSymbolCountBits);
-    }
+    normalize_hist(&newHist, symCount, MinBlockSize<TotalSymbolCountBits>(), TotalSymbolCountBits);
   }
 
   constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
@@ -170,7 +180,7 @@ static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffs
         continue;
 
       const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
-      const float after = (symCount[j] - 1) * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
 
       costBefore -= before;
       costAfter -= after;
@@ -208,15 +218,16 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
   constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
 
   constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
 
   _rans_encode_state_t encodeState;
   encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
   encodeState.pStart = encodeState.pEnd;
   
-  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSize - 1));
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1));
 
-  if (inputBlockTargetIndex > MinBlockSize)
-    inputBlockTargetIndex -= MinBlockSize;
+  if (inputBlockTargetIndex > MinBlockSizeX)
+    inputBlockTargetIndex -= MinBlockSizeX;
 
   size_t blockBackPoint = length;
 
@@ -232,8 +243,8 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
 
   while (inputBlockTargetIndex > 0)
   {
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount))
-      inputBlockTargetIndex -= MinBlockSize;
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSizeX;
     else
       break;
   }
@@ -302,21 +313,21 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
 
     // Determine new histogram.
     {
-      inputBlockTargetIndex -= MinBlockSize;
+      inputBlockTargetIndex -= MinBlockSizeX;
 
-      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSize);
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
 
       if constexpr (IsSafeHist)
         for (size_t j = 0; j < 256; j++)
           if (symCount[j] == 0)
             symCount[j] = 1;
 
-      normalize_hist(&encodeState.hist, symCount, MinBlockSize, TotalSymbolCountBits);
+      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
 
       while (inputBlockTargetIndex > 0)
       {
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSize, MinBlockSize, &encodeState.hist, symCount))
-          inputBlockTargetIndex -= MinBlockSize;
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSizeX;
         else
           break;
       }

From d4ca35afd0bcea34ba45dbef5e73b95cc781b5d0 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Wed, 5 Jul 2023 23:19:06 +0200
Subject: [PATCH 14/34] Fixing issue with histogram creation

---
 src/block_rANS32x32_16w_decode.cpp |  4 ++--
 src/block_rANS32x32_16w_encode.cpp | 11 ++++++++++-
 src/hist.cpp                       |  7 ++++++-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index ab59d85..a616441 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -114,7 +114,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<his
   simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
 
   for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_loadu_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
 
   size_t i = startIndex;
 
@@ -328,7 +328,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<his
   }
 
   for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_storeu_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
 
   return i;
 }
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index 04e8306..5a53a10 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -234,12 +234,21 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
   uint32_t symCount[256];
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
+  size_t extraCount = 0;
+
   if constexpr (IsSafeHist)
+  {
     for (size_t j = 0; j < 256; j++)
+    {
       if (symCount[j] == 0)
+      {
         symCount[j] = 1;
+        extraCount++;
+      }
+    }
+  }
 
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
   while (inputBlockTargetIndex > 0)
   {
diff --git a/src/hist.cpp b/src/hist.cpp
index 4124ef7..1c2179a 100644
--- a/src/hist.cpp
+++ b/src/hist.cpp
@@ -154,7 +154,7 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy
             capped[i]++;
             cappedSum++;
 
-            if (cappedSum == totalSymbolCount + 1)
+            if (cappedSum == totalSymbolCount)
               goto hist_ready;
           }
         }
@@ -173,6 +173,11 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy
     pHist->symbolCount[i] = capped[i];
     counter += capped[i];
   }
+
+#if defined(_DEBUG) && defined(_MSC_VER)
+  if (counter != totalSymbolCount)
+    __debugbreak();
+#endif
 }
 
 void make_hist(hist_t *pHist, const uint8_t *pData, const size_t size, const size_t totalSymbolCountBits)

From a0c04e4b95a2c9f4de074ab2c3dbc4e98187f36e Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Thu, 6 Jul 2023 00:39:35 +0200
Subject: [PATCH 15/34] 32x64 block based (no tweaking yet)

---
 src/block_rANS32x32_16w_decode.cpp |   24 +-
 src/block_rANS32x32_16w_encode.cpp |    8 +-
 src/block_rANS32x64_16w.h          |   22 +
 src/block_rANS32x64_16w_decode.cpp | 1860 ++++++++++++++++++++++++++++
 src/block_rANS32x64_16w_encode.cpp |  390 ++++++
 src/main.cpp                       |   24 +-
 6 files changed, 2305 insertions(+), 23 deletions(-)
 create mode 100644 src/block_rANS32x64_16w.h
 create mode 100644 src/block_rANS32x64_16w_decode.cpp
 create mode 100644 src/block_rANS32x64_16w_encode.cpp

diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index a616441..dc4b341 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -23,7 +23,7 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
 //////////////////////////////////////////////////////////////////////////
 
 template <typename hist_type>
-struct _rans_decode_state_t
+struct _rans_decode_state32_t
 {
 #ifdef _MSC_VER
   __declspec(align(32))
@@ -48,13 +48,13 @@ enum rans32x32_decoder_type_t
 template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
 struct rans32x32_16w_decoder
 {
-  static size_t decode_section(_rans_decode_state_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
+  static size_t decode_section(_rans_decode_state32_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
 };
 
 template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
 {
-  static size_t decode_section(_rans_decode_state_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
 
@@ -101,7 +101,7 @@ template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32
 #ifndef _MSC_VER
 __attribute__((target("avx2")))
 #endif
-static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
 {
   if constexpr (!WriteAligned32)
     if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
@@ -337,7 +337,7 @@ template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32
 #ifndef _MSC_VER
 __attribute__((target("avx2")))
 #endif
-static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
 {
   if constexpr (!WriteAligned32)
     if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
@@ -350,7 +350,7 @@ static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t<his
   simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
 
   for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_loadu_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
 
   size_t i = startIndex;
 
@@ -558,7 +558,7 @@ static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state_t<his
   }
 
   for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_storeu_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
 
   return i;
 }
@@ -567,7 +567,7 @@ template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
 {
   template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
   }
@@ -577,7 +577,7 @@ template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
 {
   template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
   }
@@ -587,7 +587,7 @@ template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
 {
   template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
   }
@@ -597,7 +597,7 @@ template <uint32_t TotalSymbolCountBits>
 struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
 {
   template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
   {
     return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
   }
@@ -661,7 +661,7 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
   if (inLength < expectedInputLength)
     return 0;
 
-  _rans_decode_state_t<hist_type> decodeState;
+  _rans_decode_state32_t<hist_type> decodeState;
 
   for (size_t i = 0; i < StateCount; i++)
   {
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index 5a53a10..ec85a1d 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -61,7 +61,7 @@ static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
 
 //////////////////////////////////////////////////////////////////////////
 
-struct _rans_encode_state_t
+struct _rans_encode_state32_t
 {
   uint32_t states[StateCount];
   hist_t hist;
@@ -77,14 +77,14 @@ template <rans32x32_encoder_type_t type>
 struct rans32x32_16w_encoder
 {
   template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
 };
 
 template <>
 struct rans32x32_16w_encoder<r32x32_et_scalar>
 {
   template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
   {
     int64_t targetCmp = targetIndex + StateCount;
 
@@ -220,7 +220,7 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
   constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
   constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
 
-  _rans_encode_state_t encodeState;
+  _rans_encode_state32_t encodeState;
   encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
   encodeState.pStart = encodeState.pEnd;
   
diff --git a/src/block_rANS32x64_16w.h b/src/block_rANS32x64_16w.h
new file mode 100644
index 0000000..a06166e
--- /dev/null
+++ b/src/block_rANS32x64_16w.h
@@ -0,0 +1,22 @@
+#ifndef block_rANS32x64_16w_h__
+#define block_rANS32x64_16w_h__
+
+#include "hist.h"
+
+size_t block_rANS32x64_16w_capacity(const size_t inputSize);
+
+size_t block_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+
+size_t block_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t block_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+
+#endif // block_rANS32x64_16w_h__
diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp
new file mode 100644
index 0000000..1e7b4c8
--- /dev/null
+++ b/src/block_rANS32x64_16w_decode.cpp
@@ -0,0 +1,1860 @@
+#include "block_rANS32x64_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t StateCount = 64; // Needs to be a power of two.
+constexpr bool DecodeNoBranch = false;
+
+//////////////////////////////////////////////////////////////////////////
+
+static const uint8_t _Rans32x64_idx2idx[] =
+{
+  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
+  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
+  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
+};
+
+static_assert(sizeof(_Rans32x64_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+
+extern const uint8_t _ShuffleLutShfl32[256 * 8];
+extern const uint8_t _ShuffleLutPerm32[256 * 8];
+extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
+
+//////////////////////////////////////////////////////////////////////////
+
+template <typename hist_type>
+struct _rans_decode_state64_t
+{
+#ifdef _MSC_VER
+  __declspec(align(64))
+#else
+  __attribute__((aligned(64)))
+#endif
+  uint32_t states[StateCount];
+
+  hist_type hist;
+  const uint16_t *pReadHead;
+};
+
+enum rans32x64_decoder_type_t
+{
+  r32x64_dt_scalar,
+  r32x64_dt_avx2_large_cache_15_to_13,
+  r32x64_dt_avx2_small_cache_15_to_13,
+  r32x64_dt_avx2_large_cache_12_to_10,
+  r32x64_dt_avx2_small_cache_12_to_10,
+  r32x64_dt_avx512_large_cache_15_to_13,
+  r32x64_dt_avx512_small_cache_15_to_13,
+  r32x64_dt_avx512_large_cache_12_to_10,
+  r32x64_dt_avx512_small_cache_12_to_10,
+};
+
+template <rans32x64_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
+struct rans32x64_16w_decoder
+{
+  static size_t decode_section(_rans_decode_state64_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
+};
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
+{
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+    size_t i = startIndex;
+
+    for (; i < endIndex; i += StateCount)
+    {
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        const uint8_t index = _Rans32x64_idx2idx[j];
+        uint32_t state = pState->states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = pState->hist.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol];
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *pState->pReadHead;
+          state = read ? newState : state;
+          pState->pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *pState->pReadHead;
+            pState->pReadHead++;
+          }
+        }
+
+        pState->states[j] = state;
+      }
+    }
+
+    return i;
+  }
+};
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false> // AVX2 rANS section decoder: every loop iteration decodes one symbol per state lane (8 registers x 8 lanes) and renormalizes the states with 16 bit words from the input.
+#ifndef _MSC_VER
+__attribute__((target("avx2"))) // GCC/Clang: permit AVX2 instructions inside this function.
+#endif
+static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{ // Reads states, hist and pReadHead from pState; writes symbols to pOutData + i for i = startIndex, startIndex + StateCount, ... while i < endIndex; stores states/pReadHead back and returns the final i.
+  if constexpr (!WriteAligned32) // only the entry instantiation checks the alignment; the `true` instantiation uses streaming stores.
+    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0) // NOTE(review): tests pOutData, not pOutData + startIndex; presumably callers keep pOutData + i 32 byte aligned as well - confirm.
+      return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits); // number of slots a state can map to.
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; // one register per 8 states; indices 0..7 are used below, so StateCount is presumably 64 here - TODO confirm.
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) // copy the decoder states into registers (aligned loads, so pState->states has to be 32 byte aligned).
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex; // output index of the first symbol written by the current iteration.
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1); // extracts the slot (low TotalSymbolCountBits bits) of a state.
+  const simd_t lower16 = _mm256_set1_epi32(0xFFFF); // extracts the freq field of a hist.symbols entry.
+  const simd_t lower8 = _mm256_set1_epi32(0xFF); // keeps the symbol byte of a 32 bit gather result.
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16); // states below this value consume a new 16 bit word.
+  const simd_t _16 = _mm256_set1_epi32(16); // per-lane shift amount for states that consume a word.
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); // pshufb control duplicating bytes 0..7 into byte pairs (only used if !ShuffleMask16).
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); // sets bit 0 in the upper byte of every byte pair (only used if !ShuffleMask16).
+
+  for (; i < endIndex; i += StateCount) // NOTE(review): every iteration writes 2 x 32 bytes at pOutData + i without clamping to endIndex; presumably endIndex - startIndex is a multiple of StateCount - confirm.
+  {
+    // scalar equivalent: const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
+    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
+    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
+    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
+
+    // scalar equivalent: const uint8_t symbol = pHist->cumulInv[slot]; (gathered as 32 bit at byte offset `slot`, so up to 3 bytes behind the wanted byte are read as well.)
+    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
+    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
+    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
+    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
+    simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot4, sizeof(uint8_t));
+    simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot5, sizeof(uint8_t));
+    simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot6, sizeof(uint8_t));
+    simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot7, sizeof(uint8_t));
+
+    // the gather loaded 32 bit per lane: keep only the lowest byte, which is the symbol.
+    symbol0 = _mm256_and_si256(symbol0, lower8);
+    symbol1 = _mm256_and_si256(symbol1, lower8);
+    symbol2 = _mm256_and_si256(symbol2, lower8);
+    symbol3 = _mm256_and_si256(symbol3, lower8);
+    symbol4 = _mm256_and_si256(symbol4, lower8);
+    symbol5 = _mm256_and_si256(symbol5, lower8);
+    symbol6 = _mm256_and_si256(symbol6, lower8);
+    symbol7 = _mm256_and_si256(symbol7, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here. (i.e. the lane-interleaved byte order shown above is written to the output unchanged.)
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); // non-temporal store; the address has to be 32 byte aligned.
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
+    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
+    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
+
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567); // second 32 output bytes of this iteration.
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+
+    // retrieve pack: one 32 bit entry of hist.symbols per decoded symbol (freq in the lower, cumul in the upper 16 bit, see below).
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
+    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol4, sizeof(uint32_t));
+    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol5, sizeof(uint32_t));
+    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol6, sizeof(uint32_t));
+    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol7, sizeof(uint32_t));
+
+    // split pack into cumul (upper 16 bit) and freq (lower 16 bit).
+    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
+    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
+    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
+    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
+    const simd_t cumul4 = _mm256_srli_epi32(pack4, 16);
+    const simd_t freq4 = _mm256_and_si256(pack4, lower16);
+    const simd_t cumul5 = _mm256_srli_epi32(pack5, 16);
+    const simd_t freq5 = _mm256_and_si256(pack5, lower16);
+    const simd_t cumul6 = _mm256_srli_epi32(pack6, 16);
+    const simd_t freq6 = _mm256_and_si256(pack6, lower16);
+    const simd_t cumul7 = _mm256_srli_epi32(pack7, 16);
+    const simd_t freq7 = _mm256_and_si256(pack7, lower16);
+
+    // scalar equivalent: const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
+    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
+    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
+    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
+
+    // scalar equivalent: const uint32_t freqScaled = shiftedState * freq; (lower 32 bit of the product.)
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
+    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
+    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
+    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
+
+    // scalar equivalent: state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
+    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
+    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
+    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
+
+    // now to the messy part... (renormalization: lanes whose state fell below DecodeConsumePoint16 are shifted up by 16 bit and filled with a 16 bit word read at pReadHead.)
+    {
+      // read input for blocks 0: always a full 16 byte load (8 candidate words). NOTE(review): the input presumably has to stay readable 16 bytes past the last consumed word - confirm.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0); // NOTE(review): signed 32 bit compare; presumably states never have bit 31 set - confirm.
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
+      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
+      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
+      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
+
+      if constexpr (ShuffleMask16) // variant whose LUT has 16 elements per mask value and is used directly as pshufb control.
+      {
+        // get masks of those compares & start loading shuffle masks. (one mask bit per lane; the 8 bit mask selects the LUT entry, loaded with an aligned 16 byte load.)
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. (each block's 16 byte load starts behind the words consumed by the previous block, i.e. popcount of its mask.)
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0; // advances by elements of pReadHead (presumably 16 bit words - confirm against the state struct).
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // matching: state << 16 (the shift count is 16 in consuming lanes and 0 in all others.)
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place. (byte shuffle of the loaded words, controlled by the LUT entry of the block's mask.)
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word: zero-extend 8 x 16 bit to 8 x 32 bit.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord; (NOTE(review): relies on newWord being 0 in non-consuming lanes, presumably guaranteed by the LUT contents - confirm.)
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+      else // variant whose LUT has 8 elements per mask value; the pshufb control is expanded from it at runtime.
+      {
+        // get masks of those compares & start loading shuffle masks. (NOTE(review): each load reads 16 bytes from an 8 element entry; only bytes 0..7 are used below, but the table presumably needs padding behind its last entry - confirm.)
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7. (each block's 16 byte load starts behind the words consumed by the previous block, i.e. popcount of its mask.)
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // finalize lookups: duplicate each of the lower 8 LUT bytes into a byte pair and set bit 0 of the upper byte (presumably turning an even byte offset into the lo/hi byte indices of one 16 bit word - confirm against the LUT definition).
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
+        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
+        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
+        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (the shift count is 16 in consuming lanes and 0 in all others.)
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place. (byte shuffle of the loaded words, controlled by the expanded LUT entry.)
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word: zero-extend 8 x 16 bit to 8 x 32 bit.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord; (NOTE(review): relies on newWord being 0 in non-consuming lanes, presumably guaranteed by the LUT contents - confirm.)
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) // write the register copies of the states back to pState (aligned stores).
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i; // first index that was not decoded: startIndex plus a multiple of StateCount, >= endIndex (startIndex itself if the loop never ran).
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0)
+      return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
+    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
+    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
+    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot3, sizeof(uint32_t));
+    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot4, sizeof(uint32_t));
+    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot5, sizeof(uint32_t));
+    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot6, sizeof(uint32_t));
+    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot7, sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
+    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
+    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
+    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
+    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
+    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
+    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
+    const simd_t symbol4 = _mm256_and_si256(pack4, lower8);
+    const simd_t symbol5 = _mm256_and_si256(pack5, lower8);
+    const simd_t symbol6 = _mm256_and_si256(pack6, lower8);
+    const simd_t symbol7 = _mm256_and_si256(pack7, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
+    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
+    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
+
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
+    const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12);
+    const simd_t freq4 = _mm256_srli_epi32(pack4, 20);
+    const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12);
+    const simd_t freq5 = _mm256_srli_epi32(pack5, 20);
+    const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12);
+    const simd_t freq6 = _mm256_srli_epi32(pack6, 20);
+    const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12);
+    const simd_t freq7 = _mm256_srli_epi32(pack7, 20);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
+    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
+    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
+    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
+    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
+    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
+    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
+      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
+      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
+      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
+        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
+        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
+        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false> // Decodes [startIndex, endIndex) in steps of `StateCount` symbols into `pOutData` using four 16 lane AVX-512 state vectors from `pState`; returns the first index that was not decoded.
+#ifndef _MSC_VER
+#ifdef __llvm__
+__attribute__((target("avx512bw")))
+#else
+__attribute__((target("avx512f", "avx512bw")))
+#endif
+#endif
+static size_t _block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned64) // only the entry instantiation checks the alignment.
+    if ((reinterpret_cast<size_t>(pOutData) & (64 - 1)) == 0) // `pOutData` is 64 byte aligned.
+      return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex); // re-enter with `WriteAligned64 = true` so the symbol store below uses `_mm512_stream_si512`.
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m512i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; // despite the name, each element holds 16 states (512 bit / 32 bit).
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t))); // load the current states from `pState`.
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); // extracts `slot` from a state.
+  const simd_t lower16 = _mm512_set1_epi32(0xFFFF); // extracts `freq` from a pack.
+  const simd_t lower8 = _mm512_set1_epi32(0xFF); // extracts the symbol byte from a gathered dword.
+  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); // states below this value consume a new 16 bit word.
+  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); // dword order applied to the packed symbols before storing (see `symPackCompat`).
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); // duplicates each of the low 8 bytes into a byte pair (only used if `!ShuffleMask16`).
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); // sets bit 0 in the upper byte of every byte pair (only used if `!ShuffleMask16`).
+
+  for (; i < endIndex; i += StateCount) // the body hard-codes `statesX8[0..3]`, i.e. presumes `StateCount == 64` - TODO confirm.
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
+
+    // retrieve symbol: 32 bit gather at byte offset `slot` (scale 1) into `hist.cumulInv`; only the lowest byte is kept further down.
+    simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t)); // NOTE(review): each gather touches 3 bytes past `slot`; presumably `cumulInv` is padded - confirm.
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    symbol0 = _mm512_and_si512(symbol0, lower8); // keep only the lowest byte of each gathered dword.
+    symbol1 = _mm512_and_si512(symbol1, lower8);
+    symbol2 = _mm512_and_si512(symbol2, lower8);
+    symbol3 = _mm512_and_si512(symbol3, lower8);
+
+    // retrieve pack: 32 bit entry of `hist.symbols` indexed by symbol (split into `freq` and `cumul` below).
+    const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+
+    // pack symbols to one si512 (32 -> 16 -> 8 bit with unsigned saturation; both pack steps operate per 128 bit lane).
+    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // 128 bit lane k now holds 4 bytes each from lane k of symbol0, symbol1, symbol2, symbol3 (in that order).
+    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // dword reorder into the layout that gets written out; we could get rid of this if we chose to reorder everything fittingly. NOTE(review): the resulting order is not simply state 0..63; presumably it matches what the encoder produced - confirm.
+
+    // freq (low 16 bit of pack), cumul (high 16 bit of pack).
+    const simd_t cumul0 = _mm512_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm512_and_si512(pack0, lower16);
+    const simd_t cumul1 = _mm512_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm512_and_si512(pack1, lower16);
+    const simd_t cumul2 = _mm512_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm512_and_si512(pack2, lower16);
+    const simd_t cumul3 = _mm512_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm512_and_si512(pack3, lower16);
+
+    // We intentionally encoded in a way to not have to do horrible things here: the 64 decoded bytes are written with a single store.
+    if constexpr (WriteAligned64)
+      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat); // non-temporal store, needs `pOutData + i` to be 64 byte aligned. NOTE(review): relies on `startIndex` being a multiple of 64 - confirm at the call sites.
+    else
+      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
+    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
+    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
+    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
+
+    // now to the messy part: refill every state that dropped below `DecodeConsumePoint16` with a 16 bit word from `pReadHead`.
+    {
+      // read input for block 0a (16 bytes = 8 candidate 16 bit words for lanes 0..7 of state0).
+      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
+      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
+      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
+      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // split each 16 lane mask into two 8 lane halves (a = lanes 0..7, b = lanes 8..15) & start loading shuffle masks (16 byte entries, aligned load).
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
+        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
+        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
+        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
+        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
+
+        // advance read head by the number of consuming lanes & read input for blocks 0b, 1a, 1b, 2a, 2b, 3a, 3b.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a; // NOTE(review): presumably `pReadHead` points to 16 bit words, so this skips one word per consuming lane - confirm.
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // matching: state << 16 (lanes whose mask bit is clear keep `state` unchanged).
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place (one 256 bit shuffle per state vector; `_mm256_shuffle_epi8` shuffles each 128 bit half on its own, so `a` and `b` don't mix).
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word (zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord; NOTE(review): only correct if the shuffled word is zero in lanes that did not consume; presumably guaranteed by the LUT contents - confirm.
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place (two separate 128 bit shuffles per state vector, joined when expanding).
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word (`b` becomes the upper half; zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord; NOTE(review): only correct if the shuffled word is zero in lanes that did not consume; presumably guaranteed by the LUT contents - confirm.
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+      else
+      {
+        // split each 16 lane mask into two 8 lane halves (a = lanes 0..7, b = lanes 8..15) & start loading shuffle masks (8 byte entries, loaded with a 16 byte unaligned load; only the low 8 bytes survive `shuffleDoubleMask`). NOTE(review): for mask 0xFF the load extends 8 bytes past that entry - confirm the table is padded.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
+        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
+        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
+        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
+        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
+
+        // advance read head by the number of consuming lanes & read input for blocks 0b, 1a, 1b, 2a, 2b, 3a, 3b.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a; // NOTE(review): presumably `pReadHead` points to 16 bit words, so this skips one word per consuming lane - confirm.
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // finalize lookups: expand the 8 control bytes `v` of an entry to 16 control bytes, each `v` becoming the byte pair (`v`, `v | 1`).
+        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (lanes whose mask bit is clear keep `state` unchanged).
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place (one 256 bit shuffle per state vector; `_mm256_shuffle_epi8` shuffles each 128 bit half on its own, so `a` and `b` don't mix).
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word (zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord; NOTE(review): only correct if the shuffled word is zero in lanes that did not consume; presumably guaranteed by the LUT contents - confirm.
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place (two separate 128 bit shuffles per state vector, joined when expanding).
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word (`b` becomes the upper half; zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord; NOTE(review): only correct if the shuffled word is zero in lanes that did not consume; presumably guaranteed by the LUT contents - confirm.
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]); // write the updated states back to `pState`.
+
+  return i; // first index that was not decoded (`startIndex` if the range was empty, otherwise >= `endIndex`).
+}
+
+// AVX-512 decode kernel for packed histograms (`hist_dec_pack_t`): decodes `StateCount` symbols per iteration for the indices [startIndex, endIndex) of `pOutData`.
+// States are loaded from and stored back to `pState->states`; `pState->pReadHead` advances by one 16 bit word per renormalized lane. Returns the loop index after the last processed group.
+// `ShuffleMask16`: use `_DoubleShuffleLutShfl32` directly instead of widening `_ShuffleLutShfl32` entries; `YmmShuffle`: one 256 bit shuffle instead of two 128 bit ones; `WriteAligned64`: non-temporal stores.
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
+#ifndef _MSC_VER
+#ifdef __llvm__
+__attribute__((target("avx512bw")))
+#else
+__attribute__((target("avx512f", "avx512bw")))
+#endif
+#endif
+static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned64)
+    if ((reinterpret_cast<size_t>(pOutData + startIndex) & (64 - 1)) == 0) // `_mm512_stream_si512` needs 64 byte aligned addresses: all six low bits (mask 63, not 62) of the first store address must be clear.
+      return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+  typedef __m512i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1); // cumul is a 12 bit field of the pack.
+  const simd_t lower8 = _mm512_set1_epi32(0xFF); // the symbol is the lowest byte of the pack.
+  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16);
+  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); // duplicates each of the low 8 bytes (only needed without `ShuffleMask16`).
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); // ORs 1 into the upper byte of each pair: index b becomes (b, b | 1).
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm512_and_si512(pack0, lower8);
+    const simd_t symbol1 = _mm512_and_si512(pack1, lower8);
+    const simd_t symbol2 = _mm512_and_si512(pack2, lower8);
+    const simd_t symbol3 = _mm512_and_si512(pack3, lower8);
+
+    // pack symbols to one si512.
+    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // the packs work per 128 bit lane, so each lane now holds 4 bytes of each of the four state vectors.
+    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly.
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm512_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm512_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm512_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm512_srli_epi32(pack3, 20);
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned64)
+      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+    else
+      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
+    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
+    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
+    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
+
+    // renormalization: every lane whose state is below `DecodeConsumePoint16` shifts in one 16 bit word from the read head.
+    {
+      // read input for blocks 0.
+      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
+      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
+      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
+      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
+        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
+        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
+        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
+        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place.
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place.
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
+        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
+        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
+        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
+        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // finalize lookups: widen every byte index b of the 8 byte entry to the byte pair (b, b | 1).
+        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place.
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place.
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+// Binds the `r32x64_dt_avx2_large_cache_15_to_13` tag (with `hist_dec2_t` tables) to the AVX2 `varA` kernel, instantiated with its second template argument set to `true`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx2_small_cache_15_to_13` tag (with `hist_dec2_t` tables) to the AVX2 `varA` kernel, instantiated with its second template argument set to `false`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx2_large_cache_12_to_10` tag (with `hist_dec_pack_t` tables) to the AVX2 `varC` kernel, instantiated with its second template argument set to `true`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx2_small_cache_12_to_10` tag (with `hist_dec_pack_t` tables) to the AVX2 `varC` kernel, instantiated with its second template argument set to `false`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx512_large_cache_15_to_13` tag (with `hist_dec2_t` tables) to the AVX-512 `varA` kernel, instantiated with the template flags `<true, true>`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx512_small_cache_15_to_13` tag (with `hist_dec2_t` tables) to the AVX-512 `varA` kernel, instantiated with the template flags `<false, true>`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` belongs to the common `decode_section` signature; this binding does not forward it.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx512_large_cache_12_to_10` tag (with `hist_dec_pack_t` tables) to the AVX-512 `varC` kernel with `ShuffleMask16 = true` and `YmmShuffle = true`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is not forwarded; the `varC` kernel inspects the output pointer itself to pick its aligned-store variant.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+// Binds the `r32x64_dt_avx512_small_cache_12_to_10` tag (with `hist_dec_pack_t` tables) to the AVX-512 `varC` kernel with `ShuffleMask16 = false` and `YmmShuffle = true`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is not forwarded; the `varC` kernel inspects the output pointer itself to pick its aligned-store variant.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Table setup for `hist_dec_t`: copies the raw symbol counts and lets `inplace_make_hist_dec` build the rest; returns its result.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  (void)totalSymbolCountBits; // not needed by this overload; kept so all overloads share one signature.
+  const auto &rawCounts = pIncompleteHist->symbolCount;
+  memcpy(&pDecHist->symbolCount, &rawCounts, sizeof(rawCounts));
+  return inplace_make_hist_dec(pDecHist);
+}
+
+// Table setup for `hist_dec2_t`: completes `pIncompleteHist` in place, then derives the decode table from it. Returns false (leaving `pDecHist` untouched) if completion fails.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool histCompleted = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (histCompleted)
+    make_dec2_hist(pDecHist, pIncompleteHist);
+  return histCompleted;
+}
+
+// Table setup for `hist_dec_pack_t`: completes `pIncompleteHist` in place, then derives the packed decode table from it. Returns false (leaving `pDecHist` untouched) if completion fails.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool histCompleted = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (histCompleted)
+    make_dec_pack_hist(pDecHist, pIncompleteHist);
+  return histCompleted;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decodes a block-wise rANS32x64 stream (16 bit renormalization words) from `pInData` into `pOutData`.
+// Returns the decoded length taken from the stream header, or 0 if the input is too short, the output doesn't fit, a histogram is rejected or a block header is out of bounds.
+template <uint32_t TotalSymbolCountBits, rans32x64_decoder_type_t Impl, typename hist_type>
+size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256)
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  constexpr size_t BlockHeaderSize = sizeof(uint64_t) + sizeof(uint16_t) * 256; // block size + 256 raw symbol counts.
+
+  size_t inputIndex = 0;
+  const uint64_t expectedOutputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex);
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  const uint64_t expectedInputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex);
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  _rans_decode_state64_t<hist_type> decodeState;
+
+  for (size_t i = 0; i < StateCount; i++)
+  {
+    decodeState.states[i] = *reinterpret_cast<const uint32_t *>(pInData + inputIndex);
+    inputIndex += sizeof(uint32_t);
+  }
+
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+  const uint8_t *const pInEnd = pInData + inLength;
+
+  // Indices below this can be decoded in whole `StateCount` wide steps. Clamped to 0, since `expectedOutputLength - StateCount + 1` wraps around for very short outputs.
+  const size_t outLengthInStates = expectedOutputLength >= StateCount ? (size_t)(expectedOutputLength - StateCount + 1) : 0;
+  size_t i = 0;
+  hist_t hist;
+
+  do
+  {
+    // every block starts with its size & histogram; reject headers that aren't fully inside the input (this also stops corrupt streams that never make progress).
+    const uint8_t *const pBlockHeader = reinterpret_cast<const uint8_t *>(decodeState.pReadHead);
+    if (pBlockHeader > pInEnd || (size_t)(pInEnd - pBlockHeader) < BlockHeaderSize)
+      return 0;
+
+    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    for (size_t j = 0; j < 256; j++)
+      hist.symbolCount[j] = *decodeState.pReadHead++;
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return 0;
+
+    // `i <= outLengthInStates` holds here, so this is `i + blockSize > outLengthInStates` without the risk of wrapping around.
+    const bool reachesEnd = blockSize > outLengthInStates - i;
+    const uint64_t blockEndInStates = reachesEnd ? outLengthInStates : i + blockSize;
+
+    if (!reachesEnd && (blockEndInStates & (StateCount - 1)) != 0) // a block that doesn't run past the vectorized range has to end on a multiple of `StateCount`.
+      return 0;
+
+    i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+  } while (i < outLengthInStates); // also left when the last step went past `outLengthInStates`.
+
+  // whatever is left is decoded one symbol per state, using the histogram of the last block.
+  if (i < expectedOutputLength)
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return 0;
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      const uint8_t index = _Rans32x64_idx2idx[j]; // output offset served by state `j`.
+
+      if (i + index < expectedOutputLength)
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *decodeState.pReadHead;
+          state = read ? newState : state;
+          decodeState.pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *decodeState.pReadHead;
+            decodeState.pReadHead++;
+          }
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Runtime dispatcher: detects the CPU features, then decodes with the AVX-512 kernels, else the AVX2 kernels, else the scalar implementation.
+template <uint32_t TotalSymbolCountBits>
+static size_t block_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr bool UseDec2Hist = TotalSymbolCountBits >= 13; // 15..13 bit: `hist_dec2_t` kernels, 12..10 bit: `hist_dec_pack_t` kernels.
+  _DetectCPUFeatures();
+  const bool isAmdZen3OrZen4 = _CpuVendor == cpu_vendor_AMD && _CpuFamily == cpu_family_amd_zen3_zen4; // the AVX-512 path is skipped on this CPU family.
+  if (avx512FSupported && avx512BWSupported && avx512DQSupported && !isAmdZen3OrZen4)
+  {
+    if constexpr (UseDec2Hist)
+      return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx512_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+    else
+      return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx512_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+  }
+  else if (avx2Supported)
+  {
+    if constexpr (UseDec2Hist)
+      return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+    else
+      return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+  }
+  return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_scalar, hist_dec_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity); // Fallback.
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 15`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity); }
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 14`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity); }
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 13`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity); }
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 12`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity); }
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 11`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity); }
+
+// Public entry point: instantiates the decode wrapper with `TotalSymbolCountBits = 10`.
+// All arguments and the result are passed through unchanged.
+size_t block_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{ return block_rANS32x64_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity); }
diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp
new file mode 100644
index 0000000..cc3767d
--- /dev/null
+++ b/src/block_rANS32x64_16w_encode.cpp
@@ -0,0 +1,390 @@
+#include "block_rANS32x64_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t StateCount = 64; // Needs to be a power of two.
+constexpr bool EncodeNoBranch = false; // When true, `encode_section` uses its branch-free emit path.
+constexpr size_t SafeHistBitMax = 0; // `IsSafeHist` is `TotalSymbolCountBits >= SafeHistBitMax`, so 0 enables it for every bit count.
+
+constexpr size_t MinMinBlockSizeBits = 15; // Block size (as a power of two) assumed by `block_rANS32x64_16w_capacity` when counting blocks.
+constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits;
+
+template <size_t TotalSymbolCountBits>
+struct HistReplaceMul // Per-bit-count tuning factor; `_CanExtendHist` computes its threshold as `(totalSymbolCount * GetValue()) >> 12`.
+{
+  constexpr static size_t GetValue();
+};
+
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } };
+
+template <size_t TotalSymbolCountBits>
+struct MinBlockSizeBits // Per-bit-count block granularity as a power of two; consumed by `MinBlockSize<>()`.
+{
+  constexpr static size_t GetValue();
+};
+
+template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } };
+template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } };
+template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } };
+template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } };
+template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 20; } };
+
+template <uint32_t TotalSymbolCountBits>
+constexpr size_t MinBlockSize() // Block granularity in bytes: 2 to the power of `MinBlockSizeBits<TotalSymbolCountBits>::GetValue()`.
+{
+  return (size_t)1 << MinBlockSizeBits<TotalSymbolCountBits>::GetValue();
+}
+
+size_t block_rANS32x64_16w_capacity(const size_t inputSize) // Output buffer size the encoder demands for `inputSize` input bytes (smaller capacities are rejected).
+{
+  const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t); // Two uint64 header fields, one 256-entry histogram, the payload and the final states.
+  const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1; // Counted with the smallest block size, plus one spare block.
+  const size_t perBlockExtraSize = sizeof(uint64_t) + 256 * sizeof(uint16_t); // Each block stores its size and its histogram.
+
+  return baseSize + blockCount * perBlockExtraSize; // I hope this covers all of our bases.
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+static const uint8_t _Rans32x64_idx2idx[] = // Maps state index `j` to the byte offset inside a `StateCount`-byte input group that the encoder feeds into that state.
+{
+  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
+  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
+  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
+};
+
+static_assert(sizeof(_Rans32x64_idx2idx) == StateCount); // The table must provide exactly one entry per state.
+
+//////////////////////////////////////////////////////////////////////////
+
+struct _rans_encode_state64_t // Working state shared between the encode driver and `encode_section`.
+{
+  uint32_t states[StateCount]; // One rANS state per interleaved stream.
+  hist_t hist; // Histogram used for the block currently being encoded.
+  uint16_t *pEnd, *pStart; // Both point into the compressed output; `pStart` moves towards lower addresses as words are emitted.
+};
+
+enum rans32x64_encoder_type_t // Selects which `rans32x64_16w_encoder` specialization is used.
+{
+  r32x64_et_scalar,
+};
+
+template <rans32x64_encoder_type_t type>
+struct rans32x64_16w_encoder // Primary template: declaration only, the body of `encode_section` comes from a specialization per encoder type.
+{
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+};
+
+template <>
+struct rans32x64_16w_encoder<r32x64_et_scalar> // Scalar implementation of the section encoder.
+{
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex) // Encodes input backwards from `startIndex` down to `targetIndex`, `StateCount` bytes per step.
+  {
+    int64_t targetCmp = targetIndex + StateCount;
+
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = _Rans32x64_idx2idx[j]; // Byte offset within the group that belongs to state `j`.
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount; // A state at or above this value emits its low 16 bits first.
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF); // Stored unconditionally; only kept if the cursor moves below.
+          pState->pStart -= (size_t)write; // Move the write cursor (not the stored word) when a word was emitted.
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits>
+static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256]) // Returns true if the candidate block should keep using `pOldHist`; `symCount` receives the candidate's raw counts.
+{
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+
+  memset(symCount, 0, sizeof(uint32_t) * 256);
+  observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize);
+
+  // Do we include a symbol that hasn't been included before?
+  if constexpr (!IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0)
+        return false;
+  }
+
+  hist_t newHist;
+
+  if constexpr (((size_t)1 << TotalSymbolCountBits) == MinBlockSize<TotalSymbolCountBits>()) // Counts of a block with exactly `totalSymbolCount` bytes are already normalized.
+  {
+    for (size_t j = 0; j < 256; j++)
+      newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+    size_t counter = 0;
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      newHist.cumul[j] = (uint16_t)counter;
+      counter += newHist.symbolCount[j];
+    }
+  }
+  else
+  {
+    normalize_hist(&newHist, symCount, MinBlockSize<TotalSymbolCountBits>(), TotalSymbolCountBits);
+  }
+
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12; // Estimated saving (in bits) a fresh histogram must reach to be preferred.
+
+  // this comparison isn't fair or fast, but should be a good starting point hopefully.
+  float costBefore = 0; // Estimated bits for the candidate block when coded with `pOldHist`.
+  float costAfter = 0; // Estimated bits for the candidate block when coded with its own histogram.
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+
+  const float diff = costBefore - costAfter;
+
+  return (diff < histReplacePoint); // Keep the old histogram while the estimated saving stays below the threshold.
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits, rans32x64_encoder_type_t Impl>
+size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+{ // Encodes `length` bytes back to front in variable-sized blocks, each with its own histogram; returns the bytes written to `pOutData`, or 0 on failure.
+  if (length == 0 || outCapacity < block_rANS32x64_16w_capacity(length)) // Empty input would make `length - 1` below wrap around.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
+
+  _rans_encode_state64_t encodeState;
+  encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t)); // Output is produced from the end of the buffer towards its start.
+  encodeState.pStart = encodeState.pEnd;
+  
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1));
+
+  if (inputBlockTargetIndex > MinBlockSizeX)
+    inputBlockTargetIndex -= MinBlockSizeX;
+
+  size_t blockBackPoint = length; // End (exclusive) of the block that is currently being encoded.
+
+  uint32_t symCount[256];
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+
+  size_t extraCount = 0;
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+      {
+        symCount[j] = 1;
+        extraCount++;
+      }
+    }
+  }
+
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+
+  while (inputBlockTargetIndex > 0) // Grow the block towards the front while the histogram remains good enough.
+  {
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSizeX;
+    else
+      break;
+  }
+
+  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  blockBackPoint = length;
+
+  // Init States.
+  for (size_t i = 0; i < StateCount; i++)
+    encodeState.states[i] = DecodeConsumePoint16;
+
+  int64_t inputIndex = length - 1;
+  inputIndex &= ~(size_t)(StateCount - 1);
+  inputIndex += StateCount;
+
+  for (int64_t j = StateCount - 1; j >= 0; j--) // Trailing, possibly partial group: only positions inside the input are encoded.
+  {
+    const uint8_t index = _Rans32x64_idx2idx[j];
+
+    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    {
+      const uint8_t in = pInData[inputIndex - StateCount + index];
+      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = encodeState.states[stateIndex];
+
+      if (state >= max)
+      {
+        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+        encodeState.pStart--;
+        state >>= 16;
+      }
+
+      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+    }
+  }
+
+  inputIndex -= StateCount;
+
+  while (true)
+  {
+    rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    inputIndex = inputBlockTargetIndex;
+
+    // Write hist.
+    {
+      const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
+
+      encodeState.pStart++;
+      encodeState.pStart -= 256;
+      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+
+      encodeState.pStart--;
+    }
+
+    if (inputIndex == 0)
+      break;
+
+    // Determine new histogram.
+    {
+      inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+
+      if constexpr (IsSafeHist)
+        for (size_t j = 0; j < 256; j++)
+          if (symCount[j] == 0)
+            symCount[j] = 1;
+
+      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+      while (inputBlockTargetIndex > 0)
+      {
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSizeX;
+        else
+          break;
+      }
+
+      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+      blockBackPoint = inputIndex; // The new block ends where the previous one started; set first so the histogram covers only the new block.
+      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    }
+  }
+
+  uint8_t *pWrite = pOutData;
+  size_t outIndex = 0;
+
+  *reinterpret_cast<uint64_t *>(pWrite + outIndex) = (uint64_t)length;
+  outIndex += sizeof(uint64_t);
+
+  // compressed expected length.
+  outIndex += sizeof(uint64_t);
+
+  for (size_t j = 0; j < StateCount; j++)
+  {
+    *reinterpret_cast<uint32_t *>(pWrite + outIndex) = encodeState.states[j];
+    outIndex += sizeof(uint32_t);
+  }
+
+  const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t);
+
+  memmove(pWrite + outIndex, encodeState.pStart + 1, size); // Move the data written at the end of the buffer directly behind the header.
+  outIndex += size;
+
+  *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
+
+  return outIndex;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+size_t block_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<15, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); } // Public entry points: one scalar-encoder instantiation per TotalSymbolCountBits (15 down to 10).
+size_t block_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<14, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<13, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<12, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<11, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t block_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return block_rANS32x64_16w_encode<10, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
diff --git a/src/main.cpp b/src/main.cpp
index 80650d0..9e75bd0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,6 +11,7 @@
 #include "rANS32x16_16w.h"
 #include "rANS32x64_16w.h"
 #include "block_rANS32x32_16w.h"
+#include "block_rANS32x64_16w.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -151,12 +152,19 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8
 
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  
+  { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
@@ -386,12 +394,14 @@ int32_t main(const int32_t argc, char **pArgv)
     pUncompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize);
     pDecompressedData = (uint8_t *)ALIGNED_ALLOC(64, fileSize);
 
-    compressedDataCapacity = rANS32x64_16w_capacity(fileSize);
+    compressedDataCapacity = 0;
+    compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x64_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x16_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize));
 
     pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity);
 

From bea25968893bcecd674be172682b2dfccda9f136 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Thu, 6 Jul 2023 02:09:48 +0200
Subject: [PATCH 16/34] fine tuning stuff

---
 src/block_rANS32x64_16w_decode.cpp |  4 ++--
 src/block_rANS32x64_16w_encode.cpp | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp
index 1e7b4c8..c47f81a 100644
--- a/src/block_rANS32x64_16w_decode.cpp
+++ b/src/block_rANS32x64_16w_decode.cpp
@@ -1808,14 +1808,14 @@ static size_t block_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_
 {
   _DetectCPUFeatures();
 
-  if (avx512FSupported && avx512BWSupported && avx512DQSupported && (_CpuVendor != cpu_vendor_AMD || _CpuFamily != cpu_family_amd_zen3_zen4))
+  if (avx512FSupported && avx512BWSupported && (_CpuVendor != cpu_vendor_AMD || _CpuFamily != cpu_family_amd_zen3_zen4))
   {
     if constexpr (TotalSymbolCountBits >= 13)
       return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx512_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
     else
       return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx512_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
   }
-  if (avx2Supported)
+  else if (avx2Supported)
   {
     if constexpr (TotalSymbolCountBits >= 13)
       return block_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp
index cc3767d..7b77a81 100644
--- a/src/block_rANS32x64_16w_encode.cpp
+++ b/src/block_rANS32x64_16w_encode.cpp
@@ -19,9 +19,9 @@ struct HistReplaceMul
   constexpr static size_t GetValue();
 };
 
-template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 822; } };
-template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 2087; } };
-template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 3120; } };
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 850; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 1500; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 2500; } };
 template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 5600; } };
 template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 7730; } };
 template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 4000; } };
@@ -32,10 +32,10 @@ struct MinBlockSizeBits
   constexpr static size_t GetValue();
 };
 
-template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 18; } };
+template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 16; } };
 template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 17; } };
 template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 17; } };
-template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 15; } };
+template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 16; } };
 template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 19; } };
 template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 20; } };
 

From d5bde3141085cf6e2a668af0c2b7a86f009b4571 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Thu, 6 Jul 2023 06:38:49 +0200
Subject: [PATCH 17/34] 32x32 multithreading

---
 src/hist.cpp                    | 114 ++--
 src/iacaMarks.h                 |  53 ++
 src/main.cpp                    |  44 +-
 src/mt_rANS32x32_16w.h          |  30 +
 src/mt_rANS32x32_16w_decode.cpp | 984 ++++++++++++++++++++++++++++++++
 src/mt_rANS32x32_16w_encode.cpp | 387 +++++++++++++
 src/thread_pool.cpp             | 164 ++++++
 src/thread_pool.h               |  19 +
 8 files changed, 1743 insertions(+), 52 deletions(-)
 create mode 100644 src/iacaMarks.h
 create mode 100644 src/mt_rANS32x32_16w.h
 create mode 100644 src/mt_rANS32x32_16w_decode.cpp
 create mode 100644 src/mt_rANS32x32_16w_encode.cpp
 create mode 100644 src/thread_pool.cpp
 create mode 100644 src/thread_pool.h

diff --git a/src/hist.cpp b/src/hist.cpp
index 1c2179a..2b8e816 100644
--- a/src/hist.cpp
+++ b/src/hist.cpp
@@ -1,6 +1,7 @@
 #include "hist.h"
 
 #include <string.h>
+#include <algorithm>
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -101,65 +102,98 @@ void normalize_hist(hist_t *pHist, const uint32_t hist[256], const size_t dataBy
 
   if (cappedSum != totalSymbolCount)
   {
-    while (cappedSum > totalSymbolCount) // Start stealing.
-    {
-      size_t target = 2;
+    uint8_t sortedIdx[256];
+
+    for (size_t i = 0; i < 256; i++)
+      sortedIdx[i] = (uint8_t)i;
 
-      while (true)
+    struct _internal
+    {
+      static void heapify(uint8_t *pIdx, const uint16_t *pVal, const int64_t n, const int64_t i)
       {
-        size_t found = totalSymbolCount + 1;
+        const int64_t left = 2 * i + 1;
+        const int64_t right = 2 * i + 2;
+        int64_t largest = i;
 
-        for (size_t i = 0; i < 256; i++)
-          if (capped[i] > target && capped[i] < found)
-            found = capped[i];
+        if (left < n && pVal[pIdx[left]] > pVal[pIdx[largest]])
+          largest = left;
 
-        if (found == totalSymbolCount + 1)
-          break;
+        if (right < n && pVal[pIdx[right]] > pVal[pIdx[largest]])
+          largest = right;
 
-        for (size_t i = 0; i < 256; i++)
+        if (largest != i)
         {
-          if (capped[i] == found)
-          {
-            capped[i]--;
-            cappedSum--;
-
-            if (cappedSum == totalSymbolCount)
-              goto hist_ready;
-          }
+          std::swap(pIdx[i], pIdx[largest]);
+          heapify(pIdx, pVal, n, largest);
         }
+      }
+
+      static void heapSort(uint8_t *pIdx, const uint16_t *pVal, const size_t length)
+      {
+        for (int64_t i = (int64_t)length / 2 - 1; i >= 0; i--)
+          heapify(pIdx, pVal, length, i);
 
-        target = found + 1;
+        for (int64_t i = length - 1; i >= 0; i--)
+        {
+          std::swap(pIdx[0], pIdx[i]);
+          heapify(pIdx, pVal, i, 0);
+        }
       }
-    }
+    };
 
-    while (cappedSum < totalSymbolCount) // Start a charity.
+    _internal::heapSort(sortedIdx, capped, 256);
+    size_t minTwo = 0;
+
+    for (size_t i = 0; i < 256; i++)
     {
-      size_t target = totalSymbolCount + 1;
+      if (capped[sortedIdx[i]] >= 2)
+      {
+        minTwo = i;
+        break;
+      }
+    }
 
-      while (true)
+    while (cappedSum > totalSymbolCount) // Start stealing.
+    {
+      for (size_t i = minTwo; i < 256; i++)
       {
-        size_t found = 1;
+        capped[sortedIdx[i]]--;
+        cappedSum--;
 
-        for (size_t i = 0; i < 256; i++)
-          if (capped[i] < target && capped[i] > found)
-            found = capped[i];
+        if (cappedSum == totalSymbolCount)
+          goto hist_ready;
+      }
 
-        if (found == 1)
+      // Re-Adjust `minTwo`.
+      for (size_t i = minTwo; i < 256; i++)
+      {
+        if (capped[sortedIdx[i]] >= 2)
+        {
+          minTwo = i;
           break;
+        }
+      }
+    }
+
+    while (cappedSum < totalSymbolCount) // Start a charity.
+    {
+      for (int64_t i = 255; i >= (int64_t)minTwo; i--)
+      {
+        capped[sortedIdx[i]]++;
+        cappedSum++;
+
+        if (cappedSum == totalSymbolCount)
+          goto hist_ready;
+      }
 
-        for (size_t i = 0; i < 256; i++)
+      // Re-Adjust `minTwo`.
+      for (size_t i = minTwo; i < 256; i++)
+      {
+        if (capped[sortedIdx[i]] >= 2)
         {
-          if (capped[i] == found)
-          {
-            capped[i]++;
-            cappedSum++;
-
-            if (cappedSum == totalSymbolCount)
-              goto hist_ready;
-          }
+          minTwo = i;
+          break;
         }
-
-        target = found - 1;
       }
     }
   }
diff --git a/src/iacaMarks.h b/src/iacaMarks.h
new file mode 100644
index 0000000..be1973e
--- /dev/null
+++ b/src/iacaMarks.h
@@ -0,0 +1,53 @@
+/*
+* Copyright (2008-2009) Intel Corporation All Rights Reserved. 
+* The source code contained or described herein and all documents 
+* related to the source code ("Material") are owned by Intel Corporation 
+* or its suppliers or licensors. Title to the Material remains with 
+* Intel Corporation or its suppliers and licensors. The Material 
+* contains trade secrets and proprietary and confidential information 
+* of Intel or its suppliers and licensors. The Material is protected 
+* by worldwide copyright and trade secret laws and treaty provisions. 
+* No part of the Material may be used, copied, reproduced, modified, 
+* published, uploaded, posted, transmitted, distributed, or disclosed 
+* in any way without Intel(R)s prior express written permission.
+*
+* No license under any patent, copyright, trade secret or other 
+* intellectual property right is granted to or conferred upon you by 
+* disclosure or delivery of the Materials, either expressly, by implication,
+* inducement, estoppel or otherwise. Any license under such intellectual 
+* property rights must be express and approved by Intel in writing.
+*/
+
+#if defined (__GNUC__) 
+#define IACA_SSC_MARK( MARK_ID )						\
+__asm__ __volatile__ (									\
+					  "\n\t  movl $"#MARK_ID", %%ebx"	\
+					  "\n\t  .byte 0x64, 0x67, 0x90"	\
+					  : : : "memory" );
+
+#else
+#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
+	__asm  _emit 0x64 \
+	__asm  _emit 0x67 \
+	__asm  _emit 0x90 }
+#endif
+
+#define IACA_START {IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222)}
+
+#ifdef _WIN64
+#include <intrin.h>
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END   __writegsbyte(222, 222);
+#endif
+
+/**************** asm *****************
+;START_MARKER
+mov ebx, 111
+db 0x64, 0x67, 0x90
+
+;END_MARKER
+mov ebx, 222
+db 0x64, 0x67, 0x90
+
+**************************************/
diff --git a/src/main.cpp b/src/main.cpp
index 9e75bd0..9cf1652 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -12,6 +12,7 @@
 #include "rANS32x64_16w.h"
 #include "block_rANS32x32_16w.h"
 #include "block_rANS32x64_16w.h"
+#include "mt_rANS32x32_16w.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -120,6 +121,8 @@ uint8_t *pDecompressedData = nullptr;
 
 size_t compressedLength = 0;
 
+thread_pool *_pGlobalThreadPool = nullptr;
+
 //////////////////////////////////////////////////////////////////////////
 
 template <typename func_t>
@@ -150,21 +153,37 @@ size_t encode_no_hist_wrapper(const uint8_t *pInData, const size_t length, uint8
   return func(pInData, length, pOutData, outCapacity);
 }
 
+template <size_t(*func)(const uint8_t *, const size_t, uint8_t *, const size_t, thread_pool *)>
+size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (!_pGlobalThreadPool) // First use: create the shared pool with `thread_pool_max_threads() - 1` workers, but never fewer than one.
+    _pGlobalThreadPool = thread_pool_new(static_cast<size_t>(rans_max(static_cast<int64_t>(1), static_cast<int64_t>(thread_pool_max_threads()) - 1)));
+  // Adapts a pool-taking decoder to the plain four argument signature; the pool stays in `_pGlobalThreadPool` and is reused by later calls.
+  return func(pInData, inLength, pOutData, outCapacity, _pGlobalThreadPool);
+}
+
 static codec_info_t _Codecs[] =
 {
-  //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  // { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  
+  // { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
+  // { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},
+  // { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}},
+  // { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}},
+  // { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}},
+  // { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}},
   
-  { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
-  { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},
-  { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}},
-  { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}},
-  { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}},
-  { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_15>, true }, {}}}, // label names the codec family these rows benchmark (mt_rANS32x32_16w_*)
+  { "rANS32x32 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_14>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_13>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_12>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_11>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_10>, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
@@ -402,6 +421,7 @@ int32_t main(const int32_t argc, char **pArgv)
     compressedDataCapacity = rans_max(compressedDataCapacity, rANS32x32_32blk_8w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x32_16w_capacity(fileSize));
 
     pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity);
 
diff --git a/src/mt_rANS32x32_16w.h b/src/mt_rANS32x32_16w.h
new file mode 100644
index 0000000..06a5a50
--- /dev/null
+++ b/src/mt_rANS32x32_16w.h
@@ -0,0 +1,30 @@
+#ifndef mt_rANS32x32_16w_h__
+#define mt_rANS32x32_16w_h__
+
+#include "hist.h"
+#include "thread_pool.h"
+
+size_t mt_rANS32x32_16w_capacity(const size_t inputSize); // Size query for an input of `inputSize` bytes; main.cpp folds the result into the capacity of the compressed buffer it allocates.
+// Encoders. NOTE(review): presumably read `length` bytes at `pInData` and return the number of bytes written to `pOutData`; the suffix (15..10) presumably is the histogram precision in bits - confirm in the encoder source.
+size_t mt_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+// Decoders without a thread pool argument; main.cpp lists them as "decode (single thread)".
+size_t mt_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+// Decoders that additionally take the `thread_pool` to use; main.cpp lists them as "decode (multi threaded)" and passes one shared pool.
+size_t mt_rANS32x32_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x32_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x32_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x32_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x32_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x32_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+
+#endif // mt_rANS32x32_16w_h__
diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp
new file mode 100644
index 0000000..b62beae
--- /dev/null
+++ b/src/mt_rANS32x32_16w_decode.cpp
@@ -0,0 +1,984 @@
+#include "mt_rANS32x32_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t StateCount = 32; // Needs to be a power of two.
+constexpr bool DecodeNoBranch = false; // In the scalar decoder: true refills a state with a branch-free select, false with a conditional branch.
+
+//////////////////////////////////////////////////////////////////////////
+// Byte offset inside each group of `StateCount` output bytes for every state index; it reproduces the lane order the AVX2 decoders obtain from their `packus` sequence.
+static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
+static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+// Shuffle tables declared here and defined elsewhere; the AVX2 decoders index `_ShuffleLutShfl32` at `mask * 8` and `_DoubleShuffleLutShfl32` at `mask * 16`, with `mask` being the 8 bit refill mask of one register.
+extern const uint8_t _ShuffleLutShfl32[256 * 8];
+extern const uint8_t _ShuffleLutPerm32[256 * 8];
+extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
+
+//////////////////////////////////////////////////////////////////////////
+
+template <typename hist_type>
+struct _rans_decode_state32mt_t
+{
+  // Decoder context that is passed to the `decode_section` implementations.
+  // `states` holds one 32 bit state per interleaved lane. It is 32 byte aligned
+  // because the AVX2 decoders transfer it with aligned 256 bit loads and stores.
+  alignas(32) uint32_t states[StateCount];
+
+  // Decode tables; their layout is whatever `hist_type` provides.
+  hist_type hist;
+  // Next unread 16 bit word of the input; the decoders advance it whenever they refill a state.
+  const uint16_t *pReadHead;
+};
+
+enum rans32x32_decoder_type_t // First template argument of `rans32x32_16w_decoder`; selects which specialization decodes a section.
+{
+  r32x32_dt_scalar, // scalar specialization (no SIMD intrinsics)
+  r32x32_dt_avx2_large_cache_15_to_13, // NOTE(review): presumably AVX2 with the 16 byte per entry shuffle table, for 13..15 bit histograms - confirm at the specialization
+  r32x32_dt_avx2_small_cache_15_to_13, // NOTE(review): presumably AVX2 with the 8 byte per entry shuffle table, for 13..15 bit histograms - confirm at the specialization
+  r32x32_dt_avx2_large_cache_12_to_10, // NOTE(review): presumably as `large_cache` above, for 10..12 bit histograms - confirm at the specialization
+  r32x32_dt_avx2_small_cache_12_to_10, // NOTE(review): presumably as `small_cache` above, for 10..12 bit histograms - confirm at the specialization
+};
+
+template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
+struct rans32x32_16w_decoder // Primary template: it only declares the interface, `decode_section` has no body here and is supplied by specializations.
+{
+  static size_t decode_section(_rans_decode_state32mt_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); // `pState` carries states, tables and read head; the scalar specialization decodes from `startIndex` up to `endIndex` and returns the index where it stopped.
+};
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
+{
+  // Scalar decoder for the output indices starting at `startIndex`, working in groups of `StateCount` bytes.
+  // Every group is written completely (even if it reaches past `endIndex`); returns the first index that was not decoded.
+  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr uint32_t SlotMask = ((uint32_t)1 << TotalSymbolCountBits) - 1;
+    const auto &hist = pState->hist;
+    size_t position = startIndex;
+
+    while (position < endIndex)
+    {
+      uint8_t *pGroup = pOutData + position;
+      for (size_t lane = 0; lane < StateCount; lane++)
+      {
+        // The low `TotalSymbolCountBits` bits of the state are the slot, which identifies the symbol.
+        const uint32_t previous = pState->states[lane];
+        const uint32_t slot = previous & SlotMask;
+        const uint8_t symbol = hist.cumulInv[slot];
+        // Lanes are scattered inside the group via `_Rans32x32_idx2idx`.
+        pGroup[_Rans32x32_idx2idx[lane]] = symbol;
+        uint32_t next = (previous >> TotalSymbolCountBits) * (uint32_t)hist.symbolCount[symbol] + slot - (uint32_t)hist.cumul[symbol];
+
+        // Refill: append one 16 bit input word whenever the state is below `DecodeConsumePoint16`.
+        if constexpr (DecodeNoBranch)
+        {
+          const bool refill = next < DecodeConsumePoint16;
+          const uint32_t refilled = next << 16 | *pState->pReadHead;
+          next = refill ? refilled : next;
+          pState->pReadHead += (size_t)refill;
+        }
+        else if (next < DecodeConsumePoint16)
+        {
+          next = next << 16 | *pState->pReadHead++;
+        }
+
+        pState->states[lane] = next;
+      }
+      position += StateCount;
+    }
+
+    return position;
+  }
+};
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false> // AVX2 section decoder: emits 32 bytes per iteration from `startIndex` until `endIndex` is reached and returns the index where it stopped.
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _mt_rans32x32_decode_section_avx2_varA(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32) // entry instantiation only: decide once whether the aligned streaming-store variant can be used.
+    if ((reinterpret_cast<size_t>(pOutData + startIndex) & (StateCount - 1)) == 0) // test the first address that is stored to; all later ones differ by multiples of StateCount (32 bytes).
+      return _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) // working copy of the states, 8 per register (aligned load, `states` is 32 byte aligned).
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); // only used by the 8 byte LUT path (`!ShuffleMask16`).
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); // only used by the 8 byte LUT path (`!ShuffleMask16`).
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // const uint8_t symbol = pHist->cumulInv[slot]; (gathered as 32 bit values at byte offset `slot`)
+    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
+    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
+    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
+    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
+
+    // since they were int32_t turn into uint8_t
+    symbol0 = _mm256_and_si256(symbol0, lower8);
+    symbol1 = _mm256_and_si256(symbol1, lower8);
+    symbol2 = _mm256_and_si256(symbol2, lower8);
+    symbol3 = _mm256_and_si256(symbol3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here: the packed lane order above is stored as is.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123); // non-temporal store, needs a 32 byte aligned address.
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // retrieve pack: one 32 bit entry of `hist.symbols` per decoded symbol.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
+
+    // freq, cumul: freq is the lower, cumul the upper 16 bit of each pack.
+    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
+    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
+    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
+    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for block 0. NOTE(review): every refill load fetches 16 bytes at `pReadHead` even if fewer words are consumed, so the input presumably needs readable slack at its end - confirm at the caller.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16) // 16 byte LUT entries that are used as byte shuffles directly.
+      {
+        // get masks of those compares & start loading shuffle masks. NOTE(review): `_mm_load_si128` assumes `_DoubleShuffleLutShfl32` is 16 byte aligned at its definition - confirm.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3 (each block starts where the previous one stopped consuming).
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16 (shift count is 16 for lanes that refill and 0 for all others)
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else // 8 byte LUT entries that are widened to byte shuffles below.
+      {
+        // get masks of those compares & start loading shuffle masks. Exactly 8 bytes are loaded per entry: a 16 byte load of the last entry (mask 255) would read past the `256 * 8` byte table, and only the lower 8 bytes are used.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3 (each block starts where the previous one stopped consuming).
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups: copy every LUT byte into both bytes of its 16 bit lane and set bit 0 of the upper copy.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (shift count is 16 for lanes that refill and 0 for all others)
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) // write the working states back to `pState`.
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _mt_rans32x32_decode_section_avx2_varC(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
+      return _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  static_assert(TotalSymbolCountBits <= 12);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot3, sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
+    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
+    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
+    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0, 1.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+// Decoder back-end selected by `r32x32_dt_avx2_large_cache_15_to_13`; operates on `hist_dec2_t` histograms.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // Hands `startIndex` / `endIndex` to the AVX2 `varA` kernel unchanged and returns the index the kernel reports.
+  // `WriteAligned` is accepted by this function but is not forwarded to the kernel.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    // NOTE(review): second kernel template argument; presumably selects the larger shuffle lookup table for the "large cache" variant - confirm against the kernel's parameter list.
+    constexpr bool KernelFlag = true;
+
+    const size_t reachedIndex = _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, KernelFlag>(pState, pOutData, startIndex, endIndex);
+
+    return reachedIndex;
+  }
+};
+
+// Decoder back-end selected by `r32x32_dt_avx2_small_cache_15_to_13`; operates on `hist_dec2_t` histograms.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // Hands `startIndex` / `endIndex` to the AVX2 `varA` kernel unchanged and returns the index the kernel reports.
+  // `WriteAligned` is accepted by this function but is not forwarded to the kernel.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    // NOTE(review): second kernel template argument; presumably selects the smaller shuffle lookup table for the "small cache" variant - confirm against the kernel's parameter list.
+    constexpr bool KernelFlag = false;
+
+    const size_t reachedIndex = _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, KernelFlag>(pState, pOutData, startIndex, endIndex);
+
+    return reachedIndex;
+  }
+};
+
+// Decoder back-end selected by `r32x32_dt_avx2_large_cache_12_to_10`; operates on `hist_dec_pack_t` histograms.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // Hands `startIndex` / `endIndex` to the AVX2 `varC` kernel unchanged and returns the index the kernel reports.
+  // `WriteAligned` is accepted by this function but is not forwarded to the kernel.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    // NOTE(review): second kernel template argument; presumably the kernel's `ShuffleMask16` switch (16 byte shuffle LUT entries) - confirm against the kernel's parameter list.
+    constexpr bool KernelFlag = true;
+
+    const size_t reachedIndex = _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, KernelFlag>(pState, pOutData, startIndex, endIndex);
+
+    return reachedIndex;
+  }
+};
+
+// Decoder back-end selected by `r32x32_dt_avx2_small_cache_12_to_10`; operates on `hist_dec_pack_t` histograms.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // Hands `startIndex` / `endIndex` to the AVX2 `varC` kernel unchanged and returns the index the kernel reports.
+  // `WriteAligned` is accepted by this function but is not forwarded to the kernel.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    // NOTE(review): second kernel template argument; presumably the kernel's `ShuffleMask16` switch (here: 8 byte LUT entries expanded at runtime) - confirm against the kernel's parameter list.
+    constexpr bool KernelFlag = false;
+
+    const size_t reachedIndex = _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, KernelFlag>(pState, pOutData, startIndex, endIndex);
+
+    return reachedIndex;
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Builds a `hist_dec_t` from the symbol counts stored in `pIncompleteHist`.
+// The counts are copied verbatim and `inplace_make_hist_dec` derives the remaining decode data; its result is returned.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  // Only the overloads that call `inplace_complete_hist` need the runtime bit count.
+  (void)totalSymbolCountBits;
+
+  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
+
+  const bool succeeded = inplace_make_hist_dec(pDecHist);
+
+  return succeeded;
+}
+
+// Builds a `hist_dec2_t` from `pIncompleteHist`.
+// `pIncompleteHist` is completed in place first; if that fails nothing is written to `pDecHist` and `false` is returned.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool completed = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (completed)
+    make_dec2_hist(pDecHist, pIncompleteHist);
+
+  return completed;
+}
+
+// Builds a `hist_dec_pack_t` from `pIncompleteHist`.
+// `pIncompleteHist` is completed in place first; if that fails nothing is written to `pDecHist` and `false` is returned.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool completed = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (completed)
+    make_dec_pack_hist(pDecHist, pIncompleteHist);
+
+  return completed;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decodes a complete multi-block stream on the calling thread.
+//
+// Stream layout as consumed by this function:
+//   uint64_t expected output length, uint64_t expected input length, then for every block:
+//   uint64_t block size, uint64_t read head offset (ignored here), `StateCount` uint32_t states, 256 uint16_t symbol counts,
+//   followed by the 16 bit words consumed by the section decoder.
+//
+// pInData / inLength:     encoded stream and its size in bytes.
+// pOutData / outCapacity: destination buffer and its size in bytes.
+//
+// Returns the expected output length on success and 0 if the stream is too short, does not fit `outCapacity`,
+// carries an invalid histogram or a block boundary that is not a multiple of `StateCount`.
+template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
+size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  // Bytes occupied by one block header (block size, read head offset, states, symbol counts).
+  constexpr size_t BlockHeaderSize = sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256;
+
+  if (inLength < BlockHeaderSize)
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  const uint8_t *const pInEnd = pInData + inLength;
+
+  // The input buffer carries no alignment guarantee, so multi-byte header fields are read with `memcpy`.
+  size_t inputIndex = 0;
+  uint64_t expectedOutputLength;
+  memcpy(&expectedOutputLength, pInData + inputIndex, sizeof(expectedOutputLength));
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  uint64_t expectedInputLength;
+  memcpy(&expectedInputLength, pInData + inputIndex, sizeof(expectedInputLength));
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  _rans_decode_state32mt_t<hist_type> decodeState;
+
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+
+  // Last index (exclusive) up to which full groups of `StateCount` symbols can be decoded.
+  // Clamped to 0 for outputs shorter than `StateCount - 1`: the unsigned subtraction would otherwise wrap around
+  // and let the section decoder write far beyond `outCapacity`.
+  const size_t outLengthInStates = (expectedOutputLength >= StateCount - 1) ? (size_t)(expectedOutputLength - StateCount + 1) : 0;
+  size_t i = 0;
+  hist_t hist;
+
+  do
+  {
+    // Refuse to read a block header that is not fully contained in the input.
+    const uint8_t *pBlockHeader = reinterpret_cast<const uint8_t *>(decodeState.pReadHead);
+
+    if (pBlockHeader > pInEnd || (size_t)(pInEnd - pBlockHeader) < BlockHeaderSize)
+      return 0;
+
+    uint64_t blockSize;
+    memcpy(&blockSize, decodeState.pReadHead, sizeof(blockSize));
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    // The read head offset is only needed by the multi-threaded decoder; skip it.
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      uint32_t state;
+      memcpy(&state, decodeState.pReadHead, sizeof(state));
+      decodeState.states[j] = state;
+      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+    }
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return 0;
+
+    // `i <= outLengthInStates` holds here (first iteration: `i == 0`, later iterations: loop condition),
+    // so comparing against the remaining length cannot overflow, unlike `i + blockSize`.
+    const uint64_t remainingInStates = outLengthInStates - i;
+    uint64_t blockEndInStates;
+
+    if (blockSize > remainingInStates)
+    {
+      blockEndInStates = outLengthInStates;
+    }
+    else
+    {
+      blockEndInStates = i + blockSize;
+
+      // Blocks that do not reach the end of the stream have to end on a state group boundary.
+      if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
+    }
+
+    i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+    if (i > outLengthInStates)
+    {
+      if (i >= expectedOutputLength)
+        return expectedOutputLength;
+      else
+        break;
+    }
+
+  } while (i < outLengthInStates);
+
+  // Decode the remaining (fewer than `StateCount`) symbols one state at a time using the histogram of the last block.
+  if (i < expectedOutputLength)
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return 0;
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      // State `j` is responsible for the byte at offset `_Rans32x32_idx2idx[j]` inside the group.
+      const uint8_t index = _Rans32x32_idx2idx[j];
+
+      if (i + index < expectedOutputLength)
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *decodeState.pReadHead;
+          state = read ? newState : state;
+          decodeState.pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *decodeState.pReadHead;
+            decodeState.pReadHead++;
+          }
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decodes a complete multi-block stream, handing every block except the last one to `pThreadPool`.
+//
+// Stream layout as consumed by this function:
+//   uint64_t expected output length, uint64_t expected input length, then for every block:
+//   uint64_t block size, uint64_t read head offset, `StateCount` uint32_t states, 256 uint16_t symbol counts,
+//   followed by the 16 bit words consumed by the section decoder. The header of the next block is located at
+//   `read head offset + 1` 16 bit words after the position directly behind the offset field.
+//
+// pInData / inLength:     encoded stream and its size in bytes.
+// pOutData / outCapacity: destination buffer and its size in bytes.
+// pThreadPool:            pool that receives one task per non-final block; awaited before this function returns
+//                         once the block loop has been entered.
+//
+// Returns the expected output length on success and 0 if the stream is too short, does not fit `outCapacity`,
+// carries an invalid histogram, an out-of-range read head offset or a block boundary that is not a multiple of `StateCount`.
+template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
+size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  // Bytes occupied by one block header (block size, read head offset, states, symbol counts).
+  constexpr size_t BlockHeaderSize = sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256;
+
+  if (inLength < BlockHeaderSize)
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  const uint8_t *const pInEnd = pInData + inLength;
+
+  // The input buffer carries no alignment guarantee, so multi-byte header fields are read with `memcpy`.
+  size_t inputIndex = 0;
+  uint64_t expectedOutputLength;
+  memcpy(&expectedOutputLength, pInData + inputIndex, sizeof(expectedOutputLength));
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  uint64_t expectedInputLength;
+  memcpy(&expectedInputLength, pInData + inputIndex, sizeof(expectedInputLength));
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  // Queued tasks keep writing to `pOutData`; every failure path after the first `thread_pool_add` therefore
+  // has to wait for them before handing control (and ownership of the buffer) back to the caller.
+  const auto fail = [pThreadPool]() -> size_t
+  {
+    thread_pool_await(pThreadPool);
+    return 0;
+  };
+
+  _rans_decode_state32mt_t<hist_type> decodeState;
+
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+
+  // Last index (exclusive) up to which full groups of `StateCount` symbols can be decoded.
+  // Clamped to 0 for outputs shorter than `StateCount - 1`: the unsigned subtraction would otherwise wrap around
+  // and let the section decoder write far beyond `outCapacity`.
+  const size_t outLengthInStates = (expectedOutputLength >= StateCount - 1) ? (size_t)(expectedOutputLength - StateCount + 1) : 0;
+  size_t i = 0;
+  hist_t hist;
+
+  do
+  {
+    // Refuse to read a block header that is not fully contained in the input.
+    const uint8_t *pBlockHeader = reinterpret_cast<const uint8_t *>(decodeState.pReadHead);
+
+    if (pBlockHeader > pInEnd || (size_t)(pInEnd - pBlockHeader) < BlockHeaderSize)
+      return fail();
+
+    uint64_t blockSize;
+    memcpy(&blockSize, decodeState.pReadHead, sizeof(blockSize));
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    uint64_t readHeadBackOffset;
+    memcpy(&readHeadBackOffset, decodeState.pReadHead, sizeof(readHeadBackOffset));
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    // Base position of `readHeadBackOffset` and the number of 16 bit words left behind it.
+    const uint16_t *pOffsetBase = decodeState.pReadHead;
+    const size_t wordsAfterOffsetBase = (size_t)(pInEnd - reinterpret_cast<const uint8_t *>(pOffsetBase)) / sizeof(uint16_t);
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      uint32_t state;
+      memcpy(&state, decodeState.pReadHead, sizeof(state));
+      decodeState.states[j] = state;
+      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+    }
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return fail();
+
+    // `i <= outLengthInStates` holds here (first iteration: `i == 0`, later iterations: loop condition),
+    // so comparing against the remaining length cannot overflow, unlike `i + blockSize`.
+    const uint64_t remainingInStates = outLengthInStates - i;
+    uint64_t blockEndInStates;
+
+    if (blockSize > remainingInStates)
+    {
+      blockEndInStates = outLengthInStates;
+    }
+    else
+    {
+      blockEndInStates = i + blockSize;
+
+      // Blocks that do not reach the end of the stream have to end on a state group boundary.
+      if ((blockEndInStates & (StateCount - 1)) != 0)
+        return fail();
+    }
+
+    if (blockEndInStates >= outLengthInStates)
+    {
+      // Final block: decode on this thread, because the tail loop below continues from the states and read head
+      // this call leaves behind. This includes a block that ends exactly on `outLengthInStates`; sending it to the
+      // pool would leave the tail loop with the undecoded states from the block header.
+      i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+      break;
+    }
+    else
+    {
+      // The next block header has to be inside the input.
+      if (readHeadBackOffset >= wordsAfterOffsetBase)
+        return fail();
+
+      const uint16_t *pReadHeadAfter = pOffsetBase + readHeadBackOffset + 1;
+
+      // The task owns a copy of the decode state (states, histogram, read head) of this block.
+      thread_pool_add(pThreadPool, [=]() {
+        auto decState = decodeState;
+        rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
+      });
+
+      i = blockEndInStates;
+      decodeState.pReadHead = pReadHeadAfter;
+    }
+
+  } while (i < outLengthInStates);
+
+  // Decode the remaining (fewer than `StateCount`) symbols one state at a time using the histogram of the last block.
+  if (i < expectedOutputLength)
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return fail();
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      // State `j` is responsible for the byte at offset `_Rans32x32_idx2idx[j]` inside the group.
+      const uint8_t index = _Rans32x32_idx2idx[j];
+
+      if (i + index < expectedOutputLength)
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *decodeState.pReadHead;
+          state = read ? newState : state;
+          decodeState.pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16)
+          {
+            state = state << 16 | *decodeState.pReadHead;
+            decodeState.pReadHead++;
+          }
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  thread_pool_await(pThreadPool);
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Selects the decoder implementation for `TotalSymbolCountBits` and runs it.
+//
+// Without AVX2 support the scalar implementation with `hist_dec_t` is used. With AVX2 support, 13 or more bits use
+// the `15_to_13` implementation with `hist_dec2_t`, fewer bits the `12_to_10` implementation with `hist_dec_pack_t`.
+// A non-null `pThreadPool` selects the thread pool based entry point, otherwise everything runs on the calling thread.
+template <uint32_t TotalSymbolCountBits>
+static size_t mt_rANS32x32_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool = nullptr)
+{
+  _DetectCPUFeatures();
+
+  if (!avx2Supported)
+  {
+    // Fallback.
+    using scalar_hist_t = hist_dec_t<TotalSymbolCountBits>;
+
+    return pThreadPool
+      ? mt_rANS32x32_16w_decode_mt<TotalSymbolCountBits, r32x32_dt_scalar, scalar_hist_t>(pInData, inLength, pOutData, outCapacity, pThreadPool)
+      : mt_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_scalar, scalar_hist_t>(pInData, inLength, pOutData, outCapacity);
+  }
+
+  if constexpr (TotalSymbolCountBits >= 13)
+  {
+    using avx2_hist_t = hist_dec2_t<TotalSymbolCountBits>;
+
+    return pThreadPool
+      ? mt_rANS32x32_16w_decode_mt<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_15_to_13, avx2_hist_t>(pInData, inLength, pOutData, outCapacity, pThreadPool)
+      : mt_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_15_to_13, avx2_hist_t>(pInData, inLength, pOutData, outCapacity);
+  }
+  else
+  {
+    using avx2_hist_t = hist_dec_pack_t<TotalSymbolCountBits>;
+
+    return pThreadPool
+      ? mt_rANS32x32_16w_decode_mt<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_12_to_10, avx2_hist_t>(pInData, inLength, pOutData, outCapacity, pThreadPool)
+      : mt_rANS32x32_16w_decode<TotalSymbolCountBits, r32x32_dt_avx2_large_cache_12_to_10, avx2_hist_t>(pInData, inLength, pOutData, outCapacity);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decodes a stream coded with 15 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 15;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+// Decodes a stream coded with 14 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 14;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+// Decodes a stream coded with 13 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 13;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+// Decodes a stream coded with 12 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 12;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+// Decodes a stream coded with 11 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 11;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+// Decodes a stream coded with 10 total symbol count bits on the calling thread (no thread pool).
+size_t mt_rANS32x32_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  constexpr uint32_t TotalSymbolCountBits = 10;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, nullptr);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decodes a stream coded with 15 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 15;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+// Decodes a stream coded with 14 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 14;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+// Decodes a stream coded with 13 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 13;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+// Decodes a stream coded with 12 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 12;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+// Decodes a stream coded with 11 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 11;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+// Decodes a stream coded with 10 total symbol count bits, using `pThreadPool` when it is non-null.
+size_t mt_rANS32x32_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  constexpr uint32_t TotalSymbolCountBits = 10;
+
+  return mt_rANS32x32_decode_wrapper<TotalSymbolCountBits>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp
new file mode 100644
index 0000000..07401a1
--- /dev/null
+++ b/src/mt_rANS32x32_16w_encode.cpp
@@ -0,0 +1,387 @@
+#include "mt_rANS32x32_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t StateCount = 32; // Number of interleaved rANS states. Needs to be a power of two.
+constexpr bool EncodeNoBranch = false; // `true` selects the branch-free renormalization path of the scalar encoder.
+constexpr size_t SafeHistBitMax = 0; // `_CanExtendHist` treats histograms as "safe" (`IsSafeHist`) when `TotalSymbolCountBits >= SafeHistBitMax`.
+
+constexpr size_t MinMinBlockSizeBits = 15; // log2 of `MinMinBlockSize`.
+constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits; // Block granularity assumed by `mt_rANS32x32_16w_capacity` when counting per-block headers.
+
+// Per-bit-count multiplier consumed by `_CanExtendHist`, which derives its threshold as
+// `((1 << TotalSymbolCountBits) * GetValue()) >> 12`.
+// The primary template only declares `GetValue`; definitions exist for 10 to 15 bits.
+template <size_t TotalSymbolCountBits>
+struct HistReplaceMul
+{
+  constexpr static size_t GetValue();
+};
+
+template <>
+struct HistReplaceMul<15>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+template <>
+struct HistReplaceMul<14>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+template <>
+struct HistReplaceMul<13>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+template <>
+struct HistReplaceMul<12>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+template <>
+struct HistReplaceMul<11>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+template <>
+struct HistReplaceMul<10>
+{
+  constexpr static size_t GetValue() { return 50; }
+};
+
+// Per-bit-count log2 of the minimum block size; `MinBlockSize<Bits>()` turns it into a byte count.
+// The primary template only declares `GetValue`; definitions exist for 10 to 15 bits.
+template <size_t TotalSymbolCountBits>
+struct MinBlockSizeBits
+{
+  constexpr static size_t GetValue();
+};
+
+template <>
+struct MinBlockSizeBits<15>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+template <>
+struct MinBlockSizeBits<14>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+template <>
+struct MinBlockSizeBits<13>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+template <>
+struct MinBlockSizeBits<12>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+template <>
+struct MinBlockSizeBits<11>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+template <>
+struct MinBlockSizeBits<10>
+{
+  constexpr static size_t GetValue() { return 16; }
+};
+
+// Minimum block size in bytes for the given bit count: two to the power of `MinBlockSizeBits<Bits>::GetValue()`.
+template <uint32_t TotalSymbolCountBits>
+constexpr size_t MinBlockSize()
+{
+  constexpr size_t sizeBits = MinBlockSizeBits<TotalSymbolCountBits>::GetValue();
+
+  return (size_t)1 << sizeBits;
+}
+
+// Estimates how many output bytes an encode of `inputSize` bytes may need.
+// The estimate is one byte per input byte plus one header for the stream and one for every block of
+// `MinMinBlockSize` bytes (rounded up, plus one spare). It is meant to be generous, not a proven bound.
+size_t mt_rANS32x32_16w_capacity(const size_t inputSize)
+{
+  // Two 64 bit fields, the 256 entry histogram and the initial states.
+  constexpr size_t HeaderSize = sizeof(uint64_t) * 2 + 256 * sizeof(uint16_t) + StateCount * sizeof(uint32_t);
+
+  const size_t streamSize = HeaderSize + inputSize;
+  const size_t maxBlockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1;
+
+  return streamSize + maxBlockCount * HeaderSize;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; // Entry `j` is the byte offset inside a group of `StateCount` bytes that state `j` encodes.
+static_assert(sizeof(_Rans32x32_idx2idx) == StateCount); // Exactly one entry per state.
+
+//////////////////////////////////////////////////////////////////////////
+
+// Working set of the encoder: the interleaved states, the histogram in use and the output cursors.
+struct _rans_encode_state32mt_t
+{
+  uint32_t states[StateCount]; // one rANS state per lane.
+  hist_t hist;                 // histogram used to encode the current section.
+  uint16_t *pEnd;              // compressed data.
+  uint16_t *pStart;            // compressed data; the scalar encoder stores a word here and then steps backwards.
+};
+
+enum rans32x32_encoder_type_t // Tag used to pick a `rans32x32_16w_encoder` specialization.
+{
+  r32x32_et_scalar, // plain C++ encoder without SIMD intrinsics.
+};
+
+// Encoder interface; each `rans32x32_encoder_type_t` value supplies its own specialization.
+// `encode_section` is only declared here, so an encoder type without a specialization has no usable implementation.
+template <rans32x32_encoder_type_t EncoderType>
+struct rans32x32_16w_encoder
+{
+  // Encodes input bytes between `targetIndex` and `startIndex` into `pState`.
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+};
+
+// Scalar encoder.
+template <>
+struct rans32x32_16w_encoder<r32x32_et_scalar>
+{
+  // Encodes the input backwards in groups of `StateCount` bytes.
+  //
+  // The group index `i` starts at `startIndex` and decreases by `StateCount` while `i >= targetIndex + StateCount`;
+  // each iteration encodes the bytes `pInData[i - StateCount]` to `pInData[i - 1]`.
+  // Emitted 16 bit words are stored at `pState->pStart`, which moves towards lower addresses.
+  // `pState->hist` provides `symbolCount` and `cumul` for every byte value that occurs; a `symbolCount` of 0 for an
+  // occurring byte leads to a division by zero.
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  {
+    int64_t targetCmp = targetIndex + StateCount;
+
+    // A state at or above `EncodeEmitPoint * symbolCount` has to emit its lower 16 bits before encoding the symbol.
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+    {
+      // States are visited in reverse order, mirroring the order in which the decoder consumes words.
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        // State `j` encodes the byte at offset `_Rans32x32_idx2idx[j]` of the group.
+        const uint8_t index = _Rans32x32_idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          // Always store the low word, but only step the write position if the word is actually emitted.
+          // (Previously the stored word itself was decremented instead of the pointer.)
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF);
+          pState->pStart -= (size_t)write;
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        // state = floor(state / freq) * totalSymbolCount + cumul + (state mod freq)
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Decides whether the block at `pData + nextBlockStartOffset` (`nextBlockSize` bytes) can keep using `pOldHist`.
+//
+// pData, nextBlockStartOffset, nextBlockSize: the candidate block.
+// pOldHist: histogram currently in use.
+// symCount: caller provided scratch array; overwritten with the byte counts of the candidate block.
+//
+// Returns `true` if the estimated cost of coding the block with `pOldHist` exceeds the cost of a fresh histogram
+// (including half of a block header) by less than the replace threshold, `false` otherwise.
+// For non-"safe" histograms `false` is also returned as soon as the block contains a symbol that `pOldHist` lacks.
+template <uint32_t TotalSymbolCountBits>
+static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
+{
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+
+  memset(symCount, 0, sizeof(uint32_t) * 256);
+  observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize);
+
+  // Do we include a symbol that hasn't been included before?
+  if constexpr (!IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0)
+        return false;
+  }
+
+  hist_t newHist;
+
+  // If the total symbol count equals the block size, the raw counts already are a normalized histogram.
+  // (This used to compare the bit count against the block size in bytes, which can never match.)
+  if constexpr (((size_t)1 << TotalSymbolCountBits) == MinBlockSize<TotalSymbolCountBits>())
+  {
+    for (size_t j = 0; j < 256; j++)
+      newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+    size_t counter = 0;
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      newHist.cumul[j] = (uint16_t)counter;
+      counter += newHist.symbolCount[j];
+    }
+  }
+  else
+  {
+    normalize_hist(&newHist, symCount, MinBlockSize<TotalSymbolCountBits>(), TotalSymbolCountBits);
+  }
+
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12;
+
+  // this comparison isn't fair or fast, but should be a good starting point hopefully.
+  float costBefore = 0;
+  float costAfter = (float)(sizeof(uint16_t) * 256 + StateCount * sizeof(uint32_t) + sizeof(uint64_t) * 2) * 0.5f; // let's assume that block will be able to share its histogram with someone else.
+
+  for (size_t j = 0; j < 256; j++)
+  {
+    if (symCount[j] == 0)
+      continue;
+
+    // "Safe" histograms weigh the old cost with one occurrence less than was observed.
+    const uint32_t oldWeight = IsSafeHist ? (symCount[j] - 1) : symCount[j];
+
+    // -count * log2(probability): estimated bits spent on symbol `j` under each histogram.
+    const float before = oldWeight * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+    const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+    costBefore -= before;
+    costAfter -= after;
+  }
+
+  const float diff = costBefore - costAfter;
+
+  return (diff < histReplacePoint);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits, rans32x32_encoder_type_t Impl>
+size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+{ // Encodes `length` bytes of `pInData` into `pOutData` as a sequence of blocks, each with its own histogram. Returns the number of bytes written, or 0 on failure (empty input or insufficient `outCapacity`).
+  if (length == 0 || outCapacity < mt_rANS32x32_16w_capacity(length)) // `length == 0` would underflow `length - 1` below and read out of bounds.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
+
+  _rans_encode_state32mt_t encodeState;
+  encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t)); // output is written back-to-front, starting at the last uint16_t of the buffer.
+  encodeState.pStart = encodeState.pEnd;
+  
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1)); // start of the last (possibly partial) MinBlockSizeX-aligned block.
+
+  if (inputBlockTargetIndex > MinBlockSizeX)
+    inputBlockTargetIndex -= MinBlockSizeX;
+
+  uint16_t *pBlockEnd = encodeState.pEnd; // end (in the output) of the block currently being written.
+  size_t blockBackPoint = length; // end (in the input, exclusive) of the block currently being encoded.
+
+  uint32_t symCount[256];
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+
+  size_t extraCount = 0;
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+      {
+        symCount[j] = 1;
+        extraCount++;
+      }
+    }
+  }
+
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+
+  while (inputBlockTargetIndex > 0)
+  {
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSizeX;
+    else
+      break;
+  }
+
+  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  blockBackPoint = length;
+
+  // Init States.
+  for (size_t i = 0; i < StateCount; i++)
+    encodeState.states[i] = DecodeConsumePoint16;
+
+  int64_t inputIndex = length - 1;
+  inputIndex &= ~(size_t)(StateCount - 1);
+  inputIndex += StateCount; // one past the last StateCount-sized group; that group may be partial.
+
+  for (int64_t j = StateCount - 1; j >= 0; j--) // encode the trailing partial group, skipping positions at or past `length`.
+  {
+    const uint8_t index = _Rans32x32_idx2idx[j];
+
+    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    {
+      const uint8_t in = pInData[inputIndex - StateCount + index];
+      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = encodeState.states[stateIndex];
+
+      if (state >= max)
+      {
+        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+        encodeState.pStart--;
+        state >>= 16;
+      }
+
+      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+    }
+  }
+
+  inputIndex -= StateCount;
+
+  while (true)
+  {
+    rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    inputIndex = inputBlockTargetIndex;
+
+    // Write hist & states.
+    {
+      const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
+
+      encodeState.pStart++;
+      encodeState.pStart -= 256;
+      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+      encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
+      memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+
+      const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); // distance in uint16_t units from this block's header payload to the block end.
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+
+      pBlockEnd = encodeState.pStart;
+      encodeState.pStart--;
+    }
+
+    if (inputIndex == 0)
+      break;
+
+    // Determine new histogram.
+    {
+      inputBlockTargetIndex -= MinBlockSizeX;
+      blockBackPoint = inputIndex; // the new block ends where the previous one started; set this first so the histogram below covers only the new block.
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+
+      if constexpr (IsSafeHist)
+        for (size_t j = 0; j < 256; j++)
+          if (symCount[j] == 0)
+            symCount[j] = 1;
+
+      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+      while (inputBlockTargetIndex > 0)
+      {
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSizeX;
+        else
+          break;
+      }
+
+      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    }
+  }
+
+  uint8_t *pWrite = pOutData;
+  size_t outIndex = 0;
+
+  *reinterpret_cast<uint64_t *>(pWrite + outIndex) = (uint64_t)length; // header field 1: uncompressed length.
+  outIndex += sizeof(uint64_t);
+
+  // compressed expected length.
+  outIndex += sizeof(uint64_t);
+
+  const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t);
+
+  memmove(pWrite + outIndex, encodeState.pStart + 1, size); // move the back-to-front payload to sit directly after the two header fields.
+  outIndex += size;
+
+  *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
+
+  return outIndex;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+size_t mt_rANS32x32_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<15, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); } // Non-template entry points: one per TotalSymbolCountBits value (15..10), all using the scalar encoder.
+size_t mt_rANS32x32_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<14, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x32_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<13, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x32_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<12, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x32_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<11, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x32_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x32_16w_encode<10, r32x32_et_scalar>(pInData, length, pOutData, outCapacity); }
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
new file mode 100644
index 0000000..63afda4
--- /dev/null
+++ b/src/thread_pool.cpp
@@ -0,0 +1,164 @@
+// Improved Version of https://github.com/rainerzufalldererste/slapcodec/blob/master/slapcodec/src/threadpool.cpp
+
+#include "thread_pool.h"
+
+#include <thread>
+#include <mutex>
+#include <queue>
+#include <atomic>
+#include <condition_variable>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+struct thread_pool
+{ // State shared between the worker threads and the thread_pool_* functions.
+  std::queue<std::function<void(void)>> tasks; // pending tasks; accessed while holding `mutex`.
+  
+  std::thread *pThreads; // malloc'ed array of `threadCount` threads, constructed with placement new.
+  size_t threadCount;
+
+  std::atomic<size_t> taskCount; // incremented by thread_pool_add, decremented after a task has finished running.
+  std::atomic<bool> isRunning; // worker loops run while this is true; cleared by the destructor.
+  std::mutex mutex;
+  std::condition_variable condition_var; // notified when a task is added and on shutdown.
+
+  thread_pool(const size_t threadCount);
+  ~thread_pool();
+};
+
+void thread_pool_ThreadFunc(thread_pool *pThreadPool, const size_t index)
+{ // Worker thread entry point: pops and runs tasks until `isRunning` is cleared. `index` is only used as the ideal processor on Windows.
+#ifdef _WIN32
+  SetThreadIdealProcessor(GetCurrentThread(), (DWORD)index);
+  SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
+#else
+  (void)index;
+#endif
+
+  while (pThreadPool->isRunning)
+  {
+    std::function<void(void)> task = nullptr;
+
+    {
+      std::unique_lock<std::mutex> lock(pThreadPool->mutex);
+      pThreadPool->condition_var.wait_for(lock, std::chrono::milliseconds(1), [pThreadPool]() { return !pThreadPool->tasks.empty() || !pThreadPool->isRunning; }); // predicate: don't sleep when work is already queued or the pool is shutting down; the 1 ms timeout still bounds missed notifications.
+
+      if (!pThreadPool->tasks.empty())
+      {
+        task = pThreadPool->tasks.front();
+        pThreadPool->tasks.pop();
+      }
+    }
+
+    if (task)
+    {
+      task(); // run outside the lock so other workers can dequeue concurrently.
+      --pThreadPool->taskCount;
+      continue;
+    }
+  }
+}
+
+thread_pool::thread_pool(const size_t threads) :
+  tasks(),
+  pThreads(nullptr),
+  threadCount(threads),
+  taskCount(0),
+  isRunning(true),
+  mutex(),
+  condition_var()
+{ // Allocates raw storage for `threads` std::thread objects and starts each one on thread_pool_ThreadFunc.
+  pThreads = reinterpret_cast<std::thread *>(malloc(sizeof(std::thread) * threads)); // NOTE(review): the malloc result is not checked before use.
+
+  for (size_t i = 0; i < threads; i++)
+    new (&pThreads[i]) std::thread(thread_pool_ThreadFunc, this, i); // placement new into the malloc'ed storage; `i` is passed as the worker index.
+}
+
+thread_pool::~thread_pool()
+{ // Finishes all outstanding tasks, then stops and joins every worker and releases the thread storage.
+  thread_pool_await(this);
+   
+  isRunning = false;
+  condition_var.notify_all(); // wake workers so they observe `isRunning == false`.
+
+  for (size_t i = 0; i < threadCount; i++)
+  {
+    pThreads[i].join();
+    pThreads[i].~thread(); // explicit destructor call: the objects were created with placement new.
+  }
+
+  free(pThreads);
+}
+
+thread_pool * thread_pool_new(const size_t threads)
+{ // Heap-allocates a pool with `threads` worker threads.
+  return new thread_pool(threads);
+}
+
+void thread_pool_destroy(thread_pool **ppThreadPool)
+{ // Destroys `*ppThreadPool` (the destructor awaits outstanding tasks and joins the workers) and resets the caller's pointer. No-op for null.
+  if (ppThreadPool == nullptr || *ppThreadPool == nullptr)
+    return;
+  delete *ppThreadPool;
+  *ppThreadPool = nullptr; // avoid leaving the caller with a dangling pointer; also makes a second destroy call a no-op.
+}
+
+size_t thread_pool_thread_count(thread_pool *pPool)
+{ // Number of threads that can run tasks: a null pool or a pool without workers counts as 1.
+  if (pPool == nullptr || pPool->threadCount == 0)
+    return 1;
+
+  return pPool->threadCount;
+}
+
+void thread_pool_add(thread_pool *pThreadPool, const std::function<void(void)> &task)
+{ // Queues `task` and wakes one worker.
+  {
+    std::lock_guard<std::mutex> lock(pThreadPool->mutex); // released on scope exit, including when `push` throws.
+    pThreadPool->tasks.push(task);
+    pThreadPool->taskCount++; // counted only after the push succeeded, and still before any worker can dequeue the task.
+  }
+
+  pThreadPool->condition_var.notify_one();
+}
+
+void thread_pool_await(thread_pool *pThreadPool)
+{ // Blocks until all tasks have finished. The calling thread helps by running queued tasks itself.
+  while (true)
+  {
+    std::function<void(void)> task = nullptr;
+
+    // Locked by mutex (scoped, so it is released even if copying the task throws).
+    {
+      std::lock_guard<std::mutex> lock(pThreadPool->mutex);
+
+      if (!pThreadPool->tasks.empty())
+      {
+        task = pThreadPool->tasks.front();
+        pThreadPool->tasks.pop();
+      }
+    }
+
+    if (task)
+    {
+      task();
+      pThreadPool->taskCount--;
+    }
+    else
+    {
+      break; // queue drained; any remaining tasks are currently running on workers.
+    }
+  }
+
+  // The queue is empty here, but `taskCount` stays above zero
+  // until every task that a worker already dequeued has returned.
+  while (pThreadPool->taskCount > 0)
+    std::this_thread::yield(); // Wait for all other threads to finish their tasks.
+}
+
+size_t thread_pool_max_threads()
+{ // Forwards std::thread::hardware_concurrency(), which the standard allows to return 0 when the value is not computable.
+  return std::thread::hardware_concurrency();
+}
diff --git a/src/thread_pool.h b/src/thread_pool.h
new file mode 100644
index 0000000..39de4ec
--- /dev/null
+++ b/src/thread_pool.h
@@ -0,0 +1,19 @@
+#ifndef thread_pool_h__
+#define thread_pool_h__
+
+#include <stddef.h>
+#include <functional>
+
+struct thread_pool; // opaque; defined in thread_pool.cpp.
+
+thread_pool * thread_pool_new(const size_t threads); // creates a pool with `threads` worker threads.
+void thread_pool_destroy(thread_pool **ppThreadPool); // deletes `*ppThreadPool` (which awaits outstanding tasks); null-safe.
+
+size_t thread_pool_thread_count(thread_pool *pThreadPool); // returns 1 for a null pool or a pool without workers.
+
+void thread_pool_add(thread_pool *pThreadPool, const std::function<void(void)> &func); // queues `func` for execution.
+void thread_pool_await(thread_pool *pThreadPool); // runs queued tasks on the calling thread, then waits for tasks still running on workers.
+
+size_t thread_pool_max_threads(); // std::thread::hardware_concurrency().
+
+#endif // thread_pool_h__

From e186a491940b324fbf7d81dd6da3c9457bf2dc01 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 02:17:11 +0200
Subject: [PATCH 18/34] hopefully fixing weird issue with decoding 0 size
 blocks

---
 src/hist.cpp                    |  9 +++++++--
 src/mt_rANS32x32_16w_decode.cpp |  3 +++
 src/mt_rANS32x32_16w_encode.cpp | 17 ++++++++++-------
 src/thread_pool.cpp             | 10 +++++++++-
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/hist.cpp b/src/hist.cpp
index 2b8e816..ab0dee6 100644
--- a/src/hist.cpp
+++ b/src/hist.cpp
@@ -307,7 +307,7 @@ void make_dec_pack_hist(hist_dec_pack_t<TotalSymbolCountBits> *pHistDec, const h
 
 bool inplace_complete_hist(hist_t *pHist, const size_t totalSymbolCountBits)
 {
-  uint16_t counter = 0;
+  uint32_t counter = 0;
 
   for (size_t i = 0; i < 256; i++)
   {
@@ -315,7 +315,12 @@ bool inplace_complete_hist(hist_t *pHist, const size_t totalSymbolCountBits)
     counter += pHist->symbolCount[i];
   }
 
-  return (counter == 1 << totalSymbolCountBits);
+#if defined(_DEBUG) && defined(_MSC_VER)
+  if (counter != ((uint32_t)1 << totalSymbolCountBits))
+    __debugbreak();
+#endif
+
+  return (counter == (uint32_t)(1 << totalSymbolCountBits));
 }
 
 template <uint32_t TotalSymbolCountBits>
diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp
index b62beae..07d4741 100644
--- a/src/mt_rANS32x32_16w_decode.cpp
+++ b/src/mt_rANS32x32_16w_decode.cpp
@@ -675,6 +675,7 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui
 
     const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
     (void)readHeadBackOffset; // unused in single-threaded version.
 
     for (size_t j = 0; j < StateCount; j++)
@@ -709,6 +710,8 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui
         break;
     }
 
+    decodeState.pReadHead = pReadHeadAfter;
+
   } while (i < outLengthInStates);
 
   if (i < expectedOutputLength)
diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp
index 07401a1..140a013 100644
--- a/src/mt_rANS32x32_16w_encode.cpp
+++ b/src/mt_rANS32x32_16w_encode.cpp
@@ -20,11 +20,11 @@ struct HistReplaceMul
 };
 
 template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 50; } };
-template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 50; } };
-template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 50; } };
-template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 50; } };
-template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 50; } };
-template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 50; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 500; } };
 
 template <size_t TotalSymbolCountBits>
 struct MinBlockSizeBits
@@ -45,6 +45,9 @@ constexpr size_t MinBlockSize()
   return (size_t)1 << MinBlockSizeBits<TotalSymbolCountBits>::GetValue();
 }
 
+constexpr size_t MaxBlockSizeBits = 25;
+constexpr size_t MaxBlockSize = (size_t)1 << MaxBlockSizeBits;
+
 size_t mt_rANS32x32_16w_capacity(const size_t inputSize)
 {
   const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t);
@@ -251,7 +254,7 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
 
   normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
-  while (inputBlockTargetIndex > 0)
+  while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
   {
     if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
       inputBlockTargetIndex -= MinBlockSizeX;
@@ -343,7 +346,7 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
 
       normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
 
-      while (inputBlockTargetIndex > 0)
+      while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
       {
         if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
           inputBlockTargetIndex -= MinBlockSizeX;
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index 63afda4..ed55ecb 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -10,6 +10,9 @@
 
 #ifdef _WIN32
 #include <windows.h>
+#else
+#include <sched.h>
+#include <pthread.h>
 #endif
 
 struct thread_pool
@@ -34,7 +37,12 @@ void thread_pool_ThreadFunc(thread_pool *pThreadPool, const size_t index)
   SetThreadIdealProcessor(GetCurrentThread(), (DWORD)index);
   SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
 #else
-  (void)index;
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET((int32_t)index, &cpuset);
+
+  pthread_t current_thread = pthread_self();
+  pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
 #endif
 
   while (pThreadPool->isRunning)

From a490e8062f6521c7eb768877660b2ea7d9e2457e Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 02:49:51 +0200
Subject: [PATCH 19/34] as cleanup as possible

---
 src/block_codec32.h                |  713 +++++++++++
 src/block_codec64.h                | 1761 ++++++++++++++++++++++++++++
 src/block_rANS32x32_16w_decode.cpp |  626 +---------
 src/block_rANS32x32_16w_encode.cpp |   76 +-
 src/block_rANS32x64_16w_decode.cpp | 1696 +--------------------------
 src/block_rANS32x64_16w_encode.cpp |   83 +-
 src/mt_rANS32x32_16w_decode.cpp    |  668 +----------
 src/mt_rANS32x32_16w_encode.cpp    |   78 +-
 8 files changed, 2492 insertions(+), 3209 deletions(-)
 create mode 100644 src/block_codec32.h
 create mode 100644 src/block_codec64.h

diff --git a/src/block_codec32.h b/src/block_codec32.h
new file mode 100644
index 0000000..106da1c
--- /dev/null
+++ b/src/block_codec32.h
@@ -0,0 +1,713 @@
+#ifndef block_codec32_h__
+#define block_codec32_h__
+
+#include "hist.h"
+
+#include <string.h>
+
+constexpr size_t StateCount = 32; // Needs to be a power of two.
+
+//////////////////////////////////////////////////////////////////////////
+
+extern const uint8_t _ShuffleLutShfl32[256 * 8];
+extern const uint8_t _ShuffleLutPerm32[256 * 8];
+extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
+
+constexpr bool EncodeNoBranch = false;
+constexpr bool DecodeNoBranch = false;
+
+//////////////////////////////////////////////////////////////////////////
+
+static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; // maps state index j to the byte offset within a StateCount-sized group that state j encodes/decodes. NOTE(review): presumably chosen to match the byte order of the AVX2 pack sequence in the decoder - confirm.
+static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
+
+//////////////////////////////////////////////////////////////////////////
+
+struct _rans_encode_state32_t
+{ // Encoder working state shared between the block driver and encode_section.
+  uint32_t states[StateCount]; // one interleaved rANS state per lane.
+  hist_t hist; // histogram (symbolCount / cumul) used for the symbols being encoded.
+  uint16_t *pEnd, *pStart; // both compressed. encode_section writes 16 bit words at `pStart` and moves it downwards.
+};
+
+enum rans32x32_encoder_type_t
+{ // Selects the rans32x32_16w_encoder specialization.
+  r32x32_et_scalar, // the only encoder variant declared here.
+};
+
+template <rans32x32_encoder_type_t type>
+struct rans32x32_16w_encoder
+{ // Primary template: declaration only; behaviour comes from the per-`type` specializations.
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex); // encodes backwards from `startIndex` down to `targetIndex`.
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+template <>
+struct rans32x32_16w_encoder<r32x32_et_scalar>
+{ // Scalar reference encoder.
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  { // Encodes the bytes in [targetIndex, startIndex) back-to-front in groups of StateCount, emitting 16 bit words downwards through `pState->pStart`.
+    int64_t targetCmp = targetIndex + StateCount;
+
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount) // `i` is one past the end of the group being encoded.
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = _Rans32x32_idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount; // states at or above this must emit a word first.
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF); // always store; the word only counts if the pointer moves.
+          pState->pStart -= (size_t)write; // move the write pointer (not the stored value), matching the branching path below.
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount); // rANS encode step.
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+template <typename hist_type>
+struct _rans_decode_state32_t
+{ // Decoder working state; `hist_type` is the decode-histogram layout used by the selected decoder.
+#ifdef _MSC_VER
+  __declspec(align(32))
+#else
+  __attribute__((aligned(32)))
+#endif
+    uint32_t states[StateCount]; // 32 byte aligned: the AVX2 decoder loads it with _mm256_load_si256.
+
+  hist_type hist;
+  const uint16_t *pReadHead; // next 16 bit word of compressed input; advances as states are refilled.
+};
+
+enum rans32x32_decoder_type_t
+{ // Selects the rans32x32_16w_decoder specialization.
+  r32x32_dt_scalar,
+  r32x32_dt_avx2_large_cache_15_to_13, // NOTE(review): the `15_to_13` / `12_to_10` suffixes presumably name the supported TotalSymbolCountBits range - confirm against the specializations.
+  r32x32_dt_avx2_small_cache_15_to_13,
+  r32x32_dt_avx2_large_cache_12_to_10,
+  r32x32_dt_avx2_small_cache_12_to_10,
+};
+
+template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
+struct rans32x32_16w_decoder
+{ // Primary template: declaration only; behaviour comes from the partial specializations.
+  static size_t decode_section(_rans_decode_state32_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex); // decodes output positions from `startIndex` up to `endIndex`.
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{ // hist_dec_t overload: copies the symbol counts and lets inplace_make_hist_dec build the rest; returns its result.
+  (void)totalSymbolCountBits; // unused in this overload.
+
+  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
+
+  return inplace_make_hist_dec(pDecHist);
+}
+
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{ // hist_dec2_t overload: completes `pIncompleteHist` in place, then derives the decode histogram from it.
+  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
+    return false; // inplace_complete_hist rejected the counts.
+
+  make_dec2_hist(pDecHist, pIncompleteHist);
+
+  return true;
+}
+
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{ // hist_dec_pack_t overload: completes `pIncompleteHist` in place, then derives the packed decode histogram from it.
+  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
+    return false; // inplace_complete_hist rejected the counts.
+
+  make_dec_pack_hist(pDecHist, pIncompleteHist);
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
+{ // Scalar reference decoder.
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  { // Decodes groups of StateCount bytes into `pOutData` from `startIndex` while `i < endIndex`; returns the first index not decoded.
+    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+    size_t i = startIndex;
+
+    for (; i < endIndex; i += StateCount)
+    {
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        const uint8_t index = _Rans32x32_idx2idx[j]; // output offset within the group handled by state j.
+        uint32_t state = pState->states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1); // low TotalSymbolCountBits bits select the symbol.
+        const uint8_t symbol = pState->hist.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol]; // rANS decode step.
+
+        if constexpr (DecodeNoBranch)
+        {
+          const bool read = state < DecodeConsumePoint16;
+          const uint32_t newState = state << 16 | *pState->pReadHead; // always read; only used (and the head advanced) when `read` is set.
+          state = read ? newState : state;
+          pState->pReadHead += (size_t)read;
+        }
+        else
+        {
+          if (state < DecodeConsumePoint16) // refill: shift in the next 16 bit word.
+          {
+            state = state << 16 | *pState->pReadHead;
+            pState->pReadHead++;
+          }
+        }
+
+        pState->states[j] = state;
+      }
+    }
+
+    return i;
+  }
+};
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
+      return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // const uint8_t symbol = pHist->cumulInv[slot];
+    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
+    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
+    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
+    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
+
+    // since they were int32_t turn into uint8_t
+    symbol0 = _mm256_and_si256(symbol0, lower8);
+    symbol1 = _mm256_and_si256(symbol1, lower8);
+    symbol2 = _mm256_and_si256(symbol2, lower8);
+    symbol3 = _mm256_and_si256(symbol3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
+
+    // freq, cumul.
+    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
+    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
+    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
+    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0, 1.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+// AVX2 decoder for one section of a 32-state interleaved rANS stream using the packed decode
+// histogram (`hist_dec_pack_t`).
+//
+// Each 32-bit entry of `pState->hist.symbol` is unpacked below as:
+//   bits  0..7  : decoded symbol,
+//   bits  8..19 : cumulative count (`cumul`),
+//   bits 20..31 : symbol frequency (`freq`).
+//
+// Template parameters:
+//   TotalSymbolCountBits - log2 of the histogram total; must be <= 12 (see the static_assert).
+//   ShuffleMask16        - true: load ready-made 16 byte shuffle masks from `_DoubleShuffleLutShfl32`;
+//                          false: load 8 byte entries from `_ShuffleLutShfl32` and widen them at runtime.
+//   WriteAligned32       - true: emit symbols with `_mm256_stream_si256`, false: `_mm256_storeu_si256`.
+//                          When false, the function re-dispatches to the `true` variant if `pOutData`
+//                          is 32 byte aligned.
+//
+// Parameters:
+//   pState     - decoder state; `states` is loaded on entry and stored back on exit, `pReadHead` is
+//                advanced by one 16 bit word for every lane that renormalizes.
+//   pOutData   - output buffer; `StateCount` bytes are written at `pOutData + i` per iteration.
+//   startIndex - first output index to decode.
+//   endIndex   - decoding continues in steps of `StateCount` while the index is below this value.
+//
+// Returns the index reached after the last iteration (`startIndex` if no iteration ran).
+//
+// The loop body handles exactly four 8-lane vectors, i.e. it is written for `StateCount == 32`.
+// NOTE(review): the aligned/streaming variant only checks `pOutData`, not `pOutData + startIndex`;
+// presumably `startIndex` is always a multiple of 32 - confirm at the call sites.
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  // Runtime dispatch to the aligned-store instantiation when the output pointer allows it.
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
+      return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  // `cumul` only has 12 bits in the packed histogram entry.
+  static_assert(TotalSymbolCountBits <= 12);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  // Keep all states in registers for the duration of the section.
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot3, sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
+    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
+    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
+    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+
+    // now to the messy part...
+    {
+      // read input for block 0 (blocks 1, 2, 3 are read once the read head has been advanced).
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        // NOTE(review): `_mm_load_si128` requires `_DoubleShuffleLutShfl32` to be 16 byte aligned; the definition is not visible here.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // matching: state << 16 (shift count is 16 where the compare matched, 0 elsewhere).
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        // Every `_ShuffleLutShfl32` entry is 8 bytes and only those 8 bytes are consumed by
+        // `shuffleDoubleMask` below, so load exactly 64 bits: a 128 bit load at the last entry
+        // (mask 0xFF) would read 8 bytes past the end of the `256 * 8` byte table.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        // finalize lookups: duplicate each LUT byte into both bytes of a 16 bit lane and OR 1 into
+        // the upper byte, turning the 8 byte entry into a 16 byte `_mm_shuffle_epi8` mask.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (shift count is 16 where the compare matched, 0 elsewhere).
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+      }
+    }
+  }
+
+  // Write the register-resident states back so the next section continues from here.
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+// Decoder selected by `r32x32_dt_avx2_large_cache_15_to_13` for `hist_dec2_t` histograms.
+// Forwards to the AVX2 `varA` section decoder with its second template argument set to `true`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is accepted but not forwarded to the section decoder.
+  // Returns whatever index the section decoder reports.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr bool LargeCacheVariant = true;
+
+    const size_t decodedUntil = _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, LargeCacheVariant>(pState, pOutData, startIndex, endIndex);
+
+    return decodedUntil;
+  }
+};
+
+// Decoder selected by `r32x32_dt_avx2_small_cache_15_to_13` for `hist_dec2_t` histograms.
+// Forwards to the AVX2 `varA` section decoder with its second template argument set to `false`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is accepted but not forwarded to the section decoder.
+  // Returns whatever index the section decoder reports.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr bool LargeCacheVariant = false;
+
+    const size_t decodedUntil = _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, LargeCacheVariant>(pState, pOutData, startIndex, endIndex);
+
+    return decodedUntil;
+  }
+};
+
+// Decoder selected by `r32x32_dt_avx2_large_cache_12_to_10` for `hist_dec_pack_t` histograms.
+// Forwards to the AVX2 `varC` section decoder with `ShuffleMask16 = true`, i.e. the path that
+// reads 16 byte shuffle masks from `_DoubleShuffleLutShfl32`.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is accepted but not forwarded; `varC` picks its store variant from the
+  // alignment of `pOutData` at runtime. Returns the index reported by the section decoder.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr bool UseDoubleShuffleLut = true;
+
+    const size_t decodedUntil = _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, UseDoubleShuffleLut>(pState, pOutData, startIndex, endIndex);
+
+    return decodedUntil;
+  }
+};
+
+// Decoder selected by `r32x32_dt_avx2_small_cache_12_to_10` for `hist_dec_pack_t` histograms.
+// Forwards to the AVX2 `varC` section decoder with `ShuffleMask16 = false`, i.e. the path that
+// reads 8 byte entries from `_ShuffleLutShfl32` and widens them at runtime.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  // `WriteAligned` is accepted but not forwarded; `varC` picks its store variant from the
+  // alignment of `pOutData` at runtime. Returns the index reported by the section decoder.
+  template <bool WriteAligned = false>
+  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr bool UseDoubleShuffleLut = false;
+
+    const size_t decodedUntil = _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, UseDoubleShuffleLut>(pState, pOutData, startIndex, endIndex);
+
+    return decodedUntil;
+  }
+};
+
+#endif // block_codec32_h__
diff --git a/src/block_codec64.h b/src/block_codec64.h
new file mode 100644
index 0000000..45f05b5
--- /dev/null
+++ b/src/block_codec64.h
@@ -0,0 +1,1761 @@
+#ifndef block_codec64_h__
+#define block_codec64_h__
+
+#include "hist.h"
+
+#include <string.h>
+
+// Number of interleaved rANS states used by the codec in this header; it sizes the `states`
+// arrays and is the step width of the encode/decode loops.
+constexpr size_t StateCount = 64; // Needs to be a power of two.
+
+//////////////////////////////////////////////////////////////////////////
+
+// Byte lookup tables defined in another translation unit.
+// The AVX2 decoders index `_ShuffleLutShfl32` with `mask << 3` (8 bytes per 8 bit compare mask)
+// and `_DoubleShuffleLutShfl32` with `mask << 4` (16 bytes per mask) to obtain shuffle masks.
+// NOTE(review): `_ShuffleLutPerm32` is not referenced in the visible part of this header.
+extern const uint8_t _ShuffleLutShfl32[256 * 8];
+extern const uint8_t _ShuffleLutPerm32[256 * 8];
+extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
+
+// Compile-time switches that pick the `if constexpr` branch in the scalar encoder/decoder:
+// `true` selects the variant written with conditional selects, `false` the variant with an `if`.
+constexpr bool EncodeNoBranch = false;
+constexpr bool DecodeNoBranch = false;
+
+//////////////////////////////////////////////////////////////////////////
+
+// Maps a state index `j` to the byte offset inside a `StateCount` byte group that this state
+// encodes / decodes (`pInData[i - StateCount + index]`, `pOutData[i + index]`).
+// Within each half of 32 entries the order matches what `_mm256_packus_epi32` followed by
+// `_mm256_packus_epi16` produces for four 8-lane vectors, so the AVX2 decoders can store the
+// packed symbols without reordering them.
+static const uint8_t _Rans32x64_idx2idx[] =
+{
+  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
+  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
+  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
+};
+
+// The table must provide exactly one entry per state.
+static_assert(sizeof(_Rans32x64_idx2idx) == StateCount);
+
+// Working state of the 64-state encoder.
+struct _rans_encode_state64_t
+{
+  // One 32 bit rANS state per interleaved stream.
+  uint32_t states[StateCount];
+
+  // Histogram the encoder reads `symbolCount` and `cumul` from.
+  hist_t hist;
+
+  // Both pointers refer to the compressed stream.
+  // NOTE(review): `pEnd` is not touched by the scalar encoder in this header - presumably it marks the end of the output buffer.
+  uint16_t *pEnd;
+
+  // Write position: the encoder stores a 16 bit word here and then moves the pointer down by one.
+  uint16_t *pStart;
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Tag used as template argument of `rans32x64_16w_encoder` to select an encoder implementation.
+enum rans32x64_encoder_type_t
+{
+  r32x64_et_scalar, // plain C++ implementation without SIMD intrinsics.
+};
+
+// Primary template of the 64-state encoder. `encode_section` is only declared here; the
+// implementation comes from a specialization for the chosen `rans32x64_encoder_type_t`.
+template <rans32x64_encoder_type_t type>
+struct rans32x64_16w_encoder
+{
+  // Encodes input bytes between `targetIndex` and `startIndex` into `pState`
+  // (the scalar specialization walks from `startIndex` downwards).
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Scalar implementation of the 64-state encoder.
+template <>
+struct rans32x64_16w_encoder<r32x64_et_scalar>
+{
+  // Encodes the input backwards in groups of `StateCount` bytes.
+  //
+  //   pState      - encoder state; `states` is updated in place, emitted 16 bit words are written
+  //                 at `pStart`, which is moved down by one after every emitted word.
+  //   pInData     - uncompressed input.
+  //   startIndex  - index one past the last byte of the first group that is encoded.
+  //   targetIndex - lower bound; groups are encoded while `i >= targetIndex + StateCount`.
+  //
+  // For every group the states are visited from `StateCount - 1` down to `0`; state `j` encodes
+  // the byte at offset `_Rans32x64_idx2idx[j]` inside the group `[i - StateCount, i)`.
+  template <uint32_t TotalSymbolCountBits>
+  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
+  {
+    const int64_t targetCmp = targetIndex + StateCount;
+
+    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
+    {
+      for (int64_t j = StateCount - 1; j >= 0; j--)
+      {
+        const uint8_t index = _Rans32x64_idx2idx[j];
+
+        const uint8_t in = pInData[i - StateCount + index];
+        const uint32_t symbolCount = pState->hist.symbolCount[in];
+
+        // States at or above this limit emit their lower 16 bits before the symbol is encoded.
+        // NOTE(review): the product is narrowed to 32 bits - confirm it cannot exceed UINT32_MAX for the largest possible `symbolCount`.
+        const uint32_t max = EncodeEmitPoint * symbolCount;
+
+        const size_t stateIndex = j;
+
+        uint32_t state = pState->states[stateIndex];
+
+        if constexpr (EncodeNoBranch)
+        {
+          // Always store the word, but only keep it (by moving the write pointer) if it was due.
+          // The pointer - not the stored value - has to be decremented here, mirroring
+          // `pState->pStart--` in the branching variant below.
+          const bool write = state >= max;
+          *pState->pStart = (uint16_t)(state & 0xFFFF);
+          pState->pStart -= (size_t)write;
+          state = write ? state >> 16 : state;
+        }
+        else
+        {
+          if (state >= max)
+          {
+            *pState->pStart = (uint16_t)(state & 0xFFFF);
+            pState->pStart--;
+            state >>= 16;
+          }
+        }
+
+        // rANS encode step: state = (state / freq) * total + cumul + (state % freq).
+        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
+      }
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Working state of the 64-state decoder, parameterized on the decode histogram layout.
+template <typename hist_type>
+struct _rans_decode_state64_t
+{
+  // One 32 bit rANS state per interleaved stream; 64 byte aligned so that SIMD decoders can
+  // use aligned vector loads and stores on it.
+  alignas(64) uint32_t states[StateCount];
+
+  // Histogram in the layout the selected decoder expects.
+  hist_type hist;
+
+  // Next 16 bit word of the compressed stream; advanced by one per consumed word.
+  const uint16_t *pReadHead;
+};
+
+// Tag used as first template argument of `rans32x64_16w_decoder` to select a decoder
+// implementation. NOTE(review): going by the names, the suffixes `15_to_13` / `12_to_10` presumably
+// denote the supported `TotalSymbolCountBits` range and `large_cache` / `small_cache` the size of
+// the shuffle lookup table in use - confirm against the specializations.
+enum rans32x64_decoder_type_t
+{
+  r32x64_dt_scalar,
+  r32x64_dt_avx2_large_cache_15_to_13,
+  r32x64_dt_avx2_small_cache_15_to_13,
+  r32x64_dt_avx2_large_cache_12_to_10,
+  r32x64_dt_avx2_small_cache_12_to_10,
+  r32x64_dt_avx512_large_cache_15_to_13,
+  r32x64_dt_avx512_small_cache_15_to_13,
+  r32x64_dt_avx512_large_cache_12_to_10,
+  r32x64_dt_avx512_small_cache_12_to_10,
+};
+
+// Primary template of the 64-state decoder. `decode_section` is only declared here; the
+// implementation comes from a specialization for the chosen decoder type and histogram type.
+template <rans32x64_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
+struct rans32x64_16w_decoder
+{
+  // Decodes output bytes starting at `startIndex` until the index reaches `endIndex`
+  // and returns the index at which decoding stopped (see the scalar specialization).
+  static size_t decode_section(_rans_decode_state64_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// Initializes a `hist_dec_t` from an incomplete histogram: the symbol counts are copied over
+// and the remaining fields are derived in place by `inplace_make_hist_dec`.
+// `totalSymbolCountBits` is unused by this overload; it is part of the signature shared with
+// the other `_init_from_hist` overloads.
+// Returns the result of `inplace_make_hist_dec`.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  static_cast<void>(totalSymbolCountBits);
+
+  const auto &sourceCounts = pIncompleteHist->symbolCount;
+  memcpy(&pDecHist->symbolCount, &sourceCounts, sizeof(sourceCounts));
+
+  const bool succeeded = inplace_make_hist_dec(pDecHist);
+
+  return succeeded;
+}
+
+// Initializes a `hist_dec2_t`: first completes `pIncompleteHist` in place, then builds the
+// decode histogram from it. `pDecHist` is left untouched if completing the histogram fails.
+// Returns whether `inplace_complete_hist` succeeded.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool histCompleted = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (histCompleted)
+    make_dec2_hist(pDecHist, pIncompleteHist);
+
+  return histCompleted;
+}
+
+// Initializes a `hist_dec_pack_t`: first completes `pIncompleteHist` in place, then builds the
+// packed decode histogram from it. `pDecHist` is left untouched if completing the histogram fails.
+// Returns whether `inplace_complete_hist` succeeded.
+template <uint32_t TotalSymbolCountBits>
+static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
+{
+  const bool histCompleted = inplace_complete_hist(pIncompleteHist, totalSymbolCountBits);
+
+  if (histCompleted)
+    make_dec_pack_hist(pDecHist, pIncompleteHist);
+
+  return histCompleted;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Scalar implementation of the 64-state decoder for `hist_dec_t` histograms.
+template <uint32_t TotalSymbolCountBits>
+struct rans32x64_16w_decoder<r32x64_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
+{
+  // Decodes groups of `StateCount` bytes into `pOutData`, starting at `startIndex` and
+  // continuing while the group index is below `endIndex`. State `lane` produces the byte at
+  // offset `_Rans32x64_idx2idx[lane]` of its group. `pState->states` and `pState->pReadHead`
+  // are updated in place. Returns the index at which decoding stopped.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    constexpr uint32_t SlotMask = ((uint32_t)1 << TotalSymbolCountBits) - 1;
+
+    size_t groupStart = startIndex;
+
+    while (groupStart < endIndex)
+    {
+      for (size_t lane = 0; lane < StateCount; lane++)
+      {
+        const uint8_t outOffset = _Rans32x64_idx2idx[lane];
+        const uint32_t previous = pState->states[lane];
+
+        // The low bits of the state select the symbol.
+        const uint32_t slot = previous & SlotMask;
+        const uint8_t symbol = pState->hist.cumulInv[slot];
+        pOutData[groupStart + outOffset] = symbol;
+
+        // rANS decode step: state = (state >> bits) * freq + slot - cumul.
+        const uint32_t freq = (uint32_t)pState->hist.symbolCount[symbol];
+        const uint32_t base = (uint32_t)pState->hist.cumul[symbol];
+        uint32_t next = (previous >> TotalSymbolCountBits) * freq + slot - base;
+
+        // Renormalize: pull in one 16 bit word whenever the state dropped below the consume point.
+        if constexpr (DecodeNoBranch)
+        {
+          const bool consume = next < DecodeConsumePoint16;
+          const uint32_t refilled = next << 16 | *pState->pReadHead;
+
+          if (consume)
+            next = refilled;
+
+          pState->pReadHead += (size_t)consume;
+        }
+        else
+        {
+          if (next < DecodeConsumePoint16)
+          {
+            next = next << 16 | *pState->pReadHead;
+            pState->pReadHead++;
+          }
+        }
+
+        pState->states[lane] = next;
+      }
+
+      groupStart += StateCount;
+    }
+
+    return groupStart;
+  }
+};
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0)
+      return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
+    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
+    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
+    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
+
+    // const uint8_t symbol = pHist->cumulInv[slot];
+    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
+    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
+    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
+    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
+    simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot4, sizeof(uint8_t));
+    simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot5, sizeof(uint8_t));
+    simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot6, sizeof(uint8_t));
+    simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot7, sizeof(uint8_t));
+
+    // since they were int32_t turn into uint8_t
+    symbol0 = _mm256_and_si256(symbol0, lower8);
+    symbol1 = _mm256_and_si256(symbol1, lower8);
+    symbol2 = _mm256_and_si256(symbol2, lower8);
+    symbol3 = _mm256_and_si256(symbol3, lower8);
+    symbol4 = _mm256_and_si256(symbol4, lower8);
+    symbol5 = _mm256_and_si256(symbol5, lower8);
+    symbol6 = _mm256_and_si256(symbol6, lower8);
+    symbol7 = _mm256_and_si256(symbol7, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
+    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
+    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
+
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
+    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol4, sizeof(uint32_t));
+    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol5, sizeof(uint32_t));
+    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol6, sizeof(uint32_t));
+    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol7, sizeof(uint32_t));
+
+    // freq, cumul.
+    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
+    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
+    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
+    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
+    const simd_t cumul4 = _mm256_srli_epi32(pack4, 16);
+    const simd_t freq4 = _mm256_and_si256(pack4, lower16);
+    const simd_t cumul5 = _mm256_srli_epi32(pack5, 16);
+    const simd_t freq5 = _mm256_and_si256(pack5, lower16);
+    const simd_t cumul6 = _mm256_srli_epi32(pack6, 16);
+    const simd_t freq6 = _mm256_and_si256(pack6, lower16);
+    const simd_t cumul7 = _mm256_srli_epi32(pack7, 16);
+    const simd_t freq7 = _mm256_and_si256(pack7, lower16);
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
+    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
+    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
+    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
+    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
+    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
+    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
+    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
+    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
+    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
+      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
+      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
+      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
+        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
+        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
+        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
+#ifndef _MSC_VER
+__attribute__((target("avx2")))
+#endif
+static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned32)
+    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0)
+      return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m256i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm256_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
+  const simd_t _16 = _mm256_set1_epi32(16);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
+    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
+    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
+    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
+    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot0, sizeof(uint32_t));
+    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot1, sizeof(uint32_t));
+    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot2, sizeof(uint32_t));
+    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot3, sizeof(uint32_t));
+    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot4, sizeof(uint32_t));
+    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot5, sizeof(uint32_t));
+    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot6, sizeof(uint32_t));
+    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot7, sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
+    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
+    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
+    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
+    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
+    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
+    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
+    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
+    const simd_t symbol4 = _mm256_and_si256(pack4, lower8);
+    const simd_t symbol5 = _mm256_and_si256(pack5, lower8);
+    const simd_t symbol6 = _mm256_and_si256(pack6, lower8);
+    const simd_t symbol7 = _mm256_and_si256(pack7, lower8);
+
+    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
+    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
+
+    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
+    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
+    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
+
+    if constexpr (WriteAligned32)
+      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+    else
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
+    const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12);
+    const simd_t freq4 = _mm256_srli_epi32(pack4, 20);
+    const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12);
+    const simd_t freq5 = _mm256_srli_epi32(pack5, 20);
+    const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12);
+    const simd_t freq6 = _mm256_srli_epi32(pack6, 20);
+    const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12);
+    const simd_t freq7 = _mm256_srli_epi32(pack7, 20);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
+    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
+    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
+    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
+    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
+    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
+    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
+    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
+    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
+    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
+    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
+    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
+
+    // now to the messy part...
+    {
+      // read input for blocks 0.
+      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
+      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
+      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
+      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
+      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
+      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
+      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
+      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
+        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
+
+        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
+        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
+
+        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
+        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
+
+        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
+        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
+
+        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
+        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
+
+        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
+        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
+
+        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
+        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
+
+        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
+        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
+        pState->pReadHead += maskPop0;
+
+        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
+        pState->pReadHead += maskPop1;
+
+        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
+        pState->pReadHead += maskPop2;
+
+        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
+        pState->pReadHead += maskPop3;
+
+        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
+        pState->pReadHead += maskPop4;
+
+        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
+        pState->pReadHead += maskPop5;
+
+        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
+        pState->pReadHead += maskPop6;
+
+        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
+        pState->pReadHead += maskPop7;
+
+        // finalize lookups.
+        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
+        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
+        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
+        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
+        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
+        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
+        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
+        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
+        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
+        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
+        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
+        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
+
+        // shuffle new words in place.
+        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
+        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
+        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
+        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
+        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
+        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
+        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
+        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
+
+        // expand new word.
+        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
+        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
+        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
+        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
+        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
+        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
+        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
+        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
+
+        // state = state << 16 | newWord;
+        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
+        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
+        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
+        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
+        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
+        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
+        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
+        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false> // ShuffleMask16: load ready-made 16-byte shuffle masks from `_DoubleShuffleLutShfl32` instead of expanding `_ShuffleLutShfl32` entries at runtime. YmmShuffle: one 256-bit byte shuffle per state vector instead of two 128-bit ones. WriteAligned64: use non-temporal stores to the output (selected automatically, see the dispatch at the top of the body).
+#ifndef _MSC_VER // the per-function target attribute below is GCC/Clang syntax, so it is skipped for MSVC.
+#ifdef __llvm__
+__attribute__((target("avx512bw")))
+#else
+__attribute__((target("avx512f", "avx512bw")))
+#endif
+#endif
+static size_t _block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex) // Decode loop: for i = startIndex; i < endIndex; i += StateCount it writes one 64-byte vector of decoded symbols to pOutData + i, updates the states held in pState and advances pState->pReadHead. Returns the final value of i.
+{
+  if constexpr (!WriteAligned64) // only the default instantiation dispatches; the WriteAligned64 one falls through to the loop below.
+    if ((reinterpret_cast<size_t>(pOutData) & (64 - 1)) == 0) // pOutData is 64-byte aligned -> use the streaming-store variant. NOTE(review): the stores go to pOutData + i, so this presumably relies on startIndex being a multiple of 64 - confirm with the callers.
+      return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16); // NOTE(review): presumably because freq and cumul are stored as the 16-bit halves of one 32-bit entry (see `pack` below) - confirm.
+
+  typedef __m512i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))]; // despite the name, each element holds sizeof(simd_t) / sizeof(uint32_t) = 16 states; elements [0..3] are used below.
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++) // working copy of the decoder states; written back to pState after the main loop.
+    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1); // extracts the slot (low TotalSymbolCountBits bits) from a state.
+  const simd_t lower16 = _mm512_set1_epi32(0xFFFF); // extracts freq from a packed symbol entry.
+  const simd_t lower8 = _mm512_set1_epi32(0xFF); // extracts the symbol byte from a gathered dword.
+  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16); // states below this value pull in a new 16-bit word.
+  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0); // reorders the 32-bit groups left by the two in-lane packus steps below. NOTE(review): the resulting byte order has to match the encoder's symbol order - not verifiable from this function alone.
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); // duplicates each of the low 8 bytes into a byte pair (only used when !ShuffleMask16).
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100); // sets bit 0 in the upper byte of each pair (only used when !ShuffleMask16); presumably the LUT bytes are even byte offsets so that a pair addresses both bytes of one 16-bit word - confirm against the LUT.
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
+
+    // retrieve symbol: 32-bit gather at byte offset `slot` (scale 1) into cumulInv; only the low byte is the symbol (masked with lower8 below). NOTE(review): each lane reads 4 bytes, i.e. up to 3 bytes past the addressed entry - presumably readable memory follows the table; confirm.
+    simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+    simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    symbol0 = _mm512_and_si512(symbol0, lower8); // keep only the low byte of each gathered dword: the symbol.
+    symbol1 = _mm512_and_si512(symbol1, lower8);
+    symbol2 = _mm512_and_si512(symbol2, lower8);
+    symbol3 = _mm512_and_si512(symbol3, lower8);
+
+    // retrieve pack: one 32-bit entry of hist.symbols per symbol (freq in the low 16 bits, cumul in the high 16 bits, see below).
+    const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+    const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
+
+    // pack symbols to one si512 (32 -> 16 -> 8 bit with unsigned saturation; the values are already <= 0xFF).
+    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // packus works within each 128-bit lane, so the bytes of symbol0..3 are now interleaved per lane.
+    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chosen to reorder everything fittingly.
+
+    // freq (low 16 bits), cumul (high 16 bits).
+    const simd_t cumul0 = _mm512_srli_epi32(pack0, 16);
+    const simd_t freq0 = _mm512_and_si512(pack0, lower16);
+    const simd_t cumul1 = _mm512_srli_epi32(pack1, 16);
+    const simd_t freq1 = _mm512_and_si512(pack1, lower16);
+    const simd_t cumul2 = _mm512_srli_epi32(pack2, 16);
+    const simd_t freq2 = _mm512_and_si512(pack2, lower16);
+    const simd_t cumul3 = _mm512_srli_epi32(pack3, 16);
+    const simd_t freq3 = _mm512_and_si512(pack3, lower16);
+
+    // We intentionally encoded in a way to not have to do horrible things here: all 64 symbol bytes go out with a single store (non-temporal in the WriteAligned64 instantiation).
+    if constexpr (WriteAligned64)
+      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+    else
+      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
+    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
+    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
+    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
+
+    // now to the messy part: refilling every state that dropped below the consume point with a 16-bit word from the read head.
+    {
+      // read input for block 0a: 16 bytes at the read head (candidate words for lanes 0-7 of state0).
+      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
+      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
+      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
+      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // split each 16-lane compare mask into two 8-bit halves (a: lanes 0-7, b: lanes 8-15) & start loading shuffle masks.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
+        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
+        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
+        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
+        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
+
+        // advance read head by one element per set bit of each half-mask & read input for blocks 0b, 1a, 1b, 2a, 2b, 3a, 3b.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // matching: state << 16 (lanes not in the mask keep their state).
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place (both halves at once; the 256-bit byte shuffle works per 128-bit lane).
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word (zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place (one 128-bit shuffle per half).
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word (join the halves, then zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+      else
+      {
+        // split each 16-lane compare mask into two 8-bit halves (a: lanes 0-7, b: lanes 8-15) & start loading shuffle masks. NOTE(review): 16 bytes are loaded per entry although entries are addressed with a stride of 8 and only the low 8 bytes survive `shuffleDoubleMask`; for the last entry this presumably relies on padding behind the table - confirm.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
+        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
+        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
+        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
+        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
+
+        // advance read head by one element per set bit of each half-mask & read input for blocks 0b, 1a, 1b, 2a, 2b, 3a, 3b.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // finalize lookups: expand each of the low 8 LUT bytes `b` to the byte pair (b, b | 1) to form a full 16-byte shuffle mask.
+        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16 (lanes not in the mask keep their state).
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place (both halves at once; the 256-bit byte shuffle works per 128-bit lane).
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word (zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place (one 128-bit shuffle per half).
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word (join the halves, then zero-extend 16 x 16 bit to 16 x 32 bit).
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++) // write the working copy of the states back to pState.
+    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
+#ifndef _MSC_VER
+#ifdef __llvm__
+__attribute__((target("avx512bw")))
+#else
+__attribute__((target("avx512f", "avx512bw")))
+#endif
+#endif
+static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+{
+  if constexpr (!WriteAligned64) // runtime dispatch: re-enter with streaming stores if `pOutData` is 64 byte aligned.
+    if ((reinterpret_cast<size_t>(pOutData) & (sizeof(__m512i) - 1)) == 0) // all six low bits must be clear: `_mm512_stream_si512` requires 64 byte alignment.
+      return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex);
+
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+  static_assert(TotalSymbolCountBits < 16);
+
+  typedef __m512i simd_t;
+  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
+
+  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
+    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
+
+  size_t i = startIndex;
+
+  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1);
+  const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1);
+  const simd_t lower8 = _mm512_set1_epi32(0xFF);
+  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16);
+  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0);
+  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
+
+  for (; i < endIndex; i += StateCount)
+  {
+    // const uint32_t slot = state & (TotalSymbolCount - 1);
+    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
+    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
+    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
+    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
+
+    // retrieve pack.
+    const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+    const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
+
+    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
+    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
+    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
+    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
+    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
+
+    // unpack symbol.
+    const simd_t symbol0 = _mm512_and_si512(pack0, lower8);
+    const simd_t symbol1 = _mm512_and_si512(pack1, lower8);
+    const simd_t symbol2 = _mm512_and_si512(pack2, lower8);
+    const simd_t symbol3 = _mm512_and_si512(pack3, lower8);
+
+    // pack symbols to one si512.
+    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
+    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
+    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // `packus` works per 128 bit lane: each lane holds 4 symbols of vector 0, then 1, 2, 3.
+    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // reorder the dwords for the output; we could get rid of this if we'd choose to reorder everything fittingly.
+
+    // unpack freq, cumul.
+    const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12);
+    const simd_t freq0 = _mm512_srli_epi32(pack0, 20);
+    const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12);
+    const simd_t freq1 = _mm512_srli_epi32(pack1, 20);
+    const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12);
+    const simd_t freq2 = _mm512_srli_epi32(pack2, 20);
+    const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12);
+    const simd_t freq3 = _mm512_srli_epi32(pack3, 20);
+
+    // We intentionally encoded in a way to not have to do horrible things here.
+    if constexpr (WriteAligned64)
+      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+    else
+      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
+
+    // const uint32_t freqScaled = shiftedState * freq;
+    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
+    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
+    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
+    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
+
+    // state = freqScaled + slot - cumul;
+    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
+    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
+    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
+    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
+
+    // now to the messy part: states below `DecodeConsumePoint16` consume one 16 bit word each from the read head.
+    {
+      // read input for blocks 0.
+      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
+      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
+      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
+      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
+      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
+
+      if constexpr (ShuffleMask16)
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
+        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
+        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
+        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
+        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place.
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place.
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+      else
+      {
+        // get masks of those compares & start loading shuffle masks.
+        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
+        const uint32_t cmpMask0b = cmpMask0 >> 8;
+        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
+        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
+
+        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
+        const uint32_t cmpMask1b = cmpMask1 >> 8;
+        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
+        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
+
+        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
+        const uint32_t cmpMask2b = cmpMask2 >> 8;
+        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
+        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
+
+        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
+        const uint32_t cmpMask3b = cmpMask3 >> 8;
+        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
+        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
+
+        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
+        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
+        pState->pReadHead += maskPop0a;
+
+        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
+        pState->pReadHead += maskPop0b;
+
+        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
+        pState->pReadHead += maskPop1a;
+
+        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
+        pState->pReadHead += maskPop1b;
+
+        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
+        pState->pReadHead += maskPop2a;
+
+        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
+        pState->pReadHead += maskPop2b;
+
+        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
+        pState->pReadHead += maskPop3a;
+
+        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
+
+        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
+        pState->pReadHead += maskPop3b;
+
+        // finalize lookups.
+        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
+        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
+
+        // matching: state << 16
+        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
+        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
+        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
+        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
+
+        if constexpr (YmmShuffle)
+        {
+          // shuffle new words in place.
+          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
+          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
+          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
+          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+        else
+        {
+          // shuffle new words in place.
+          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
+          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
+          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
+          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
+          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
+          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
+          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
+          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
+
+          // expand new word.
+          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
+          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
+          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
+          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
+
+          // state = state << 16 | newWord;
+          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
+          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
+          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
+          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
+        }
+      }
+    }
+  }
+
+  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
+    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
+
+  return i;
+}
+
+template <uint32_t TotalSymbolCountBits> // AVX2 `varA` decoder on `hist_dec2_t`, large cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); // `true`: presumably `ShuffleMask16` (TODO confirm against `_varA`).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX2 `varA` decoder on `hist_dec2_t`, small cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); // `false`: presumably `ShuffleMask16` (TODO confirm against `_varA`).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX2 `varC` decoder on `hist_dec_pack_t`, large cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex); // `true`: presumably `ShuffleMask16` (TODO confirm against the AVX2 `_varC`).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX2 `varC` decoder on `hist_dec_pack_t`, small cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex); // `false`: presumably `ShuffleMask16` (TODO confirm against the AVX2 `_varC`).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX-512 `varA` decoder on `hist_dec2_t`, large cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex); // presumably `ShuffleMask16 = true, YmmShuffle = true` as in the AVX-512 `_varC` (TODO confirm).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX-512 `varA` decoder on `hist_dec2_t`, small cache flavour.
+struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded to the implementation.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex); // presumably `ShuffleMask16 = false, YmmShuffle = true` as in the AVX-512 `_varC` (TODO confirm).
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX-512 `varC` decoder on `hist_dec_pack_t`, using the 16 byte per entry `_DoubleShuffleLutShfl32`.
+struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded; the implementation checks the alignment of `pOutData` itself.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex); // `ShuffleMask16 = true`, `YmmShuffle = true`.
+  }
+};
+
+template <uint32_t TotalSymbolCountBits> // AVX-512 `varC` decoder on `hist_dec_pack_t`, using the 8 byte per entry `_ShuffleLutShfl32`.
+struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
+{
+  template <bool WriteAligned = false> // `WriteAligned` is not forwarded; the implementation checks the alignment of `pOutData` itself.
+  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
+  {
+    return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex); // `ShuffleMask16 = false`, `YmmShuffle = true`.
+  }
+};
+
+#endif
diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index dc4b341..ae2bd55 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -2,18 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec32.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 32; // Needs to be a power of two.
-constexpr bool DecodeNoBranch = false;
-
-//////////////////////////////////////////////////////////////////////////
-
-static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
-static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
-
 //////////////////////////////////////////////////////////////////////////
 
 extern const uint8_t _ShuffleLutShfl32[256 * 8];
@@ -22,623 +15,6 @@ extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
 
 //////////////////////////////////////////////////////////////////////////
 
-template <typename hist_type>
-struct _rans_decode_state32_t
-{
-#ifdef _MSC_VER
-  __declspec(align(32))
-#else
-  __attribute__((aligned(32)))
-#endif
-  uint32_t states[StateCount];
-
-  hist_type hist;
-  const uint16_t *pReadHead;
-};
-
-enum rans32x32_decoder_type_t
-{
-  r32x32_dt_scalar,
-  r32x32_dt_avx2_large_cache_15_to_13,
-  r32x32_dt_avx2_small_cache_15_to_13,
-  r32x32_dt_avx2_large_cache_12_to_10,
-  r32x32_dt_avx2_small_cache_12_to_10,
-};
-
-template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
-struct rans32x32_16w_decoder
-{
-  static size_t decode_section(_rans_decode_state32_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
-{
-  static size_t decode_section(_rans_decode_state32_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-
-    size_t i = startIndex;
-
-    for (; i < endIndex; i += StateCount)
-    {
-      for (size_t j = 0; j < StateCount; j++)
-      {
-        const uint8_t index = _Rans32x32_idx2idx[j];
-        uint32_t state = pState->states[j];
-
-        const uint32_t slot = state & (TotalSymbolCount - 1);
-        const uint8_t symbol = pState->hist.cumulInv[slot];
-        pOutData[i + index] = symbol;
-
-        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol];
-
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *pState->pReadHead;
-          state = read ? newState : state;
-          pState->pReadHead += (size_t)read;
-        }
-        else
-        {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *pState->pReadHead;
-            pState->pReadHead++;
-          }
-        }
-
-        pState->states[j] = state;
-      }
-    }
-
-    return i;
-  }
-};
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
-      return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-
-    // const uint8_t symbol = pHist->cumulInv[slot];
-    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
-    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
-    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
-    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
-
-    // since they were int32_t turn into uint8_t
-    symbol0 = _mm256_and_si256(symbol0, lower8);
-    symbol1 = _mm256_and_si256(symbol1, lower8);
-    symbol2 = _mm256_and_si256(symbol2, lower8);
-    symbol3 = _mm256_and_si256(symbol3, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
-
-    // freq, cumul.
-    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
-    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
-    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
-    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
-    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
-    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
-    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
-    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0, 1.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
-      return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  static_assert(TotalSymbolCountBits <= 12);
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot3, sizeof(uint32_t));
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    // unpack symbol.
-    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
-    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
-    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
-    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    // unpack freq, cumul.
-    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
-    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
-    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
-    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
-    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
-    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
-    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
-    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0, 1.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  (void)totalSymbolCountBits;
-
-  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
-
-  return inplace_make_hist_dec(pDecHist);
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec2_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec_pack_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
-//////////////////////////////////////////////////////////////////////////
-
 template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
 size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
 {
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index ec85a1d..3a25ed0 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -2,12 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec32.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 32; // Needs to be a power of two.
-constexpr bool EncodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
 
 constexpr size_t MinMinBlockSizeBits = 15;
@@ -56,79 +55,6 @@ size_t block_rANS32x32_16w_capacity(const size_t inputSize)
 
 //////////////////////////////////////////////////////////////////////////
 
-static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
-static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
-
-//////////////////////////////////////////////////////////////////////////
-
-struct _rans_encode_state32_t
-{
-  uint32_t states[StateCount];
-  hist_t hist;
-  uint16_t *pEnd, *pStart; // both compressed.
-};
-
-enum rans32x32_encoder_type_t
-{
-  r32x32_et_scalar,
-};
-
-template <rans32x32_encoder_type_t type>
-struct rans32x32_16w_encoder
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
-};
-
-template <>
-struct rans32x32_16w_encoder<r32x32_et_scalar>
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state32_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
-  {
-    int64_t targetCmp = targetIndex + StateCount;
-
-    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
-
-    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
-    {
-      for (int64_t j = StateCount - 1; j >= 0; j--)
-      {
-        const uint8_t index = _Rans32x32_idx2idx[j];
-
-        const uint8_t in = pInData[i - StateCount + index];
-        const uint32_t symbolCount = pState->hist.symbolCount[in];
-        const uint32_t max = EncodeEmitPoint * symbolCount;
-
-        const size_t stateIndex = j;
-
-        uint32_t state = pState->states[stateIndex];
-
-        if constexpr (EncodeNoBranch)
-        {
-          const bool write = state >= max;
-          *pState->pStart = (uint16_t)(state & 0xFFFF);
-          *pState->pStart -= (size_t)write;
-          state = write ? state >> 16 : state;
-        }
-        else
-        {
-          if (state >= max)
-          {
-            *pState->pStart = (uint16_t)(state & 0xFFFF);
-            pState->pStart--;
-            state >>= 16;
-          }
-        }
-
-        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
-      }
-    }
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
 template <uint32_t TotalSymbolCountBits>
 static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
 {
diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp
index c47f81a..c4e1788 100644
--- a/src/block_rANS32x64_16w_decode.cpp
+++ b/src/block_rANS32x64_16w_decode.cpp
@@ -2,1689 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec64.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 64; // Needs to be a power of two.
-constexpr bool DecodeNoBranch = false;
-
-//////////////////////////////////////////////////////////////////////////
-
-static const uint8_t _Rans32x64_idx2idx[] =
-{
-  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
-  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
-  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
-};
-
-static_assert(sizeof(_Rans32x64_idx2idx) == StateCount);
-
-//////////////////////////////////////////////////////////////////////////
-
-extern const uint8_t _ShuffleLutShfl32[256 * 8];
-extern const uint8_t _ShuffleLutPerm32[256 * 8];
-extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
-
-//////////////////////////////////////////////////////////////////////////
-
-template <typename hist_type>
-struct _rans_decode_state64_t
-{
-#ifdef _MSC_VER
-  __declspec(align(64))
-#else
-  __attribute__((aligned(64)))
-#endif
-  uint32_t states[StateCount];
-
-  hist_type hist;
-  const uint16_t *pReadHead;
-};
-
-enum rans32x64_decoder_type_t
-{
-  r32x64_dt_scalar,
-  r32x64_dt_avx2_large_cache_15_to_13,
-  r32x64_dt_avx2_small_cache_15_to_13,
-  r32x64_dt_avx2_large_cache_12_to_10,
-  r32x64_dt_avx2_small_cache_12_to_10,
-  r32x64_dt_avx512_large_cache_15_to_13,
-  r32x64_dt_avx512_small_cache_15_to_13,
-  r32x64_dt_avx512_large_cache_12_to_10,
-  r32x64_dt_avx512_small_cache_12_to_10,
-};
-
-template <rans32x64_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
-struct rans32x64_16w_decoder
-{
-  static size_t decode_section(_rans_decode_state64_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
-{
-  static size_t decode_section(_rans_decode_state64_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-
-    size_t i = startIndex;
-
-    for (; i < endIndex; i += StateCount)
-    {
-      for (size_t j = 0; j < StateCount; j++)
-      {
-        const uint8_t index = _Rans32x64_idx2idx[j];
-        uint32_t state = pState->states[j];
-
-        const uint32_t slot = state & (TotalSymbolCount - 1);
-        const uint8_t symbol = pState->hist.cumulInv[slot];
-        pOutData[i + index] = symbol;
-
-        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol];
-
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *pState->pReadHead;
-          state = read ? newState : state;
-          pState->pReadHead += (size_t)read;
-        }
-        else
-        {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *pState->pReadHead;
-            pState->pReadHead++;
-          }
-        }
-
-        pState->states[j] = state;
-      }
-    }
-
-    return i;
-  }
-};
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0)
-      return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
-    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
-    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
-    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
-
-    // const uint8_t symbol = pHist->cumulInv[slot];
-    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
-    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
-    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
-    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
-    simd_t symbol4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot4, sizeof(uint8_t));
-    simd_t symbol5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot5, sizeof(uint8_t));
-    simd_t symbol6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot6, sizeof(uint8_t));
-    simd_t symbol7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot7, sizeof(uint8_t));
-
-    // since they were int32_t turn into uint8_t
-    symbol0 = _mm256_and_si256(symbol0, lower8);
-    symbol1 = _mm256_and_si256(symbol1, lower8);
-    symbol2 = _mm256_and_si256(symbol2, lower8);
-    symbol3 = _mm256_and_si256(symbol3, lower8);
-    symbol4 = _mm256_and_si256(symbol4, lower8);
-    symbol5 = _mm256_and_si256(symbol5, lower8);
-    symbol6 = _mm256_and_si256(symbol6, lower8);
-    symbol7 = _mm256_and_si256(symbol7, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
-    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
-    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
-
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
-    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol4, sizeof(uint32_t));
-    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol5, sizeof(uint32_t));
-    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol6, sizeof(uint32_t));
-    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol7, sizeof(uint32_t));
-
-    // freq, cumul.
-    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
-    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
-    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
-    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
-    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
-    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
-    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
-    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
-    const simd_t cumul4 = _mm256_srli_epi32(pack4, 16);
-    const simd_t freq4 = _mm256_and_si256(pack4, lower16);
-    const simd_t cumul5 = _mm256_srli_epi32(pack5, 16);
-    const simd_t freq5 = _mm256_and_si256(pack5, lower16);
-    const simd_t cumul6 = _mm256_srli_epi32(pack6, 16);
-    const simd_t freq6 = _mm256_and_si256(pack6, lower16);
-    const simd_t cumul7 = _mm256_srli_epi32(pack7, 16);
-    const simd_t freq7 = _mm256_and_si256(pack7, lower16);
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
-    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
-    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
-    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
-    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
-    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
-    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
-    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
-    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
-    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
-      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
-      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
-      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
-        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
-
-        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
-        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
-
-        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
-        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
-
-        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
-        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
-        pState->pReadHead += maskPop4;
-
-        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
-        pState->pReadHead += maskPop5;
-
-        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
-        pState->pReadHead += maskPop6;
-
-        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
-        pState->pReadHead += maskPop7;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
-        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
-        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
-        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
-        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
-        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
-        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
-        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
-        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
-        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
-        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
-        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
-        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
-        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
-
-        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
-        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
-
-        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
-        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
-
-        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
-        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
-        pState->pReadHead += maskPop4;
-
-        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
-        pState->pReadHead += maskPop5;
-
-        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
-        pState->pReadHead += maskPop6;
-
-        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
-        pState->pReadHead += maskPop7;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
-        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
-        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
-        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
-        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
-        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
-        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
-        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
-        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
-        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
-        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
-        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
-        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
-        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
-        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
-        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (32 - 1)) == 0)
-      return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-    const simd_t slot4 = _mm256_and_si256(statesX8[4], symCountMask);
-    const simd_t slot5 = _mm256_and_si256(statesX8[5], symCountMask);
-    const simd_t slot6 = _mm256_and_si256(statesX8[6], symCountMask);
-    const simd_t slot7 = _mm256_and_si256(statesX8[7], symCountMask);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot3, sizeof(uint32_t));
-    const simd_t pack4 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot4, sizeof(uint32_t));
-    const simd_t pack5 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot5, sizeof(uint32_t));
-    const simd_t pack6 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot6, sizeof(uint32_t));
-    const simd_t pack7 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(pState->hist.symbol), slot7, sizeof(uint32_t));
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-    const simd_t shiftedState4 = _mm256_srli_epi32(statesX8[4], TotalSymbolCountBits);
-    const simd_t shiftedState5 = _mm256_srli_epi32(statesX8[5], TotalSymbolCountBits);
-    const simd_t shiftedState6 = _mm256_srli_epi32(statesX8[6], TotalSymbolCountBits);
-    const simd_t shiftedState7 = _mm256_srli_epi32(statesX8[7], TotalSymbolCountBits);
-
-    // unpack symbol.
-    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
-    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
-    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
-    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
-    const simd_t symbol4 = _mm256_and_si256(pack4, lower8);
-    const simd_t symbol5 = _mm256_and_si256(pack5, lower8);
-    const simd_t symbol6 = _mm256_and_si256(pack6, lower8);
-    const simd_t symbol7 = _mm256_and_si256(pack7, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    const simd_t symPack45 = _mm256_packus_epi32(symbol4, symbol5);
-    const simd_t symPack67 = _mm256_packus_epi32(symbol6, symbol7);
-    const simd_t symPack4567 = _mm256_packus_epi16(symPack45, symPack67); // same weird order.
-
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i + StateCount / 2), symPack4567);
-
-    // unpack freq, cumul.
-    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
-    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
-    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
-    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
-    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
-    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
-    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
-    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
-    const simd_t cumul4 = _mm256_and_si256(_mm256_srli_epi32(pack4, 8), lower12);
-    const simd_t freq4 = _mm256_srli_epi32(pack4, 20);
-    const simd_t cumul5 = _mm256_and_si256(_mm256_srli_epi32(pack5, 8), lower12);
-    const simd_t freq5 = _mm256_srli_epi32(pack5, 20);
-    const simd_t cumul6 = _mm256_and_si256(_mm256_srli_epi32(pack6, 8), lower12);
-    const simd_t freq6 = _mm256_srli_epi32(pack6, 20);
-    const simd_t cumul7 = _mm256_and_si256(_mm256_srli_epi32(pack7, 8), lower12);
-    const simd_t freq7 = _mm256_srli_epi32(pack7, 20);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-    const __m256i freqScaled4 = _mm256_mullo_epi32(shiftedState4, freq4);
-    const __m256i freqScaled5 = _mm256_mullo_epi32(shiftedState5, freq5);
-    const __m256i freqScaled6 = _mm256_mullo_epi32(shiftedState6, freq6);
-    const __m256i freqScaled7 = _mm256_mullo_epi32(shiftedState7, freq7);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-    const simd_t state4 = _mm256_add_epi32(freqScaled4, _mm256_sub_epi32(slot4, cumul4));
-    const simd_t state5 = _mm256_add_epi32(freqScaled5, _mm256_sub_epi32(slot5, cumul5));
-    const simd_t state6 = _mm256_add_epi32(freqScaled6, _mm256_sub_epi32(slot6, cumul6));
-    const simd_t state7 = _mm256_add_epi32(freqScaled7, _mm256_sub_epi32(slot7, cumul7));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-      const simd_t cmp4 = _mm256_cmpgt_epi32(decodeConsumePoint, state4);
-      const simd_t cmp5 = _mm256_cmpgt_epi32(decodeConsumePoint, state5);
-      const simd_t cmp6 = _mm256_cmpgt_epi32(decodeConsumePoint, state6);
-      const simd_t cmp7 = _mm256_cmpgt_epi32(decodeConsumePoint, state7);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
-        __m128i lut4 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask4 << 4])); // `* 16`.
-
-        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
-        __m128i lut5 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask5 << 4])); // `* 16`.
-
-        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
-        __m128i lut6 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask6 << 4])); // `* 16`.
-
-        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
-        __m128i lut7 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask7 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
-        pState->pReadHead += maskPop4;
-
-        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
-        pState->pReadHead += maskPop5;
-
-        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
-        pState->pReadHead += maskPop6;
-
-        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
-        pState->pReadHead += maskPop7;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
-        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
-        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
-        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
-        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
-        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
-        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
-        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
-        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
-        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
-        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
-        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
-        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        const uint32_t cmpMask4 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp4));
-        __m128i lut4 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask4 << 3])); // `* 8`.
-
-        const uint32_t cmpMask5 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp5));
-        __m128i lut5 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask5 << 3])); // `* 8`.
-
-        const uint32_t cmpMask6 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp6));
-        __m128i lut6 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask6 << 3])); // `* 8`.
-
-        const uint32_t cmpMask7 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp7));
-        __m128i lut7 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask7 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        const __m128i newWords4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop4 = (uint32_t)__builtin_popcount(cmpMask4);
-        pState->pReadHead += maskPop4;
-
-        const __m128i newWords5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop5 = (uint32_t)__builtin_popcount(cmpMask5);
-        pState->pReadHead += maskPop5;
-
-        const __m128i newWords6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop6 = (uint32_t)__builtin_popcount(cmpMask6);
-        pState->pReadHead += maskPop6;
-
-        const __m128i newWords7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop7 = (uint32_t)__builtin_popcount(cmpMask7);
-        pState->pReadHead += maskPop7;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-        lut4 = _mm_or_si128(_mm_shuffle_epi8(lut4, shuffleDoubleMask), shuffleUpper16Bit);
-        lut5 = _mm_or_si128(_mm_shuffle_epi8(lut5, shuffleDoubleMask), shuffleUpper16Bit);
-        lut6 = _mm_or_si128(_mm_shuffle_epi8(lut6, shuffleDoubleMask), shuffleUpper16Bit);
-        lut7 = _mm_or_si128(_mm_shuffle_epi8(lut7, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-        const simd_t matchShiftedState4 = _mm256_sllv_epi32(state4, _mm256_and_si256(cmp4, _16));
-        const simd_t matchShiftedState5 = _mm256_sllv_epi32(state5, _mm256_and_si256(cmp5, _16));
-        const simd_t matchShiftedState6 = _mm256_sllv_epi32(state6, _mm256_and_si256(cmp6, _16));
-        const simd_t matchShiftedState7 = _mm256_sllv_epi32(state7, _mm256_and_si256(cmp7, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-        const __m128i newWordXmm4 = _mm_shuffle_epi8(newWords4, lut4);
-        const __m128i newWordXmm5 = _mm_shuffle_epi8(newWords5, lut5);
-        const __m128i newWordXmm6 = _mm_shuffle_epi8(newWords6, lut6);
-        const __m128i newWordXmm7 = _mm_shuffle_epi8(newWords7, lut7);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-        const __m256i newWord4 = _mm256_cvtepu16_epi32(newWordXmm4);
-        const __m256i newWord5 = _mm256_cvtepu16_epi32(newWordXmm5);
-        const __m256i newWord6 = _mm256_cvtepu16_epi32(newWordXmm6);
-        const __m256i newWord7 = _mm256_cvtepu16_epi32(newWordXmm7);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-        statesX8[4] = _mm256_or_si256(matchShiftedState4, newWord4);
-        statesX8[5] = _mm256_or_si256(matchShiftedState5, newWord5);
-        statesX8[6] = _mm256_or_si256(matchShiftedState6, newWord6);
-        statesX8[7] = _mm256_or_si256(matchShiftedState7, newWord7);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
-#ifndef _MSC_VER
-#ifdef __llvm__
-__attribute__((target("avx512bw")))
-#else
-__attribute__((target("avx512f", "avx512bw")))
-#endif
-#endif
-static size_t _block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned64)
-    if ((reinterpret_cast<size_t>(pOutData) & (64 - 1)) == 0)
-      return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m512i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower16 = _mm512_set1_epi32(0xFFFF);
-  const simd_t lower8 = _mm512_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16);
-  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
-
-    // retrieve pack.
-    simd_t symbol0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
-    simd_t symbol1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
-    simd_t symbol2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
-    simd_t symbol3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.cumulInv), sizeof(uint8_t));
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    symbol0 = _mm512_and_si512(symbol0, lower8);
-    symbol1 = _mm512_and_si512(symbol1, lower8);
-    symbol2 = _mm512_and_si512(symbol2, lower8);
-    symbol3 = _mm512_and_si512(symbol3, lower8);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm512_i32gather_epi32(symbol0, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
-    const simd_t pack1 = _mm512_i32gather_epi32(symbol1, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
-    const simd_t pack2 = _mm512_i32gather_epi32(symbol2, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
-    const simd_t pack3 = _mm512_i32gather_epi32(symbol3, reinterpret_cast<const int32_t *>(pState->hist.symbols), sizeof(uint32_t));
-
-    // pack symbols to one si512.
-    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now.
-    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly.
-
-    // freq, cumul.
-    const simd_t cumul0 = _mm512_srli_epi32(pack0, 16);
-    const simd_t freq0 = _mm512_and_si512(pack0, lower16);
-    const simd_t cumul1 = _mm512_srli_epi32(pack1, 16);
-    const simd_t freq1 = _mm512_and_si512(pack1, lower16);
-    const simd_t cumul2 = _mm512_srli_epi32(pack2, 16);
-    const simd_t freq2 = _mm512_and_si512(pack2, lower16);
-    const simd_t cumul3 = _mm512_srli_epi32(pack3, 16);
-    const simd_t freq3 = _mm512_and_si512(pack3, lower16);
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned64)
-      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
-    else
-      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
-    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
-    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
-    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0.
-      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
-      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
-      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
-      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
-        const uint32_t cmpMask0b = cmpMask0 >> 8;
-        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
-        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
-
-        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
-        const uint32_t cmpMask1b = cmpMask1 >> 8;
-        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
-        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
-
-        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
-        const uint32_t cmpMask2b = cmpMask2 >> 8;
-        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
-        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
-
-        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
-        const uint32_t cmpMask3b = cmpMask3 >> 8;
-        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
-        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
-        pState->pReadHead += maskPop0a;
-
-        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
-        pState->pReadHead += maskPop0b;
-
-        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
-        pState->pReadHead += maskPop1a;
-
-        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
-        pState->pReadHead += maskPop1b;
-
-        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
-        pState->pReadHead += maskPop2a;
-
-        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
-        pState->pReadHead += maskPop2b;
-
-        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
-        pState->pReadHead += maskPop3a;
-
-        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
-        pState->pReadHead += maskPop3b;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
-        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
-        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
-        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
-
-        if constexpr (YmmShuffle)
-        {
-          // shuffle new words in place.
-          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
-          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
-          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
-          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-        else
-        {
-          // shuffle new words in place.
-          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
-          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
-          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
-          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
-          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
-          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
-          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
-          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
-        const uint32_t cmpMask0b = cmpMask0 >> 8;
-        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
-        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
-
-        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
-        const uint32_t cmpMask1b = cmpMask1 >> 8;
-        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
-        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
-
-        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
-        const uint32_t cmpMask2b = cmpMask2 >> 8;
-        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
-        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
-
-        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
-        const uint32_t cmpMask3b = cmpMask3 >> 8;
-        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
-        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
-        pState->pReadHead += maskPop0a;
-
-        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
-        pState->pReadHead += maskPop0b;
-
-        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
-        pState->pReadHead += maskPop1a;
-
-        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
-        pState->pReadHead += maskPop1b;
-
-        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
-        pState->pReadHead += maskPop2a;
-
-        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
-        pState->pReadHead += maskPop2b;
-
-        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
-        pState->pReadHead += maskPop3a;
-
-        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
-        pState->pReadHead += maskPop3b;
-
-        // finalize lookups.
-        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
-        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
-        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
-        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
-
-        if constexpr (YmmShuffle)
-        {
-          // shuffle new words in place.
-          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
-          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
-          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
-          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-        else
-        {
-          // shuffle new words in place.
-          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
-          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
-          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
-          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
-          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
-          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
-          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
-          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
-#ifndef _MSC_VER
-#ifdef __llvm__
-__attribute__((target("avx512bw")))
-#else
-__attribute__((target("avx512f", "avx512bw")))
-#endif
-#endif
-static size_t _block_rans32x64_decode_section_avx512_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned64)
-    if ((reinterpret_cast<size_t>(pOutData) & (63 - 1)) == 0)
-      return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, ShuffleMask16, YmmShuffle, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m512i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm512_loadu_si512(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm512_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower12 = _mm512_set1_epi32((1 << 12) - 1);
-  const simd_t lower8 = _mm512_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm512_set1_epi32(DecodeConsumePoint16);
-  const simd_t symbolPermuteMask = _mm512_set_epi32(15, 7, 14, 6, 11, 3, 10, 2, 13, 5, 12, 4, 9, 1, 8, 0);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm512_and_si512(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm512_and_si512(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm512_and_si512(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm512_and_si512(statesX8[3], symCountMask);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm512_i32gather_epi32(slot0, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
-    const simd_t pack1 = _mm512_i32gather_epi32(slot1, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
-    const simd_t pack2 = _mm512_i32gather_epi32(slot2, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
-    const simd_t pack3 = _mm512_i32gather_epi32(slot3, reinterpret_cast<const int32_t *>(pState->hist.symbol), sizeof(uint32_t));
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm512_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm512_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm512_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm512_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    // unpack symbol.
-    const simd_t symbol0 = _mm512_and_si512(pack0, lower8);
-    const simd_t symbol1 = _mm512_and_si512(pack1, lower8);
-    const simd_t symbol2 = _mm512_and_si512(pack2, lower8);
-    const simd_t symbol3 = _mm512_and_si512(pack3, lower8);
-
-    // pack symbols to one si512.
-    const simd_t symPack01 = _mm512_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm512_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm512_packus_epi16(symPack01, symPack23); // only god knows how this is packed now.
-    const simd_t symPackCompat = _mm512_permutexvar_epi32(symbolPermuteMask, symPack0123); // we could get rid of this if we'd chose to reorder everything fittingly.
-
-    // unpack freq, cumul.
-    const simd_t cumul0 = _mm512_and_si512(_mm512_srli_epi32(pack0, 8), lower12);
-    const simd_t freq0 = _mm512_srli_epi32(pack0, 20);
-    const simd_t cumul1 = _mm512_and_si512(_mm512_srli_epi32(pack1, 8), lower12);
-    const simd_t freq1 = _mm512_srli_epi32(pack1, 20);
-    const simd_t cumul2 = _mm512_and_si512(_mm512_srli_epi32(pack2, 8), lower12);
-    const simd_t freq2 = _mm512_srli_epi32(pack2, 20);
-    const simd_t cumul3 = _mm512_and_si512(_mm512_srli_epi32(pack3, 8), lower12);
-    const simd_t freq3 = _mm512_srli_epi32(pack3, 20);
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned64)
-      _mm512_stream_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
-    else
-      _mm512_storeu_si512(reinterpret_cast<simd_t *>(pOutData + i), symPackCompat);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const simd_t freqScaled0 = _mm512_mullo_epi32(shiftedState0, freq0);
-    const simd_t freqScaled1 = _mm512_mullo_epi32(shiftedState1, freq1);
-    const simd_t freqScaled2 = _mm512_mullo_epi32(shiftedState2, freq2);
-    const simd_t freqScaled3 = _mm512_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm512_add_epi32(freqScaled0, _mm512_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm512_add_epi32(freqScaled1, _mm512_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm512_add_epi32(freqScaled2, _mm512_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm512_add_epi32(freqScaled3, _mm512_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0.
-      const __m128i newWords0a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const __mmask16 cmpMask0 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state0);
-      const __mmask16 cmpMask1 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state1);
-      const __mmask16 cmpMask2 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state2);
-      const __mmask16 cmpMask3 = _mm512_cmpgt_epi32_mask(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
-        const uint32_t cmpMask0b = cmpMask0 >> 8;
-        __m128i lut0a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0a << 4])); // `* 16`.
-        __m128i lut0b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0b << 4])); // `* 16`.
-
-        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
-        const uint32_t cmpMask1b = cmpMask1 >> 8;
-        __m128i lut1a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1a << 4])); // `* 16`.
-        __m128i lut1b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1b << 4])); // `* 16`.
-
-        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
-        const uint32_t cmpMask2b = cmpMask2 >> 8;
-        __m128i lut2a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2a << 4])); // `* 16`.
-        __m128i lut2b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2b << 4])); // `* 16`.
-
-        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
-        const uint32_t cmpMask3b = cmpMask3 >> 8;
-        __m128i lut3a = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3a << 4])); // `* 16`.
-        __m128i lut3b = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3b << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
-        pState->pReadHead += maskPop0a;
-
-        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
-        pState->pReadHead += maskPop0b;
-
-        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
-        pState->pReadHead += maskPop1a;
-
-        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
-        pState->pReadHead += maskPop1b;
-
-        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
-        pState->pReadHead += maskPop2a;
-
-        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
-        pState->pReadHead += maskPop2b;
-
-        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
-        pState->pReadHead += maskPop3a;
-
-        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
-        pState->pReadHead += maskPop3b;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
-        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
-        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
-        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
-
-        if constexpr (YmmShuffle)
-        {
-          // shuffle new words in place.
-          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
-          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
-          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
-          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-        else
-        {
-          // shuffle new words in place.
-          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
-          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
-          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
-          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
-          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
-          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
-          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
-          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0a = cmpMask0 & 0xFF;
-        const uint32_t cmpMask0b = cmpMask0 >> 8;
-        __m128i lut0a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0a << 3])); // `* 8`.
-        __m128i lut0b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0b << 3])); // `* 8`.
-
-        const uint32_t cmpMask1a = cmpMask1 & 0xFF;
-        const uint32_t cmpMask1b = cmpMask1 >> 8;
-        __m128i lut1a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1a << 3])); // `* 8`.
-        __m128i lut1b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1b << 3])); // `* 8`.
-
-        const uint32_t cmpMask2a = cmpMask2 & 0xFF;
-        const uint32_t cmpMask2b = cmpMask2 >> 8;
-        __m128i lut2a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2a << 3])); // `* 8`.
-        __m128i lut2b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2b << 3])); // `* 8`.
-
-        const uint32_t cmpMask3a = cmpMask3 & 0xFF;
-        const uint32_t cmpMask3b = cmpMask3 >> 8;
-        __m128i lut3a = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3a << 3])); // `* 8`.
-        __m128i lut3b = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3b << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3, 4, 5, 6, 7.
-        const uint32_t maskPop0a = (uint32_t)__builtin_popcount(cmpMask0a);
-        pState->pReadHead += maskPop0a;
-
-        const __m128i newWords0b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop0b = (uint32_t)__builtin_popcount(cmpMask0b);
-        pState->pReadHead += maskPop0b;
-
-        const __m128i newWords1a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1a = (uint32_t)__builtin_popcount(cmpMask1a);
-        pState->pReadHead += maskPop1a;
-
-        const __m128i newWords1b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1b = (uint32_t)__builtin_popcount(cmpMask1b);
-        pState->pReadHead += maskPop1b;
-
-        const __m128i newWords2a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2a = (uint32_t)__builtin_popcount(cmpMask2a);
-        pState->pReadHead += maskPop2a;
-
-        const __m128i newWords2b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2b = (uint32_t)__builtin_popcount(cmpMask2b);
-        pState->pReadHead += maskPop2b;
-
-        const __m128i newWords3a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3a = (uint32_t)__builtin_popcount(cmpMask3a);
-        pState->pReadHead += maskPop3a;
-
-        const __m128i newWords3b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3b = (uint32_t)__builtin_popcount(cmpMask3b);
-        pState->pReadHead += maskPop3b;
-
-        // finalize lookups.
-        lut0a = _mm_or_si128(_mm_shuffle_epi8(lut0a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut0b = _mm_or_si128(_mm_shuffle_epi8(lut0b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1a = _mm_or_si128(_mm_shuffle_epi8(lut1a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1b = _mm_or_si128(_mm_shuffle_epi8(lut1b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2a = _mm_or_si128(_mm_shuffle_epi8(lut2a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2b = _mm_or_si128(_mm_shuffle_epi8(lut2b, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3a = _mm_or_si128(_mm_shuffle_epi8(lut3a, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3b = _mm_or_si128(_mm_shuffle_epi8(lut3b, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm512_mask_slli_epi32(state0, cmpMask0, state0, 16);
-        const simd_t matchShiftedState1 = _mm512_mask_slli_epi32(state1, cmpMask1, state1, 16);
-        const simd_t matchShiftedState2 = _mm512_mask_slli_epi32(state2, cmpMask2, state2, 16);
-        const simd_t matchShiftedState3 = _mm512_mask_slli_epi32(state3, cmpMask3, state3, 16);
-
-        if constexpr (YmmShuffle)
-        {
-          // shuffle new words in place.
-          const __m256i newWordXmm0 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords0b, newWords0a), _mm256_set_m128i(lut0b, lut0a));
-          const __m256i newWordXmm1 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords1b, newWords1a), _mm256_set_m128i(lut1b, lut1a));
-          const __m256i newWordXmm2 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords2b, newWords2a), _mm256_set_m128i(lut2b, lut2a));
-          const __m256i newWordXmm3 = _mm256_shuffle_epi8(_mm256_set_m128i(newWords3b, newWords3a), _mm256_set_m128i(lut3b, lut3a));
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(newWordXmm0);
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(newWordXmm1);
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(newWordXmm2);
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(newWordXmm3);
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-        else
-        {
-          // shuffle new words in place.
-          const __m128i newWordXmm0a = _mm_shuffle_epi8(newWords0a, lut0a);
-          const __m128i newWordXmm0b = _mm_shuffle_epi8(newWords0b, lut0b);
-          const __m128i newWordXmm1a = _mm_shuffle_epi8(newWords1a, lut1a);
-          const __m128i newWordXmm1b = _mm_shuffle_epi8(newWords1b, lut1b);
-          const __m128i newWordXmm2a = _mm_shuffle_epi8(newWords2a, lut2a);
-          const __m128i newWordXmm2b = _mm_shuffle_epi8(newWords2b, lut2b);
-          const __m128i newWordXmm3a = _mm_shuffle_epi8(newWords3a, lut3a);
-          const __m128i newWordXmm3b = _mm_shuffle_epi8(newWords3b, lut3b);
-
-          // expand new word.
-          const simd_t newWord0 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm0b, newWordXmm0a));
-          const simd_t newWord1 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm1b, newWordXmm1a));
-          const simd_t newWord2 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm2b, newWordXmm2a));
-          const simd_t newWord3 = _mm512_cvtepu16_epi32(_mm256_set_m128i(newWordXmm3b, newWordXmm3a));
-
-          // state = state << 16 | newWord;
-          statesX8[0] = _mm512_or_si512(matchShiftedState0, newWord0);
-          statesX8[1] = _mm512_or_si512(matchShiftedState1, newWord1);
-          statesX8[2] = _mm512_or_si512(matchShiftedState2, newWord2);
-          statesX8[3] = _mm512_or_si512(matchShiftedState3, newWord3);
-        }
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm512_storeu_si512(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx512_varA<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx512_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, true, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _block_rans32x64_decode_section_avx512_varC<TotalSymbolCountBits, false, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  (void)totalSymbolCountBits;
-
-  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
-
-  return inplace_make_hist_dec(pDecHist);
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec2_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec_pack_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits, rans32x64_decoder_type_t Impl, typename hist_type>
@@ -1777,20 +99,10 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength,
 
         state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
 
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *decodeState.pReadHead;
-          state = read ? newState : state;
-          decodeState.pReadHead += (size_t)read;
-        }
-        else
+        if (state < DecodeConsumePoint16)
         {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *decodeState.pReadHead;
-            decodeState.pReadHead++;
-          }
+          state = state << 16 | *decodeState.pReadHead;
+          decodeState.pReadHead++;
         }
 
         decodeState.states[j] = state;
diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp
index 7b77a81..e17469e 100644
--- a/src/block_rANS32x64_16w_encode.cpp
+++ b/src/block_rANS32x64_16w_encode.cpp
@@ -2,12 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec64.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 64; // Needs to be a power of two.
-constexpr bool EncodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
 
 constexpr size_t MinMinBlockSizeBits = 15;
@@ -56,86 +55,6 @@ size_t block_rANS32x64_16w_capacity(const size_t inputSize)
 
 //////////////////////////////////////////////////////////////////////////
 
-static const uint8_t _Rans32x64_idx2idx[] =
-{
-  0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-  0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F,
-  0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33, 0x24, 0x25, 0x26, 0x27, 0x34, 0x35, 0x36, 0x37,
-  0x28, 0x29, 0x2A, 0x2B, 0x38, 0x39, 0x3A, 0x3B, 0x2C, 0x2D, 0x2E, 0x2F, 0x3C, 0x3D, 0x3E, 0x3F,
-};
-
-static_assert(sizeof(_Rans32x64_idx2idx) == StateCount);
-
-//////////////////////////////////////////////////////////////////////////
-
-struct _rans_encode_state64_t
-{
-  uint32_t states[StateCount];
-  hist_t hist;
-  uint16_t *pEnd, *pStart; // both compressed.
-};
-
-enum rans32x64_encoder_type_t
-{
-  r32x64_et_scalar,
-};
-
-template <rans32x64_encoder_type_t type>
-struct rans32x64_16w_encoder
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
-};
-
-template <>
-struct rans32x64_16w_encoder<r32x64_et_scalar>
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state64_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
-  {
-    int64_t targetCmp = targetIndex + StateCount;
-
-    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
-
-    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
-    {
-      for (int64_t j = StateCount - 1; j >= 0; j--)
-      {
-        const uint8_t index = _Rans32x64_idx2idx[j];
-
-        const uint8_t in = pInData[i - StateCount + index];
-        const uint32_t symbolCount = pState->hist.symbolCount[in];
-        const uint32_t max = EncodeEmitPoint * symbolCount;
-
-        const size_t stateIndex = j;
-
-        uint32_t state = pState->states[stateIndex];
-
-        if constexpr (EncodeNoBranch)
-        {
-          const bool write = state >= max;
-          *pState->pStart = (uint16_t)(state & 0xFFFF);
-          *pState->pStart -= (size_t)write;
-          state = write ? state >> 16 : state;
-        }
-        else
-        {
-          if (state >= max)
-          {
-            *pState->pStart = (uint16_t)(state & 0xFFFF);
-            pState->pStart--;
-            state >>= 16;
-          }
-        }
-
-        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
-      }
-    }
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
 template <uint32_t TotalSymbolCountBits>
 static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
 {
diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp
index 07d4741..503a9e6 100644
--- a/src/mt_rANS32x32_16w_decode.cpp
+++ b/src/mt_rANS32x32_16w_decode.cpp
@@ -2,641 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec32.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 32; // Needs to be a power of two.
-constexpr bool DecodeNoBranch = false;
-
-//////////////////////////////////////////////////////////////////////////
-
-static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
-static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
-
-//////////////////////////////////////////////////////////////////////////
-
-extern const uint8_t _ShuffleLutShfl32[256 * 8];
-extern const uint8_t _ShuffleLutPerm32[256 * 8];
-extern const uint8_t _DoubleShuffleLutShfl32[256 * 8 * 2];
-
-//////////////////////////////////////////////////////////////////////////
-
-template <typename hist_type>
-struct _rans_decode_state32mt_t
-{
-#ifdef _MSC_VER
-  __declspec(align(32))
-#else
-  __attribute__((aligned(32)))
-#endif
-  uint32_t states[StateCount];
-
-  hist_type hist;
-  const uint16_t *pReadHead;
-};
-
-enum rans32x32_decoder_type_t
-{
-  r32x32_dt_scalar,
-  r32x32_dt_avx2_large_cache_15_to_13,
-  r32x32_dt_avx2_small_cache_15_to_13,
-  r32x32_dt_avx2_large_cache_12_to_10,
-  r32x32_dt_avx2_small_cache_12_to_10,
-};
-
-template <rans32x32_decoder_type_t type, uint32_t TotalSymbolCountBits, typename hist_type>
-struct rans32x32_16w_decoder
-{
-  static size_t decode_section(_rans_decode_state32mt_t<hist_type> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex);
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<TotalSymbolCountBits>>
-{
-  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-
-    size_t i = startIndex;
-
-    for (; i < endIndex; i += StateCount)
-    {
-      for (size_t j = 0; j < StateCount; j++)
-      {
-        const uint8_t index = _Rans32x32_idx2idx[j];
-        uint32_t state = pState->states[j];
-
-        const uint32_t slot = state & (TotalSymbolCount - 1);
-        const uint8_t symbol = pState->hist.cumulInv[slot];
-        pOutData[i + index] = symbol;
-
-        state = (state >> TotalSymbolCountBits) * (uint32_t)pState->hist.symbolCount[symbol] + slot - (uint32_t)pState->hist.cumul[symbol];
-
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *pState->pReadHead;
-          state = read ? newState : state;
-          pState->pReadHead += (size_t)read;
-        }
-        else
-        {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *pState->pReadHead;
-            pState->pReadHead++;
-          }
-        }
-
-        pState->states[j] = state;
-      }
-    }
-
-    return i;
-  }
-};
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _mt_rans32x32_decode_section_avx2_varA(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
-      return _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-  static_assert(TotalSymbolCountBits < 16);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower16 = _mm256_set1_epi32(0xFFFF);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-
-    // const uint8_t symbol = pHist->cumulInv[slot];
-    simd_t symbol0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot0, sizeof(uint8_t));
-    simd_t symbol1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot1, sizeof(uint8_t));
-    simd_t symbol2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot2, sizeof(uint8_t));
-    simd_t symbol3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.cumulInv), slot3, sizeof(uint8_t));
-
-    // since they were int32_t turn into uint8_t
-    symbol0 = _mm256_and_si256(symbol0, lower8);
-    symbol1 = _mm256_and_si256(symbol1, lower8);
-    symbol2 = _mm256_and_si256(symbol2, lower8);
-    symbol3 = _mm256_and_si256(symbol3, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbols), symbol3, sizeof(uint32_t));
-
-    // freq, cumul.
-    const simd_t cumul0 = _mm256_srli_epi32(pack0, 16);
-    const simd_t freq0 = _mm256_and_si256(pack0, lower16);
-    const simd_t cumul1 = _mm256_srli_epi32(pack1, 16);
-    const simd_t freq1 = _mm256_and_si256(pack1, lower16);
-    const simd_t cumul2 = _mm256_srli_epi32(pack2, 16);
-    const simd_t freq2 = _mm256_and_si256(pack2, lower16);
-    const simd_t cumul3 = _mm256_srli_epi32(pack3, 16);
-    const simd_t freq3 = _mm256_and_si256(pack3, lower16);
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0, 1.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
-__attribute__((target("avx2")))
-#endif
-static size_t _mt_rans32x32_decode_section_avx2_varC(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-{
-  if constexpr (!WriteAligned32)
-    if ((reinterpret_cast<size_t>(pOutData) & (StateCount - 1)) == 0)
-      return _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, ShuffleMask16, true>(pState, pOutData, startIndex, endIndex);
-
-  static_assert(TotalSymbolCountBits <= 12);
-  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
-
-  typedef __m256i simd_t;
-  simd_t statesX8[StateCount / (sizeof(simd_t) / sizeof(uint32_t))];
-
-  for (size_t i = 0; i < sizeof(statesX8) / sizeof(simd_t); i++)
-    statesX8[i] = _mm256_load_si256(reinterpret_cast<const simd_t *>(reinterpret_cast<const uint8_t *>(pState->states) + i * sizeof(simd_t)));
-
-  size_t i = startIndex;
-
-  const simd_t symCountMask = _mm256_set1_epi32(TotalSymbolCount - 1);
-  const simd_t lower12 = _mm256_set1_epi32((1 << 12) - 1);
-  const simd_t lower8 = _mm256_set1_epi32(0xFF);
-  const simd_t decodeConsumePoint = _mm256_set1_epi32(DecodeConsumePoint16);
-  const simd_t _16 = _mm256_set1_epi32(16);
-  const __m128i shuffleDoubleMask = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
-  const __m128i shuffleUpper16Bit = _mm_set1_epi16(0x0100);
-
-  for (; i < endIndex; i += StateCount)
-  {
-    // const uint32_t slot = state & (TotalSymbolCount - 1);
-    const simd_t slot0 = _mm256_and_si256(statesX8[0], symCountMask);
-    const simd_t slot1 = _mm256_and_si256(statesX8[1], symCountMask);
-    const simd_t slot2 = _mm256_and_si256(statesX8[2], symCountMask);
-    const simd_t slot3 = _mm256_and_si256(statesX8[3], symCountMask);
-
-    // retrieve pack.
-    const simd_t pack0 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot0, sizeof(uint32_t));
-    const simd_t pack1 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot1, sizeof(uint32_t));
-    const simd_t pack2 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot2, sizeof(uint32_t));
-    const simd_t pack3 = _mm256_i32gather_epi32(reinterpret_cast<const int32_t *>(&pState->hist.symbol), slot3, sizeof(uint32_t));
-
-    // const uint32_t shiftedState = (state >> TotalSymbolCountBits);
-    const simd_t shiftedState0 = _mm256_srli_epi32(statesX8[0], TotalSymbolCountBits);
-    const simd_t shiftedState1 = _mm256_srli_epi32(statesX8[1], TotalSymbolCountBits);
-    const simd_t shiftedState2 = _mm256_srli_epi32(statesX8[2], TotalSymbolCountBits);
-    const simd_t shiftedState3 = _mm256_srli_epi32(statesX8[3], TotalSymbolCountBits);
-
-    // unpack symbol.
-    const simd_t symbol0 = _mm256_and_si256(pack0, lower8);
-    const simd_t symbol1 = _mm256_and_si256(pack1, lower8);
-    const simd_t symbol2 = _mm256_and_si256(pack2, lower8);
-    const simd_t symbol3 = _mm256_and_si256(pack3, lower8);
-
-    // pack symbols to one si256. (could possibly be `_mm256_cvtepi32_epi8` on avx-512f + avx-512-vl) (`_mm256_slli_epi32` + `_mm256_or_si256` packing is slower)
-    const simd_t symPack01 = _mm256_packus_epi32(symbol0, symbol1);
-    const simd_t symPack23 = _mm256_packus_epi32(symbol2, symbol3);
-    const simd_t symPack0123 = _mm256_packus_epi16(symPack01, symPack23); // `00 01 02 03 08 09 0A 0B 10 11 12 13 18 19 1A 1B 04 05 06 07 0C 0D 0E 0F 14 15 16 17 1C 1D 1E 1F`
-
-    // We intentionally encoded in a way to not have to do horrible things here.
-    if constexpr (WriteAligned32)
-      _mm256_stream_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-    else
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(pOutData + i), symPack0123);
-
-    // unpack freq, cumul.
-    const simd_t cumul0 = _mm256_and_si256(_mm256_srli_epi32(pack0, 8), lower12);
-    const simd_t freq0 = _mm256_srli_epi32(pack0, 20);
-    const simd_t cumul1 = _mm256_and_si256(_mm256_srli_epi32(pack1, 8), lower12);
-    const simd_t freq1 = _mm256_srli_epi32(pack1, 20);
-    const simd_t cumul2 = _mm256_and_si256(_mm256_srli_epi32(pack2, 8), lower12);
-    const simd_t freq2 = _mm256_srli_epi32(pack2, 20);
-    const simd_t cumul3 = _mm256_and_si256(_mm256_srli_epi32(pack3, 8), lower12);
-    const simd_t freq3 = _mm256_srli_epi32(pack3, 20);
-
-    // const uint32_t freqScaled = shiftedState * freq;
-    const __m256i freqScaled0 = _mm256_mullo_epi32(shiftedState0, freq0);
-    const __m256i freqScaled1 = _mm256_mullo_epi32(shiftedState1, freq1);
-    const __m256i freqScaled2 = _mm256_mullo_epi32(shiftedState2, freq2);
-    const __m256i freqScaled3 = _mm256_mullo_epi32(shiftedState3, freq3);
-
-    // state = freqScaled + slot - cumul;
-    const simd_t state0 = _mm256_add_epi32(freqScaled0, _mm256_sub_epi32(slot0, cumul0));
-    const simd_t state1 = _mm256_add_epi32(freqScaled1, _mm256_sub_epi32(slot1, cumul1));
-    const simd_t state2 = _mm256_add_epi32(freqScaled2, _mm256_sub_epi32(slot2, cumul2));
-    const simd_t state3 = _mm256_add_epi32(freqScaled3, _mm256_sub_epi32(slot3, cumul3));
-
-    // now to the messy part...
-    {
-      // read input for blocks 0, 1.
-      const __m128i newWords0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-      // (state < DecodeConsumePoint16) ? -1 : 0 | well, actually (DecodeConsumePoint16 > state) ? -1 : 0
-      const simd_t cmp0 = _mm256_cmpgt_epi32(decodeConsumePoint, state0);
-      const simd_t cmp1 = _mm256_cmpgt_epi32(decodeConsumePoint, state1);
-      const simd_t cmp2 = _mm256_cmpgt_epi32(decodeConsumePoint, state2);
-      const simd_t cmp3 = _mm256_cmpgt_epi32(decodeConsumePoint, state3);
-
-      if constexpr (ShuffleMask16)
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask0 << 4])); // `* 16`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask1 << 4])); // `* 16`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask2 << 4])); // `* 16`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_load_si128(reinterpret_cast<const __m128i *>(&_DoubleShuffleLutShfl32[cmpMask3 << 4])); // `* 16`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-      else
-      {
-        // get masks of those compares & start loading shuffle masks.
-        const uint32_t cmpMask0 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp0));
-        __m128i lut0 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask0 << 3])); // `* 8`.
-
-        const uint32_t cmpMask1 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp1));
-        __m128i lut1 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask1 << 3])); // `* 8`.
-
-        const uint32_t cmpMask2 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp2));
-        __m128i lut2 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask2 << 3])); // `* 8`.
-
-        const uint32_t cmpMask3 = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(cmp3));
-        __m128i lut3 = _mm_lddqu_si128(reinterpret_cast<const __m128i *>(&_ShuffleLutShfl32[cmpMask3 << 3])); // `* 8`.
-
-        // advance read head & read input for blocks 1, 2, 3.
-        const uint32_t maskPop0 = (uint32_t)__builtin_popcount(cmpMask0);
-        pState->pReadHead += maskPop0;
-
-        const __m128i newWords1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop1 = (uint32_t)__builtin_popcount(cmpMask1);
-        pState->pReadHead += maskPop1;
-
-        const __m128i newWords2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop2 = (uint32_t)__builtin_popcount(cmpMask2);
-        pState->pReadHead += maskPop2;
-
-        const __m128i newWords3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pState->pReadHead));
-
-        const uint32_t maskPop3 = (uint32_t)__builtin_popcount(cmpMask3);
-        pState->pReadHead += maskPop3;
-
-        // finalize lookups.
-        lut0 = _mm_or_si128(_mm_shuffle_epi8(lut0, shuffleDoubleMask), shuffleUpper16Bit);
-        lut1 = _mm_or_si128(_mm_shuffle_epi8(lut1, shuffleDoubleMask), shuffleUpper16Bit);
-        lut2 = _mm_or_si128(_mm_shuffle_epi8(lut2, shuffleDoubleMask), shuffleUpper16Bit);
-        lut3 = _mm_or_si128(_mm_shuffle_epi8(lut3, shuffleDoubleMask), shuffleUpper16Bit);
-
-        // matching: state << 16
-        const simd_t matchShiftedState0 = _mm256_sllv_epi32(state0, _mm256_and_si256(cmp0, _16));
-        const simd_t matchShiftedState1 = _mm256_sllv_epi32(state1, _mm256_and_si256(cmp1, _16));
-        const simd_t matchShiftedState2 = _mm256_sllv_epi32(state2, _mm256_and_si256(cmp2, _16));
-        const simd_t matchShiftedState3 = _mm256_sllv_epi32(state3, _mm256_and_si256(cmp3, _16));
-
-        // shuffle new words in place.
-        const __m128i newWordXmm0 = _mm_shuffle_epi8(newWords0, lut0);
-        const __m128i newWordXmm1 = _mm_shuffle_epi8(newWords1, lut1);
-        const __m128i newWordXmm2 = _mm_shuffle_epi8(newWords2, lut2);
-        const __m128i newWordXmm3 = _mm_shuffle_epi8(newWords3, lut3);
-
-        // expand new word.
-        const __m256i newWord0 = _mm256_cvtepu16_epi32(newWordXmm0);
-        const __m256i newWord1 = _mm256_cvtepu16_epi32(newWordXmm1);
-        const __m256i newWord2 = _mm256_cvtepu16_epi32(newWordXmm2);
-        const __m256i newWord3 = _mm256_cvtepu16_epi32(newWordXmm3);
-
-        // state = state << 16 | newWord;
-        statesX8[0] = _mm256_or_si256(matchShiftedState0, newWord0);
-        statesX8[1] = _mm256_or_si256(matchShiftedState1, newWord1);
-        statesX8[2] = _mm256_or_si256(matchShiftedState2, newWord2);
-        statesX8[3] = _mm256_or_si256(matchShiftedState3, newWord3);
-      }
-    }
-  }
-
-  for (size_t j = 0; j < sizeof(statesX8) / sizeof(simd_t); j++)
-    _mm256_store_si256(reinterpret_cast<simd_t *>(reinterpret_cast<uint8_t *>(pState->states) + j * sizeof(simd_t)), statesX8[j]);
-
-  return i;
-}
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_15_to_13, TotalSymbolCountBits, hist_dec2_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32mt_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _mt_rans32x32_decode_section_avx2_varA<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_large_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, true>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-template <uint32_t TotalSymbolCountBits>
-struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCountBits, hist_dec_pack_t<TotalSymbolCountBits>>
-{
-  template <bool WriteAligned = false>
-  static size_t decode_section(_rans_decode_state32mt_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
-  {
-    return _mt_rans32x32_decode_section_avx2_varC<TotalSymbolCountBits, false>(pState, pOutData, startIndex, endIndex);
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  (void)totalSymbolCountBits;
-
-  memcpy(&pDecHist->symbolCount, &pIncompleteHist->symbolCount, sizeof(pIncompleteHist->symbolCount));
-
-  return inplace_make_hist_dec(pDecHist);
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec2_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec2_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
-template <uint32_t TotalSymbolCountBits>
-static bool _init_from_hist(hist_dec_pack_t<TotalSymbolCountBits> *pDecHist, hist_t *pIncompleteHist, const uint32_t totalSymbolCountBits)
-{
-  if (!inplace_complete_hist(pIncompleteHist, totalSymbolCountBits))
-    return false;
-
-  make_dec_pack_hist(pDecHist, pIncompleteHist);
-
-  return true;
-}
-
 //////////////////////////////////////////////////////////////////////////
 
 template <uint32_t TotalSymbolCountBits, rans32x32_decoder_type_t Impl, typename hist_type>
@@ -661,7 +31,7 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui
   if (inLength < expectedInputLength)
     return 0;
 
-  _rans_decode_state32mt_t<hist_type> decodeState;
+  _rans_decode_state32_t<hist_type> decodeState;
 
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
@@ -736,20 +106,10 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui
 
         state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
 
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *decodeState.pReadHead;
-          state = read ? newState : state;
-          decodeState.pReadHead += (size_t)read;
-        }
-        else
+        if (state < DecodeConsumePoint16)
         {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *decodeState.pReadHead;
-            decodeState.pReadHead++;
-          }
+          state = state << 16 | *decodeState.pReadHead;
+          decodeState.pReadHead++;
         }
 
         decodeState.states[j] = state;
@@ -784,7 +144,7 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
   if (inLength < expectedInputLength)
     return 0;
 
-  _rans_decode_state32mt_t<hist_type> decodeState;
+  _rans_decode_state32_t<hist_type> decodeState;
 
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
@@ -864,20 +224,10 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
 
         state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
 
-        if constexpr (DecodeNoBranch)
-        {
-          const bool read = state < DecodeConsumePoint16;
-          const uint32_t newState = state << 16 | *decodeState.pReadHead;
-          state = read ? newState : state;
-          decodeState.pReadHead += (size_t)read;
-        }
-        else
+        if (state < DecodeConsumePoint16)
         {
-          if (state < DecodeConsumePoint16)
-          {
-            state = state << 16 | *decodeState.pReadHead;
-            decodeState.pReadHead++;
-          }
+          state = state << 16 | *decodeState.pReadHead;
+          decodeState.pReadHead++;
         }
 
         decodeState.states[j] = state;
diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp
index 140a013..632fce0 100644
--- a/src/mt_rANS32x32_16w_encode.cpp
+++ b/src/mt_rANS32x32_16w_encode.cpp
@@ -2,12 +2,11 @@
 
 #include "hist.h"
 #include "simd_platform.h"
+#include "block_codec32.h"
 
 #include <string.h>
 #include <math.h>
 
-constexpr size_t StateCount = 32; // Needs to be a power of two.
-constexpr bool EncodeNoBranch = false;
 constexpr size_t SafeHistBitMax = 0;
 
 constexpr size_t MinMinBlockSizeBits = 15;
@@ -59,79 +58,6 @@ size_t mt_rANS32x32_16w_capacity(const size_t inputSize)
 
 //////////////////////////////////////////////////////////////////////////
 
-static const uint8_t _Rans32x32_idx2idx[] = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
-static_assert(sizeof(_Rans32x32_idx2idx) == StateCount);
-
-//////////////////////////////////////////////////////////////////////////
-
-struct _rans_encode_state32mt_t
-{
-  uint32_t states[StateCount];
-  hist_t hist;
-  uint16_t *pEnd, *pStart; // both compressed.
-};
-
-enum rans32x32_encoder_type_t
-{
-  r32x32_et_scalar,
-};
-
-template <rans32x32_encoder_type_t type>
-struct rans32x32_16w_encoder
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex);
-};
-
-template <>
-struct rans32x32_16w_encoder<r32x32_et_scalar>
-{
-  template <uint32_t TotalSymbolCountBits>
-  static void encode_section(_rans_encode_state32mt_t *pState, const uint8_t *pInData, const size_t startIndex, const size_t targetIndex)
-  {
-    int64_t targetCmp = targetIndex + StateCount;
-
-    constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
-
-    for (int64_t i = startIndex; i >= (int64_t)targetCmp; i -= StateCount)
-    {
-      for (int64_t j = StateCount - 1; j >= 0; j--)
-      {
-        const uint8_t index = _Rans32x32_idx2idx[j];
-
-        const uint8_t in = pInData[i - StateCount + index];
-        const uint32_t symbolCount = pState->hist.symbolCount[in];
-        const uint32_t max = EncodeEmitPoint * symbolCount;
-
-        const size_t stateIndex = j;
-
-        uint32_t state = pState->states[stateIndex];
-
-        if constexpr (EncodeNoBranch)
-        {
-          const bool write = state >= max;
-          *pState->pStart = (uint16_t)(state & 0xFFFF);
-          *pState->pStart -= (size_t)write;
-          state = write ? state >> 16 : state;
-        }
-        else
-        {
-          if (state >= max)
-          {
-            *pState->pStart = (uint16_t)(state & 0xFFFF);
-            pState->pStart--;
-            state >>= 16;
-          }
-        }
-
-        pState->states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)pState->hist.cumul[in] + (state % symbolCount);
-      }
-    }
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////
-
 template <uint32_t TotalSymbolCountBits>
 static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
 {
@@ -223,7 +149,7 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
   constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
   constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
 
-  _rans_encode_state32mt_t encodeState;
+  _rans_encode_state32_t encodeState;
   encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t));
   encodeState.pStart = encodeState.pEnd;
   

From 0da610c9d1ebdecc7ff0e62be40758bc196e119a Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 04:19:08 +0200
Subject: [PATCH 20/34] Adding 64 state variant

---
 src/main.cpp                    |  45 +++--
 src/mt_rANS32x64_16w.h          |  30 +++
 src/mt_rANS32x64_16w_decode.cpp | 337 ++++++++++++++++++++++++++++++++
 src/mt_rANS32x64_16w_encode.cpp | 316 ++++++++++++++++++++++++++++++
 4 files changed, 710 insertions(+), 18 deletions(-)
 create mode 100644 src/mt_rANS32x64_16w.h
 create mode 100644 src/mt_rANS32x64_16w_decode.cpp
 create mode 100644 src/mt_rANS32x64_16w_encode.cpp

diff --git a/src/main.cpp b/src/main.cpp
index 9cf1652..123d47d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,6 +13,7 @@
 #include "block_rANS32x32_16w.h"
 #include "block_rANS32x64_16w.h"
 #include "mt_rANS32x32_16w.h"
+#include "mt_rANS32x64_16w.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -164,26 +165,33 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe
 
 static codec_info_t _Codecs[] =
 {
-  // { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  // { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  // { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  // { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  // { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  // { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
-  // { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
-  // { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},
-  // { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}},
-  // { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}},
-  // { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}},
-  // { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_13, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_12, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_11, true }, {}}},
+  { "rANS32x64 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_10, true }, {}}},
   
-  { "rANS32x64 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_15>, true }, {}}},
-  { "rANS32x64 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_14>, true }, {}}},
-  { "rANS32x64 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_13>, true }, {}}},
-  { "rANS32x64 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_12>, true }, {}}},
-  { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_11>, true }, {}}},
-  { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_10>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_15>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_14>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_13>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_12>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_11>, true }, {}}},
+  { "rANS32x32 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper<mt_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode (single thread)", mt_rANS32x32_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x32_16w_decode_mt_10>, true }, {}}},
+  
+  { "rANS32x64 16w (independent blocks)", 15, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_15, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_15>, true }, {}}},
+  { "rANS32x64 16w (independent blocks)", 14, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_14, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_14>, true }, {}}},
+  { "rANS32x64 16w (independent blocks)", 13, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_13>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_13, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_13>, true }, {}}},
+  { "rANS32x64 16w (independent blocks)", 12, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_12>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_12, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_12>, true }, {}}},
+  { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_11>, true }, {}}},
+  { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_10>, true }, {}}},
   
   { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
@@ -422,6 +430,7 @@ int32_t main(const int32_t argc, char **pArgv)
     compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x32_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, block_rANS32x64_16w_capacity(fileSize));
     compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x32_16w_capacity(fileSize));
+    compressedDataCapacity = rans_max(compressedDataCapacity, mt_rANS32x64_16w_capacity(fileSize));
 
     pCompressedData = (uint8_t *)ALIGNED_ALLOC(64, compressedDataCapacity);
 
diff --git a/src/mt_rANS32x64_16w.h b/src/mt_rANS32x64_16w.h
new file mode 100644
index 0000000..e588cf3
--- /dev/null
+++ b/src/mt_rANS32x64_16w.h
@@ -0,0 +1,30 @@
+#ifndef mt_rANS32x64_16w_h__
+#define mt_rANS32x64_16w_h__
+
+#include "hist.h"
+#include "thread_pool.h" // presumably declares `thread_pool`, which the `_decode_mt_*` entry points below take by pointer.
+
+size_t mt_rANS32x64_16w_capacity(const size_t inputSize); // main.cpp folds this result (via rans_max) into the size of the compressed-data buffer it allocates for the input file.
+
+size_t mt_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity); // Encoders, one per suffix 15..10; main.cpp registers each next to the same number. NOTE(review): the suffix presumably selects TotalSymbolCountBits - confirm in the encode .cpp.
+size_t mt_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity);
+
+size_t mt_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity); // Decoders registered in main.cpp as "decode (single thread)", one per suffix 15..10.
+size_t mt_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+size_t mt_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity);
+
+size_t mt_rANS32x64_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool); // Decoders registered in main.cpp as "decode (multi threaded)" through decode_with_thread_pool_wrapper; same parameters plus a thread_pool pointer.
+size_t mt_rANS32x64_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x64_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x64_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x64_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+size_t mt_rANS32x64_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool);
+
+#endif // mt_rANS32x64_16w_h__
diff --git a/src/mt_rANS32x64_16w_decode.cpp b/src/mt_rANS32x64_16w_decode.cpp
new file mode 100644
index 0000000..fb207fa
--- /dev/null
+++ b/src/mt_rANS32x64_16w_decode.cpp
@@ -0,0 +1,337 @@
+#include "mt_rANS32x64_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+#include "block_codec64.h"
+
+#include <string.h>
+#include <math.h>
+
+//////////////////////////////////////////////////////////////////////////
+
+template <uint32_t TotalSymbolCountBits, rans32x64_decoder_type_t Impl, typename hist_type> // Decodes a stream of blocks sequentially on the calling thread; `Impl`/`hist_type` pick the rans32x64_16w_decoder specialization used per block.
+size_t mt_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity) // Returns the decoded length stored in the stream header, or 0 when any validation below fails.
+{
+  if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256) // minimum-size sanity check: two u64 fields + StateCount u32 states + 256 u16 symbol counts.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  size_t inputIndex = 0;
+  const uint64_t expectedOutputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // stream header, first u64: number of bytes to write to pOutData.
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  const uint64_t expectedInputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // stream header, second u64: only checked against inLength here.
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  _rans_decode_state64_t<hist_type> decodeState;
+
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+  const size_t outLengthInStates = expectedOutputLength - StateCount + 1; // NOTE(review): assuming StateCount is unsigned, this wraps around when expectedOutputLength < StateCount - 1 - confirm such streams cannot reach this point.
+  size_t i = 0; // output index decoded so far.
+  hist_t hist; // raw symbol counts of the most recently read block; reused by the scalar tail after the loop.
+
+  do
+  {
+    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead); // per-block header, first u64: added to i to obtain this block's end index.
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead); // per-block header, second u64: offset in uint16_t units, used just below.
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; // position the read head is moved to once this block has been decoded.
+    (void)readHeadBackOffset; // NOTE(review): this cast is redundant here - readHeadBackOffset is read on the previous line to compute pReadHeadAfter.
+
+    for (size_t j = 0; j < StateCount; j++) // load this block's StateCount 32 bit states.
+    {
+      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+    }
+
+    for (size_t j = 0; j < 256; j++) // load this block's 256 16 bit symbol counts.
+    {
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits)) // reject the stream if the decoder histogram cannot be derived from the counts.
+      return 0;
+
+    uint64_t blockEndInStates = (i + blockSize);
+
+    if (blockEndInStates > outLengthInStates) // a block reaching past the end is clamped to outLengthInStates.
+      blockEndInStates = outLengthInStates;
+    else if ((blockEndInStates & (StateCount - 1)) != 0) // otherwise the block end must have no bits set below StateCount (mask test presumes a power-of-two StateCount).
+      return 0;
+
+    i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates); // decode_section's return value becomes the new output index.
+
+    if (i > outLengthInStates)
+    {
+      if (i >= expectedOutputLength) // everything has been written already.
+        return expectedOutputLength;
+      else
+        break; // leave the remaining outputs to the scalar tail below, without moving the read head.
+    }
+
+    decodeState.pReadHead = pReadHeadAfter;
+
+  } while (i < outLengthInStates);
+
+  if (i < expectedOutputLength) // scalar tail: at most one symbol per state for the outputs that are still missing.
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return 0;
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      const uint8_t index = _Rans32x64_idx2idx[j]; // output offset (relative to i) that state j writes to.
+
+      if (i + index < expectedOutputLength) // skip states whose output position lies past the end.
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1); // low TotalSymbolCountBits bits of the state.
+        const uint8_t symbol = histDec.cumulInv[slot]; // symbol looked up from the slot.
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if (state < DecodeConsumePoint16) // state dropped below the threshold: shift in the next 16 bit word from the stream.
+        {
+          state = state << 16 | *decodeState.pReadHead;
+          decodeState.pReadHead++;
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Multi-threaded block decoder: every block except the final one is queued on `pThreadPool`; returns the decoded length, or 0 on failure.
+template <uint32_t TotalSymbolCountBits, rans32x64_decoder_type_t Impl, typename hist_type>
+size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  if (inLength < sizeof(uint64_t) * 2 + sizeof(uint32_t) * StateCount + sizeof(uint16_t) * 256)
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr uint32_t TotalSymbolCount = ((uint32_t)1 << TotalSymbolCountBits);
+
+  size_t inputIndex = 0;
+  const uint64_t expectedOutputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // first header field: decoded size.
+  inputIndex += sizeof(uint64_t);
+
+  if (expectedOutputLength > outCapacity)
+    return 0;
+
+  const uint64_t expectedInputLength = *reinterpret_cast<const uint64_t *>(pInData + inputIndex); // second header field: encoded size.
+  inputIndex += sizeof(uint64_t);
+
+  if (inLength < expectedInputLength)
+    return 0;
+
+  _rans_decode_state64_t<hist_type> decodeState;
+
+  decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
+  const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
+  size_t i = 0;
+  hist_t hist;
+  const auto fail = [pThreadPool]() -> size_t { thread_pool_await(pThreadPool); return 0; }; // queued tasks still use pInData/pOutData: wait for them before reporting failure.
+  do
+  {
+    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+
+    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1; // where the next block's header starts.
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+    }
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      hist.symbolCount[j] = *decodeState.pReadHead;
+      decodeState.pReadHead++;
+    }
+
+    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+      return fail();
+
+    uint64_t blockEndInStates = (i + blockSize);
+
+    if (blockEndInStates > outLengthInStates)
+      blockEndInStates = outLengthInStates;
+    else if ((blockEndInStates & (StateCount - 1)) != 0)
+      return fail();
+
+    if (i + blockSize > blockEndInStates) // the block was clamped, so it is the final one: decode it on this thread.
+    {
+      i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+      break;
+    }
+    else
+    {
+      thread_pool_add(pThreadPool, [=]() {
+        auto decState = decodeState; // captured by value; a mutable copy is needed inside the const call operator.
+        rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
+      });
+
+      i = blockEndInStates;
+      decodeState.pReadHead = pReadHeadAfter;
+    }
+
+  } while (i < outLengthInStates);
+
+  if (i < expectedOutputLength) // remaining symbols that do not fill a whole group of StateCount.
+  {
+    hist_dec_t<TotalSymbolCountBits> histDec;
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+
+    if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
+      return fail();
+
+    for (size_t j = 0; j < StateCount; j++)
+    {
+      const uint8_t index = _Rans32x64_idx2idx[j];
+
+      if (i + index < expectedOutputLength)
+      {
+        uint32_t state = decodeState.states[j];
+
+        const uint32_t slot = state & (TotalSymbolCount - 1);
+        const uint8_t symbol = histDec.cumulInv[slot];
+        pOutData[i + index] = symbol;
+
+        state = (state >> TotalSymbolCountBits) * (uint32_t)histDec.symbolCount[symbol] + slot - (uint32_t)histDec.cumul[symbol];
+
+        if (state < DecodeConsumePoint16)
+        {
+          state = state << 16 | *decodeState.pReadHead;
+          decodeState.pReadHead++;
+        }
+
+        decodeState.states[j] = state;
+      }
+    }
+  }
+
+  thread_pool_await(pThreadPool);
+
+  return expectedOutputLength;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Dispatches to the decoder implementation for this CPU; uses the thread-pool variant when `pThreadPool` is non-null.
+template <uint32_t TotalSymbolCountBits>
+static size_t mt_rANS32x64_decode_wrapper(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool = nullptr)
+{
+  _DetectCPUFeatures();
+
+  if (avx2Supported)
+  {
+    if constexpr (TotalSymbolCountBits >= 13) // 13 bits and above: the `15_to_13` AVX2 decoder with `hist_dec2_t`.
+    {
+      if (pThreadPool)
+        return mt_rANS32x64_16w_decode_mt<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+      else
+        return mt_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_15_to_13, hist_dec2_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+    }
+    else // below 13 bits: the `12_to_10` AVX2 decoder with `hist_dec_pack_t`.
+    {
+      if (pThreadPool)
+        return mt_rANS32x64_16w_decode_mt<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+      else
+        return mt_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_avx2_large_cache_12_to_10, hist_dec_pack_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+    }
+  }
+
+  // Fallback: scalar decoder when AVX2 is not reported as supported.
+  if (pThreadPool)
+    return mt_rANS32x64_16w_decode_mt<TotalSymbolCountBits, r32x64_dt_scalar, hist_dec_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+  else
+    return mt_rANS32x64_16w_decode<TotalSymbolCountBits, r32x64_dt_scalar, hist_dec_t<TotalSymbolCountBits>>(pInData, inLength, pOutData, outCapacity);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Decode entry points without a thread pool, one per TotalSymbolCountBits value (15 down to 10); each forwards to the wrapper.
+size_t mt_rANS32x64_16w_decode_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t mt_rANS32x64_16w_decode_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t mt_rANS32x64_16w_decode_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t mt_rANS32x64_16w_decode_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t mt_rANS32x64_16w_decode_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity);
+}
+
+size_t mt_rANS32x64_16w_decode_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity)
+{
+  return mt_rANS32x64_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Decode entry points that pass `pThreadPool` through to the wrapper, one per TotalSymbolCountBits value (15 down to 10).
+size_t mt_rANS32x64_16w_decode_mt_15(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<15>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+size_t mt_rANS32x64_16w_decode_mt_14(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<14>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+size_t mt_rANS32x64_16w_decode_mt_13(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<13>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+size_t mt_rANS32x64_16w_decode_mt_12(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<12>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+size_t mt_rANS32x64_16w_decode_mt_11(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<11>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
+
+size_t mt_rANS32x64_16w_decode_mt_10(const uint8_t *pInData, const size_t inLength, uint8_t *pOutData, const size_t outCapacity, thread_pool *pThreadPool)
+{
+  return mt_rANS32x64_decode_wrapper<10>(pInData, inLength, pOutData, outCapacity, pThreadPool);
+}
diff --git a/src/mt_rANS32x64_16w_encode.cpp b/src/mt_rANS32x64_16w_encode.cpp
new file mode 100644
index 0000000..cb659f2
--- /dev/null
+++ b/src/mt_rANS32x64_16w_encode.cpp
@@ -0,0 +1,316 @@
+#include "mt_rANS32x64_16w.h"
+
+#include "hist.h"
+#include "simd_platform.h"
+#include "block_codec64.h"
+
+#include <string.h>
+#include <math.h>
+
+constexpr size_t SafeHistBitMax = 0;
+
+constexpr size_t MinMinBlockSizeBits = 15;
+constexpr size_t MinMinBlockSize = (size_t)1 << MinMinBlockSizeBits;
+// Per-TotalSymbolCountBits multiplier from which _CanExtendHist derives its `histReplacePoint` threshold.
+template <size_t TotalSymbolCountBits>
+struct HistReplaceMul
+{
+  constexpr static size_t GetValue(); // declared only: usable solely through the specializations below (10..15).
+};
+
+template <> struct HistReplaceMul<15> { constexpr static size_t GetValue() { return 50; } };
+template <> struct HistReplaceMul<14> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<13> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<12> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<11> { constexpr static size_t GetValue() { return 500; } };
+template <> struct HistReplaceMul<10> { constexpr static size_t GetValue() { return 500; } };
+// Per-TotalSymbolCountBits log2 of the minimum block size; turned into a byte count by MinBlockSize().
+template <size_t TotalSymbolCountBits>
+struct MinBlockSizeBits
+{
+  constexpr static size_t GetValue(); // declared only: usable solely through the specializations below (10..15).
+};
+
+template <> struct MinBlockSizeBits<15> { constexpr static size_t GetValue() { return 16; } };
+template <> struct MinBlockSizeBits<14> { constexpr static size_t GetValue() { return 16; } };
+template <> struct MinBlockSizeBits<13> { constexpr static size_t GetValue() { return 16; } };
+template <> struct MinBlockSizeBits<12> { constexpr static size_t GetValue() { return 16; } };
+template <> struct MinBlockSizeBits<11> { constexpr static size_t GetValue() { return 16; } };
+template <> struct MinBlockSizeBits<10> { constexpr static size_t GetValue() { return 16; } };
+// Minimum block size in bytes for the given TotalSymbolCountBits: 1 << MinBlockSizeBits<...>::GetValue().
+template <uint32_t TotalSymbolCountBits>
+constexpr size_t MinBlockSize()
+{
+  return (size_t)1 << MinBlockSizeBits<TotalSymbolCountBits>::GetValue();
+}
+
+constexpr size_t MaxBlockSizeBits = 25;
+constexpr size_t MaxBlockSize = (size_t)1 << MaxBlockSizeBits;
+// Output buffer size that mt_rANS32x64_16w_encode requires for `inputSize` input bytes (it returns 0 for smaller buffers).
+size_t mt_rANS32x64_16w_capacity(const size_t inputSize)
+{
+  const size_t baseSize = 2 * sizeof(uint64_t) + 256 * sizeof(uint16_t) + inputSize + StateCount * sizeof(uint32_t);
+  const size_t blockCount = (inputSize + MinMinBlockSize) / MinMinBlockSize + 1; // counted in MinMinBlockSize units, rounded up generously.
+  const size_t perBlockExtraSize = sizeof(uint64_t) * 2 + 256 * sizeof(uint16_t) + StateCount * sizeof(uint32_t); // two uint64 fields, symbol counts, states.
+
+  return baseSize + blockCount * perBlockExtraSize; // I hope this covers all of our bases.
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Returns true if the chunk at `nextBlockStartOffset` should join the block that uses `*pOldHist`; overwrites `symCount` with the chunk's counts.
+template <uint32_t TotalSymbolCountBits>
+static bool _CanExtendHist(const uint8_t *pData, const size_t nextBlockStartOffset, const size_t nextBlockSize, hist_t *pOldHist, uint32_t symCount[256])
+{
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+
+  memset(symCount, 0, sizeof(uint32_t) * 256);
+  observe_hist(symCount, pData + nextBlockStartOffset, nextBlockSize);
+
+  // Do we include a symbol that hasn't been included before?
+  if constexpr (!IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+      if (symCount[j] > 0 && pOldHist->symbolCount[j] == 0)
+        return false;
+  }
+
+  hist_t newHist;
+
+  if constexpr (TotalSymbolCountBits == MinBlockSizeBits<TotalSymbolCountBits>::GetValue()) // compare bit counts: raw counts are only usable as-is when a chunk holds exactly 1 << TotalSymbolCountBits symbols.
+  {
+    for (size_t j = 0; j < 256; j++)
+      newHist.symbolCount[j] = (uint16_t)symCount[j];
+
+    size_t counter = 0;
+
+    for (size_t j = 0; j < 256; j++)
+    {
+      newHist.cumul[j] = (uint16_t)counter;
+      counter += newHist.symbolCount[j];
+    }
+  }
+  else
+  {
+    normalize_hist(&newHist, symCount, MinBlockSize<TotalSymbolCountBits>(), TotalSymbolCountBits);
+  }
+
+  constexpr size_t totalSymbolCount = (1 << TotalSymbolCountBits);
+  constexpr size_t histReplacePoint = (totalSymbolCount * HistReplaceMul<TotalSymbolCountBits>::GetValue()) >> 12;
+
+  // this comparison isn't fair or fast, but should be a good starting point hopefully.
+  float costBefore = 0;
+  float costAfter = (float)(sizeof(uint16_t) * 256 + StateCount * sizeof(uint32_t) + sizeof(uint64_t) * 2) * 0.5f; // let's assume that block will be able to share its histogram with someone else.
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = (symCount[j] - 1) * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+        continue;
+
+      const float before = symCount[j] * log2f(pOldHist->symbolCount[j] / (float)totalSymbolCount);
+      const float after = symCount[j] * log2f(newHist.symbolCount[j] / (float)totalSymbolCount);
+
+      costBefore -= before;
+      costAfter -= after;
+    }
+  }
+
+  const float diff = costBefore - costAfter; // estimated saving from giving the chunk its own histogram.
+
+  return (diff < histReplacePoint); // keep extending while that saving stays below the threshold.
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Encodes `pInData` back to front into blocks that each store their own histogram and states; returns the bytes written, or 0 on empty input / insufficient capacity.
+template <uint32_t TotalSymbolCountBits, rans32x64_encoder_type_t Impl>
+size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity)
+{
+  if (length == 0 || outCapacity < mt_rANS32x64_16w_capacity(length)) // `length - 1` below would wrap around for empty input.
+    return 0;
+
+  static_assert(TotalSymbolCountBits < 16);
+  constexpr size_t EncodeEmitPoint = ((DecodeConsumePoint16 >> TotalSymbolCountBits) << 16);
+
+  constexpr bool IsSafeHist = TotalSymbolCountBits >= SafeHistBitMax;
+  constexpr size_t MinBlockSizeX = MinBlockSize<TotalSymbolCountBits>();
+
+  _rans_encode_state64_t encodeState;
+  encodeState.pEnd = reinterpret_cast<uint16_t *>(pOutData + outCapacity - sizeof(uint16_t)); // output is written backwards from the end of the buffer.
+  encodeState.pStart = encodeState.pEnd;
+
+  size_t inputBlockTargetIndex = (((length - 1) & ~(size_t)(StateCount - 1)) & ~(size_t)(MinBlockSizeX - 1));
+
+  if (inputBlockTargetIndex > MinBlockSizeX)
+    inputBlockTargetIndex -= MinBlockSizeX;
+
+  uint16_t *pBlockEnd = encodeState.pEnd;
+  size_t blockBackPoint = length; // end (exclusive) of the block currently being built.
+
+  uint32_t symCount[256];
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+
+  size_t extraCount = 0;
+
+  if constexpr (IsSafeHist)
+  {
+    for (size_t j = 0; j < 256; j++)
+    {
+      if (symCount[j] == 0)
+      {
+        symCount[j] = 1;
+        extraCount++;
+      }
+    }
+  }
+
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+
+  while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+  {
+    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+      inputBlockTargetIndex -= MinBlockSizeX;
+    else
+      break;
+  }
+
+  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+  blockBackPoint = length;
+
+  // Init States.
+  for (size_t i = 0; i < StateCount; i++)
+    encodeState.states[i] = DecodeConsumePoint16;
+
+  int64_t inputIndex = length - 1;
+  inputIndex &= ~(size_t)(StateCount - 1);
+  inputIndex += StateCount;
+
+  for (int64_t j = StateCount - 1; j >= 0; j--) // trailing symbols that do not fill a whole group of StateCount.
+  {
+    const uint8_t index = _Rans32x64_idx2idx[j];
+
+    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    {
+      const uint8_t in = pInData[inputIndex - StateCount + index];
+      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+      const uint32_t max = EncodeEmitPoint * symbolCount;
+
+      const size_t stateIndex = j;
+
+      uint32_t state = encodeState.states[stateIndex];
+
+      if (state >= max)
+      {
+        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+        encodeState.pStart--;
+        state >>= 16;
+      }
+
+      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+    }
+  }
+
+  inputIndex -= StateCount;
+
+  while (true)
+  {
+    rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    inputIndex = inputBlockTargetIndex;
+
+    // Write hist & states.
+    {
+      const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
+
+      encodeState.pStart++;
+      encodeState.pStart -= 256;
+      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+      encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
+      memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+
+      const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1); // lets the decoder find the next block header.
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
+
+      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+
+      pBlockEnd = encodeState.pStart;
+      encodeState.pStart--;
+    }
+
+    if (inputIndex == 0)
+      break;
+
+    // Determine new histogram.
+    {
+      blockBackPoint = inputIndex; // the next block ends where the one just written starts; set first so size checks and the histogram cover only that block.
+      inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+
+      if constexpr (IsSafeHist)
+        for (size_t j = 0; j < 256; j++)
+          if (symCount[j] == 0)
+            symCount[j] = 1;
+
+      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+      while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+      {
+        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+          inputBlockTargetIndex -= MinBlockSizeX;
+        else
+          break;
+      }
+
+      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    }
+  }
+
+  uint8_t *pWrite = pOutData;
+  size_t outIndex = 0;
+
+  *reinterpret_cast<uint64_t *>(pWrite + outIndex) = (uint64_t)length;
+  outIndex += sizeof(uint64_t);
+
+  // compressed expected length.
+  outIndex += sizeof(uint64_t);
+
+  const size_t size = (encodeState.pEnd - encodeState.pStart) * sizeof(uint16_t);
+
+  memmove(pWrite + outIndex, encodeState.pStart + 1, size); // move the backwards-written payload up behind the header.
+  outIndex += size;
+
+  *reinterpret_cast<uint64_t *>(pOutData + sizeof(uint64_t)) = outIndex; // write total output length.
+
+  return outIndex;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Encode entry points, one per TotalSymbolCountBits value (15 down to 10); all instantiate the scalar encoder (`r32x64_et_scalar`).
+size_t mt_rANS32x64_16w_encode_15(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<15, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x64_16w_encode_14(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<14, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x64_16w_encode_13(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<13, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x64_16w_encode_12(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<12, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x64_16w_encode_11(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<11, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }
+size_t mt_rANS32x64_16w_encode_10(const uint8_t *pInData, const size_t length, uint8_t *pOutData, const size_t outCapacity) { return mt_rANS32x64_16w_encode<10, r32x64_et_scalar>(pInData, length, pOutData, outCapacity); }

From 2c96afcd87c5e819731610396058a7cf36e4921f Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 18:10:49 +0200
Subject: [PATCH 21/34] fixing weird single symbol hist issue for block_64

---
 src/block_codec64.h                |   1 +
 src/block_rANS32x64_16w_decode.cpp |  40 ++++---
 src/block_rANS32x64_16w_encode.cpp | 182 ++++++++++++++++++++---------
 src/main.cpp                       |  12 +-
 4 files changed, 161 insertions(+), 74 deletions(-)

diff --git a/src/block_codec64.h b/src/block_codec64.h
index 45f05b5..ac3c9bc 100644
--- a/src/block_codec64.h
+++ b/src/block_codec64.h
@@ -4,6 +4,7 @@
 #include "hist.h"
 
 #include <string.h>
+#include <stdio.h>
 
 constexpr size_t StateCount = 64; // Needs to be a power of two.
 
diff --git a/src/block_rANS32x64_16w_decode.cpp b/src/block_rANS32x64_16w_decode.cpp
index c4e1788..a909f9e 100644
--- a/src/block_rANS32x64_16w_decode.cpp
+++ b/src/block_rANS32x64_16w_decode.cpp
@@ -42,30 +42,42 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength,
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    for (size_t j = 0; j < 256; j++)
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
     {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
+
+      memset(pOutData + i, symbol, blockSize);
+
+      i += blockSize;
     }
+    else
+    {
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-    uint64_t blockEndInStates = (i + blockSize);
+      uint64_t blockEndInStates = i + blockSizeVal;
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
 
-    i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+    }
 
     if (i > outLengthInStates)
     {
@@ -80,7 +92,7 @@ size_t block_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength,
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
diff --git a/src/block_rANS32x64_16w_encode.cpp b/src/block_rANS32x64_16w_encode.cpp
index e17469e..4464a90 100644
--- a/src/block_rANS32x64_16w_encode.cpp
+++ b/src/block_rANS32x64_16w_encode.cpp
@@ -156,37 +156,63 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u
     inputBlockTargetIndex -= MinBlockSizeX;
 
   size_t blockBackPoint = length;
+  size_t numSymbols = 0;
+  uint8_t selectedSymbol = 0;
 
   uint32_t symCount[256];
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
-  size_t extraCount = 0;
+  for (size_t j = 0; j < 256; j++)
+  {
+    numSymbols += (size_t)!!symCount[j];
+    selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+  }
 
-  if constexpr (IsSafeHist)
+  if (numSymbols == 1)
   {
-    for (size_t j = 0; j < 256; j++)
+    int64_t idx = inputBlockTargetIndex - 1;
+
+    for (; idx >= 0; idx--)
+      if (pInData[idx] != selectedSymbol)
+        break;
+
+    inputBlockTargetIndex = (size_t)(idx + 1);
+
+    // Align with `StateCount`.
+    inputBlockTargetIndex += StateCount - 1;
+    inputBlockTargetIndex &= ~(StateCount - 1);
+  }
+  else
+  {
+    size_t extraCount = 0;
+
+    if constexpr (IsSafeHist)
     {
-      if (symCount[j] == 0)
+      for (size_t j = 0; j < 256; j++)
       {
-        symCount[j] = 1;
-        extraCount++;
+        if (symCount[j] == 0)
+        {
+          symCount[j] = 1;
+          extraCount++;
+        }
       }
     }
-  }
 
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
-  while (inputBlockTargetIndex > 0)
-  {
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-      inputBlockTargetIndex -= MinBlockSizeX;
-    else
-      break;
+    while (inputBlockTargetIndex > 0)
+    {
+      if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+        inputBlockTargetIndex -= MinBlockSizeX;
+      else
+        break;
+    }
+
+    // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+    observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
   }
 
-  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
   blockBackPoint = length;
 
   // Init States.
@@ -197,28 +223,31 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u
   inputIndex &= ~(size_t)(StateCount - 1);
   inputIndex += StateCount;
 
-  for (int64_t j = StateCount - 1; j >= 0; j--)
+  if (numSymbols != 1)
   {
-    const uint8_t index = _Rans32x64_idx2idx[j];
-
-    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    for (int64_t j = StateCount - 1; j >= 0; j--)
     {
-      const uint8_t in = pInData[inputIndex - StateCount + index];
-      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
-      const uint32_t max = EncodeEmitPoint * symbolCount;
+      const uint8_t index = _Rans32x64_idx2idx[j];
 
-      const size_t stateIndex = j;
+      if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+      {
+        const uint8_t in = pInData[inputIndex - StateCount + index];
+        const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
 
-      uint32_t state = encodeState.states[stateIndex];
+        const size_t stateIndex = j;
 
-      if (state >= max)
-      {
-        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
-        encodeState.pStart--;
-        state >>= 16;
-      }
+        uint32_t state = encodeState.states[stateIndex];
+
+        if (state >= max)
+        {
+          *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+          encodeState.pStart--;
+          state >>= 16;
+        }
 
-      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+        encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+      }
     }
   }
 
@@ -226,19 +255,32 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u
 
   while (true)
   {
-    rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    if (numSymbols != 1)
+      rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+
     inputIndex = inputBlockTargetIndex;
 
-    // Write hist.
+    // Write block info.
     {
       const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
 
       encodeState.pStart++;
-      encodeState.pStart -= 256;
-      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      if (numSymbols != 1) // write hist.
+      {
+        encodeState.pStart -= 256;
+        memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      }
+      else
+      {
+        const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54);
+
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator));
+      }
 
       encodeState.pStart--;
     }
@@ -248,28 +290,60 @@ size_t block_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, u
 
     // Determine new histogram.
     {
-      inputBlockTargetIndex -= MinBlockSizeX;
+      inputBlockTargetIndex -= 1;
+      inputBlockTargetIndex &= ~(MinBlockSizeX - 1);
+
+      const size_t initialSize = inputIndex - inputBlockTargetIndex;
+
+      if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3)
+        inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex);
+
+      numSymbols = 0;
 
-      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+      for (size_t j = 0; j < 256; j++)
+      {
+        numSymbols += (size_t)!!symCount[j];
+        selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+      }
 
-      if constexpr (IsSafeHist)
-        for (size_t j = 0; j < 256; j++)
-          if (symCount[j] == 0)
-            symCount[j] = 1;
+      if (numSymbols == 1)
+      {
+        int64_t idx = inputBlockTargetIndex - 1;
 
-      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+        for (; idx >= 0; idx--)
+          if (pInData[idx] != selectedSymbol)
+            break;
 
-      while (inputBlockTargetIndex > 0)
+        inputBlockTargetIndex = (size_t)(idx + 1);
+
+        // Align with `StateCount`.
+        inputBlockTargetIndex += StateCount - 1;
+        inputBlockTargetIndex &= ~(StateCount - 1);
+      }
+      else
       {
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-          inputBlockTargetIndex -= MinBlockSizeX;
-        else
-          break;
+        if constexpr (IsSafeHist)
+          for (size_t j = 0; j < 256; j++)
+            if (symCount[j] == 0)
+              symCount[j] = 1;
+
+        normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+        while (inputBlockTargetIndex > 0)
+        {
+          if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+            inputBlockTargetIndex -= MinBlockSizeX;
+          else
+            break;
+        }
+
+        // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+        observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+        normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       }
 
-      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       blockBackPoint = inputIndex;
     }
   }
diff --git a/src/main.cpp b/src/main.cpp
index 123d47d..09b0c55 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -165,12 +165,12 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe
 
 static codec_info_t _Codecs[] =
 {
-  { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
   { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
   { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},

From 2337364713a0dbbf2e43ebc7dbdf629dc9cf1486 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 19:06:20 +0200
Subject: [PATCH 22/34] Hopefully applying those changes correctly to block_32

---
 src/block_codec64.h                |   1 -
 src/block_rANS32x32_16w_decode.cpp |  40 ++++---
 src/block_rANS32x32_16w_encode.cpp | 185 ++++++++++++++++++++---------
 src/main.cpp                       |  12 +-
 4 files changed, 161 insertions(+), 77 deletions(-)

diff --git a/src/block_codec64.h b/src/block_codec64.h
index ac3c9bc..45f05b5 100644
--- a/src/block_codec64.h
+++ b/src/block_codec64.h
@@ -4,7 +4,6 @@
 #include "hist.h"
 
 #include <string.h>
-#include <stdio.h>
 
 constexpr size_t StateCount = 64; // Needs to be a power of two.
 
diff --git a/src/block_rANS32x32_16w_decode.cpp b/src/block_rANS32x32_16w_decode.cpp
index ae2bd55..48444c9 100644
--- a/src/block_rANS32x32_16w_decode.cpp
+++ b/src/block_rANS32x32_16w_decode.cpp
@@ -48,30 +48,42 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    for (size_t j = 0; j < 256; j++)
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
     {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
+
+      memset(pOutData + i, symbol, blockSize);
+
+      i += blockSize;
     }
+    else
+    {
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-    uint64_t blockEndInStates = (i + blockSize);
+      uint64_t blockEndInStates = (i + blockSizeVal);
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
 
-    i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+    }
 
     if (i > outLengthInStates)
     {
@@ -86,7 +98,7 @@ size_t block_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength,
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
diff --git a/src/block_rANS32x32_16w_encode.cpp b/src/block_rANS32x32_16w_encode.cpp
index 3a25ed0..891b235 100644
--- a/src/block_rANS32x32_16w_encode.cpp
+++ b/src/block_rANS32x32_16w_encode.cpp
@@ -156,38 +156,63 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
     inputBlockTargetIndex -= MinBlockSizeX;
 
   size_t blockBackPoint = length;
+  size_t numSymbols = 0;
+  uint8_t selectedSymbol = 0;
 
   uint32_t symCount[256];
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
-  size_t extraCount = 0;
+  for (size_t j = 0; j < 256; j++)
+  {
+    numSymbols += (size_t)!!symCount[j];
+    selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+  }
+  
+  if (numSymbols == 1)
+  {
+    int64_t idx = inputBlockTargetIndex - 1;
 
-  if constexpr (IsSafeHist)
+    for (; idx >= 0; idx--)
+      if (pInData[idx] != selectedSymbol)
+        break;
+
+    inputBlockTargetIndex = (size_t)(idx + 1);
+
+    // Align with `StateCount`.
+    inputBlockTargetIndex += StateCount - 1;
+    inputBlockTargetIndex &= ~(StateCount - 1);
+  }
+  else
   {
-    for (size_t j = 0; j < 256; j++)
+    size_t extraCount = 0;
+
+    if constexpr (IsSafeHist)
     {
-      if (symCount[j] == 0)
+      for (size_t j = 0; j < 256; j++)
       {
-        symCount[j] = 1;
-        extraCount++;
+        if (symCount[j] == 0)
+        {
+          symCount[j] = 1;
+          extraCount++;
+        }
       }
     }
-  }
 
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
-  while (inputBlockTargetIndex > 0)
-  {
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-      inputBlockTargetIndex -= MinBlockSizeX;
-    else
-      break;
-  }
+    while (inputBlockTargetIndex > 0)
+    {
+      if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+        inputBlockTargetIndex -= MinBlockSizeX;
+      else
+        break;
+    }
 
-  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
-  blockBackPoint = length;
+    // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+    observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    blockBackPoint = length;
+  }
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
@@ -197,28 +222,31 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
   inputIndex &= ~(size_t)(StateCount - 1);
   inputIndex += StateCount;
 
-  for (int64_t j = StateCount - 1; j >= 0; j--)
+  if (numSymbols != 1)
   {
-    const uint8_t index = _Rans32x32_idx2idx[j];
-
-    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    for (int64_t j = StateCount - 1; j >= 0; j--)
     {
-      const uint8_t in = pInData[inputIndex - StateCount + index];
-      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
-      const uint32_t max = EncodeEmitPoint * symbolCount;
+      const uint8_t index = _Rans32x32_idx2idx[j];
 
-      const size_t stateIndex = j;
+      if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+      {
+        const uint8_t in = pInData[inputIndex - StateCount + index];
+        const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
 
-      uint32_t state = encodeState.states[stateIndex];
+        const size_t stateIndex = j;
 
-      if (state >= max)
-      {
-        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
-        encodeState.pStart--;
-        state >>= 16;
-      }
+        uint32_t state = encodeState.states[stateIndex];
 
-      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+        if (state >= max)
+        {
+          *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+          encodeState.pStart--;
+          state >>= 16;
+        }
+
+        encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+      }
     }
   }
 
@@ -226,19 +254,32 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
 
   while (true)
   {
-    rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    if (numSymbols != 1)
+      rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+
     inputIndex = inputBlockTargetIndex;
 
-    // Write hist.
+    // Write block info.
     {
       const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
 
       encodeState.pStart++;
-      encodeState.pStart -= 256;
-      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      if (numSymbols != 1) // write hist.
+      {
+        encodeState.pStart -= 256;
+        memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      }
+      else
+      {
+        const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54);
+
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator));
+      }
 
       encodeState.pStart--;
     }
@@ -248,28 +289,60 @@ size_t block_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, u
 
     // Determine new histogram.
     {
-      inputBlockTargetIndex -= MinBlockSizeX;
+      inputBlockTargetIndex -= 1;
+      inputBlockTargetIndex &= ~(MinBlockSizeX - 1);
+
+      const size_t initialSize = inputIndex - inputBlockTargetIndex;
+
+      if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3)
+        inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex);
+
+      numSymbols = 0;
 
-      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+      for (size_t j = 0; j < 256; j++)
+      {
+        numSymbols += (size_t)!!symCount[j];
+        selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+      }
 
-      if constexpr (IsSafeHist)
-        for (size_t j = 0; j < 256; j++)
-          if (symCount[j] == 0)
-            symCount[j] = 1;
+      if (numSymbols == 1)
+      {
+        int64_t idx = inputBlockTargetIndex - 1;
 
-      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+        for (; idx >= 0; idx--)
+          if (pInData[idx] != selectedSymbol)
+            break;
 
-      while (inputBlockTargetIndex > 0)
+        inputBlockTargetIndex = (size_t)(idx + 1);
+
+        // Align with `StateCount`.
+        inputBlockTargetIndex += StateCount - 1;
+        inputBlockTargetIndex &= ~(StateCount - 1);
+      }
+      else
       {
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-          inputBlockTargetIndex -= MinBlockSizeX;
-        else
-          break;
+        if constexpr (IsSafeHist)
+          for (size_t j = 0; j < 256; j++)
+            if (symCount[j] == 0)
+              symCount[j] = 1;
+
+        normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+        while (inputBlockTargetIndex > 0)
+        {
+          if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+            inputBlockTargetIndex -= MinBlockSizeX;
+          else
+            break;
+        }
+
+        // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+        observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+        normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       }
 
-      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       blockBackPoint = inputIndex;
     }
   }
diff --git a/src/main.cpp b/src/main.cpp
index 09b0c55..123d47d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -165,12 +165,12 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe
 
 static codec_info_t _Codecs[] =
 {
-  //{ "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
-  //{ "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 13, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_13>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_13, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 12, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_12>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_12, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 11, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_11>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_11, true }, {}}},
+  { "rANS32x32 16w (variable block size)", 10, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_10>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_10, true }, {}}},
   
   { "rANS32x64 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_15, true }, {}}},
   { "rANS32x64 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x64_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x64_16w_decode_14, true }, {}}},

From 347604fa07689211cbba4f2c155c0ff437236e8d Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Fri, 7 Jul 2023 21:35:52 +0200
Subject: [PATCH 23/34] Adapting MT codecs to also feature those changes

---
 src/block_codec32.h             |   2 +-
 src/block_codec64.h             |   2 +-
 src/mt_rANS32x32_16w_decode.cpp | 158 +++++++++++++++-----------
 src/mt_rANS32x32_16w_encode.cpp | 195 ++++++++++++++++++++++----------
 src/mt_rANS32x64_16w_decode.cpp | 158 +++++++++++++++-----------
 src/mt_rANS32x64_16w_encode.cpp | 195 ++++++++++++++++++++++----------
 6 files changed, 452 insertions(+), 258 deletions(-)

diff --git a/src/block_codec32.h b/src/block_codec32.h
index 106da1c..1366b5e 100644
--- a/src/block_codec32.h
+++ b/src/block_codec32.h
@@ -710,4 +710,4 @@ struct rans32x32_16w_decoder<r32x32_dt_avx2_small_cache_12_to_10, TotalSymbolCou
   }
 };
 
-#endif block_codec32_h__
+#endif // block_codec32_h__
diff --git a/src/block_codec64.h b/src/block_codec64.h
index 45f05b5..4ea95f3 100644
--- a/src/block_codec64.h
+++ b/src/block_codec64.h
@@ -1758,4 +1758,4 @@ struct rans32x64_16w_decoder<r32x64_dt_avx512_small_cache_12_to_10, TotalSymbolC
   }
 };
 
-#endif
+#endif // block_codec64_h__
diff --git a/src/mt_rANS32x32_16w_decode.cpp b/src/mt_rANS32x32_16w_decode.cpp
index 503a9e6..f1edf5e 100644
--- a/src/mt_rANS32x32_16w_decode.cpp
+++ b/src/mt_rANS32x32_16w_decode.cpp
@@ -36,58 +36,70 @@ size_t mt_rANS32x32_16w_decode(const uint8_t *pInData, const size_t inLength, ui
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
-    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
-    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
-    (void)readHeadBackOffset; // unused in single-threaded version.
-
-    for (size_t j = 0; j < StateCount; j++)
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
     {
-      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
-      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
-    }
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
 
-    for (size_t j = 0; j < 256; j++)
-    {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
+      memset(pOutData + i, symbol, blockSize);
+
+      i += blockSize;
     }
+    else
+    {
+      const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+      const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
+      (void)readHeadBackOffset; // unused in single-threaded version.
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+        decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      }
 
-    uint64_t blockEndInStates = (i + blockSize);
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-    i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      uint64_t blockEndInStates = (i + blockSizeVal);
 
-    if (i > outLengthInStates)
-    {
-      if (i >= expectedOutputLength)
-        return expectedOutputLength;
-      else
-        break;
-    }
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
 
-    decodeState.pReadHead = pReadHeadAfter;
+      i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+      if (i > outLengthInStates)
+      {
+        if (i >= expectedOutputLength)
+          return expectedOutputLength;
+        else
+          break;
+      }
+
+      decodeState.pReadHead = pReadHeadAfter;
+    }
 
   } while (i < outLengthInStates);
 
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
@@ -149,55 +161,67 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
-    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
+    {
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
 
-    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
+      memset(pOutData + i, symbol, blockSize); // let's hope this isn't the last block, because otherwise we'd delay starting tasks for other stuff, but otherwise `memset` is probably faster than starting a task.
 
-    for (size_t j = 0; j < StateCount; j++)
-    {
-      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
-      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      i += blockSize;
     }
-
-    for (size_t j = 0; j < 256; j++)
+    else
     {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
-    }
+      const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
 
-    uint64_t blockEndInStates = (i + blockSize);
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+        decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      }
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (i + blockSize > blockEndInStates)
-    {
-      i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-      break;
-    }
-    else
-    {
-      thread_pool_add(pThreadPool, [=]() {
-        auto decState = decodeState;
-        rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
-      });
+      uint64_t blockEndInStates = (i + blockSizeVal);
 
-      i = blockEndInStates;
-      decodeState.pReadHead = pReadHeadAfter;
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
+
+      if (i + blockSizeVal > blockEndInStates)
+      {
+        i = rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+        break;
+      }
+      else
+      {
+        thread_pool_add(pThreadPool, [=]() {
+          auto decState = decodeState;
+          rans32x32_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
+          });
+
+        i = blockEndInStates;
+        decodeState.pReadHead = pReadHeadAfter;
+      }
     }
 
   } while (i < outLengthInStates);
@@ -205,7 +229,7 @@ size_t mt_rANS32x32_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
diff --git a/src/mt_rANS32x32_16w_encode.cpp b/src/mt_rANS32x32_16w_encode.cpp
index 632fce0..f37deaa 100644
--- a/src/mt_rANS32x32_16w_encode.cpp
+++ b/src/mt_rANS32x32_16w_encode.cpp
@@ -160,38 +160,63 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
 
   uint16_t *pBlockEnd = encodeState.pEnd;
   size_t blockBackPoint = length;
+  size_t numSymbols = 0;
+  uint8_t selectedSymbol = 0;
 
   uint32_t symCount[256];
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
-  size_t extraCount = 0;
+  for (size_t j = 0; j < 256; j++)
+  {
+    numSymbols += (size_t)!!symCount[j];
+    selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+  }
 
-  if constexpr (IsSafeHist)
+  if (numSymbols == 1)
   {
-    for (size_t j = 0; j < 256; j++)
+    int64_t idx = inputBlockTargetIndex - 1;
+
+    for (; idx >= 0; idx--)
+      if (pInData[idx] != selectedSymbol)
+        break;
+
+    inputBlockTargetIndex = (size_t)(idx + 1);
+
+    // Align with `StateCount`.
+    inputBlockTargetIndex += StateCount - 1;
+    inputBlockTargetIndex &= ~(StateCount - 1);
+  }
+  else
+  {
+    size_t extraCount = 0;
+
+    if constexpr (IsSafeHist)
     {
-      if (symCount[j] == 0)
+      for (size_t j = 0; j < 256; j++)
       {
-        symCount[j] = 1;
-        extraCount++;
+        if (symCount[j] == 0)
+        {
+          symCount[j] = 1;
+          extraCount++;
+        }
       }
     }
-  }
 
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
-  while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
-  {
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-      inputBlockTargetIndex -= MinBlockSizeX;
-    else
-      break;
-  }
+    while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+    {
+      if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+        inputBlockTargetIndex -= MinBlockSizeX;
+      else
+        break;
+    }
 
-  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
-  blockBackPoint = length;
+    // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+    observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    blockBackPoint = length;
+  }
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
@@ -201,28 +226,31 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
   inputIndex &= ~(size_t)(StateCount - 1);
   inputIndex += StateCount;
 
-  for (int64_t j = StateCount - 1; j >= 0; j--)
+  if (numSymbols != 1)
   {
-    const uint8_t index = _Rans32x32_idx2idx[j];
-
-    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    for (int64_t j = StateCount - 1; j >= 0; j--)
     {
-      const uint8_t in = pInData[inputIndex - StateCount + index];
-      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
-      const uint32_t max = EncodeEmitPoint * symbolCount;
+      const uint8_t index = _Rans32x32_idx2idx[j];
 
-      const size_t stateIndex = j;
+      if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+      {
+        const uint8_t in = pInData[inputIndex - StateCount + index];
+        const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
 
-      uint32_t state = encodeState.states[stateIndex];
+        const size_t stateIndex = j;
 
-      if (state >= max)
-      {
-        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
-        encodeState.pStart--;
-        state >>= 16;
-      }
+        uint32_t state = encodeState.states[stateIndex];
 
-      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+        if (state >= max)
+        {
+          *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+          encodeState.pStart--;
+          state >>= 16;
+        }
+
+        encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+      }
     }
   }
 
@@ -230,27 +258,40 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
 
   while (true)
   {
-    rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    if (numSymbols != 1)
+      rans32x32_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    
     inputIndex = inputBlockTargetIndex;
 
-    // Write hist & states.
+    // Write block info.
     {
       const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
 
       encodeState.pStart++;
-      encodeState.pStart -= 256;
-      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
 
-      encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
-      memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+      if (numSymbols != 1) // write hist & states.
+      {
+        encodeState.pStart -= 256;
+        memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+        encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
+        memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+
+        const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1);
 
-      const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1);
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      }
+      else
+      {
+        const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54);
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator));
+      }
 
       pBlockEnd = encodeState.pStart;
       encodeState.pStart--;
@@ -261,28 +302,60 @@ size_t mt_rANS32x32_16w_encode(const uint8_t *pInData, const size_t length, uint
 
     // Determine new histogram.
     {
-      inputBlockTargetIndex -= MinBlockSizeX;
+      inputBlockTargetIndex -= 1;
+      inputBlockTargetIndex &= ~(MinBlockSizeX - 1);
+
+      const size_t initialSize = inputIndex - inputBlockTargetIndex;
+
+      if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3)
+        inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex);
+
+      numSymbols = 0;
 
-      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+      for (size_t j = 0; j < 256; j++)
+      {
+        numSymbols += (size_t)!!symCount[j];
+        selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+      }
 
-      if constexpr (IsSafeHist)
-        for (size_t j = 0; j < 256; j++)
-          if (symCount[j] == 0)
-            symCount[j] = 1;
+      if (numSymbols == 1)
+      {
+        int64_t idx = inputBlockTargetIndex - 1;
 
-      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+        for (; idx >= 0; idx--)
+          if (pInData[idx] != selectedSymbol)
+            break;
 
-      while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+        inputBlockTargetIndex = (size_t)(idx + 1);
+
+        // Align with `StateCount`.
+        inputBlockTargetIndex += StateCount - 1;
+        inputBlockTargetIndex &= ~(StateCount - 1);
+      }
+      else
       {
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-          inputBlockTargetIndex -= MinBlockSizeX;
-        else
-          break;
+        if constexpr (IsSafeHist)
+          for (size_t j = 0; j < 256; j++)
+            if (symCount[j] == 0)
+              symCount[j] = 1;
+
+        normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+        while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+        {
+          if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+            inputBlockTargetIndex -= MinBlockSizeX;
+          else
+            break;
+        }
+
+        // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+        observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+        normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       }
 
-      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       blockBackPoint = inputIndex;
     }
   }
diff --git a/src/mt_rANS32x64_16w_decode.cpp b/src/mt_rANS32x64_16w_decode.cpp
index fb207fa..766366d 100644
--- a/src/mt_rANS32x64_16w_decode.cpp
+++ b/src/mt_rANS32x64_16w_decode.cpp
@@ -36,58 +36,70 @@ size_t mt_rANS32x64_16w_decode(const uint8_t *pInData, const size_t inLength, ui
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
-    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
-    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
-    (void)readHeadBackOffset; // unused in single-threaded version.
-
-    for (size_t j = 0; j < StateCount; j++)
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
     {
-      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
-      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
-    }
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
 
-    for (size_t j = 0; j < 256; j++)
-    {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
+      memset(pOutData + i, symbol, blockSize);
+
+      i += blockSize;
     }
+    else
+    {
+      const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+      const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
+      (void)readHeadBackOffset; // unused in single-threaded version.
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+        decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      }
 
-    uint64_t blockEndInStates = (i + blockSize);
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-    i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      uint64_t blockEndInStates = (i + blockSizeVal);
 
-    if (i > outLengthInStates)
-    {
-      if (i >= expectedOutputLength)
-        return expectedOutputLength;
-      else
-        break;
-    }
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
 
-    decodeState.pReadHead = pReadHeadAfter;
+      i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+      if (i > outLengthInStates)
+      {
+        if (i >= expectedOutputLength)
+          return expectedOutputLength;
+        else
+          break;
+      }
+
+      decodeState.pReadHead = pReadHeadAfter;
+    }
 
   } while (i < outLengthInStates);
 
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
@@ -149,55 +161,67 @@ size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
   decodeState.pReadHead = reinterpret_cast<const uint16_t *>(pInData + inputIndex);
   const size_t outLengthInStates = expectedOutputLength - StateCount + 1;
   size_t i = 0;
-  hist_t hist;
+  hist_t hist = {};
 
   do
   {
-    const uint64_t blockSize = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+    const uint64_t blockSizeVal = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
     decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
-    decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
+    if (blockSizeVal & ((uint64_t)1 << 63)) // Single symbol hist
+    {
+      const uint8_t symbol = (blockSizeVal >> 54) & 0xFF;
+      const uint64_t blockSize = blockSizeVal & (((uint64_t)1 << 54) - 1);
 
-    const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
+      memset(pOutData + i, symbol, blockSize); // let's hope this isn't the last block, because otherwise we'd delay starting tasks for other stuff, but otherwise `memset` is probably faster than starting a task.
 
-    for (size_t j = 0; j < StateCount; j++)
-    {
-      decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
-      decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      i += blockSize;
     }
-
-    for (size_t j = 0; j < 256; j++)
+    else
     {
-      hist.symbolCount[j] = *decodeState.pReadHead;
-      decodeState.pReadHead++;
-    }
+      const uint64_t readHeadBackOffset = *reinterpret_cast<const uint64_t *>(decodeState.pReadHead);
+      decodeState.pReadHead += sizeof(uint64_t) / sizeof(uint16_t);
 
-    if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
-      return 0;
+      const uint16_t *pReadHeadAfter = decodeState.pReadHead + readHeadBackOffset + 1;
 
-    uint64_t blockEndInStates = (i + blockSize);
+      for (size_t j = 0; j < StateCount; j++)
+      {
+        decodeState.states[j] = *reinterpret_cast<const uint32_t *>(decodeState.pReadHead);
+        decodeState.pReadHead += (sizeof(uint32_t) / sizeof(uint16_t));
+      }
 
-    if (blockEndInStates > outLengthInStates)
-      blockEndInStates = outLengthInStates;
-    else if ((blockEndInStates & (StateCount - 1)) != 0)
-      return 0;
+      for (size_t j = 0; j < 256; j++)
+      {
+        hist.symbolCount[j] = *decodeState.pReadHead;
+        decodeState.pReadHead++;
+      }
 
-    if (i + blockSize > blockEndInStates)
-    {
-      i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+      if (!_init_from_hist(&decodeState.hist, &hist, TotalSymbolCountBits))
+        return 0;
 
-      break;
-    }
-    else
-    {
-      thread_pool_add(pThreadPool, [=]() {
-        auto decState = decodeState;
-        rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
-      });
+      uint64_t blockEndInStates = (i + blockSizeVal);
 
-      i = blockEndInStates;
-      decodeState.pReadHead = pReadHeadAfter;
+      if (blockEndInStates > outLengthInStates)
+        blockEndInStates = outLengthInStates;
+      else if ((blockEndInStates & (StateCount - 1)) != 0)
+        return 0;
+
+      if (i + blockSizeVal > blockEndInStates)
+      {
+        i = rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decodeState, pOutData, i, blockEndInStates);
+
+        break;
+      }
+      else
+      {
+        thread_pool_add(pThreadPool, [=]() {
+          auto decState = decodeState;
+          rans32x64_16w_decoder<Impl, TotalSymbolCountBits, hist_type>::decode_section(&decState, pOutData, i, blockEndInStates);
+          });
+
+        i = blockEndInStates;
+        decodeState.pReadHead = pReadHeadAfter;
+      }
     }
 
   } while (i < outLengthInStates);
@@ -205,7 +229,7 @@ size_t mt_rANS32x64_16w_decode_mt(const uint8_t *pInData, const size_t inLength,
   if (i < expectedOutputLength)
   {
     hist_dec_t<TotalSymbolCountBits> histDec;
-    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount));
+    memcpy(&histDec.symbolCount, &hist.symbolCount, sizeof(hist.symbolCount)); // this is unreachable with single symbol hist at the end.
 
     if (!inplace_make_hist_dec<TotalSymbolCountBits>(&histDec))
       return 0;
diff --git a/src/mt_rANS32x64_16w_encode.cpp b/src/mt_rANS32x64_16w_encode.cpp
index cb659f2..3ca01cc 100644
--- a/src/mt_rANS32x64_16w_encode.cpp
+++ b/src/mt_rANS32x64_16w_encode.cpp
@@ -160,38 +160,63 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint
 
   uint16_t *pBlockEnd = encodeState.pEnd;
   size_t blockBackPoint = length;
+  size_t numSymbols = 0;
+  uint8_t selectedSymbol = 0;
 
   uint32_t symCount[256];
   observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
 
-  size_t extraCount = 0;
+  for (size_t j = 0; j < 256; j++)
+  {
+    numSymbols += (size_t)!!symCount[j];
+    selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+  }
 
-  if constexpr (IsSafeHist)
+  if (numSymbols == 1)
   {
-    for (size_t j = 0; j < 256; j++)
+    int64_t idx = inputBlockTargetIndex - 1;
+
+    for (; idx >= 0; idx--)
+      if (pInData[idx] != selectedSymbol)
+        break;
+
+    inputBlockTargetIndex = (size_t)(idx + 1);
+
+    // Align with `StateCount`.
+    inputBlockTargetIndex += StateCount - 1;
+    inputBlockTargetIndex &= ~(StateCount - 1);
+  }
+  else
+  {
+    size_t extraCount = 0;
+
+    if constexpr (IsSafeHist)
     {
-      if (symCount[j] == 0)
+      for (size_t j = 0; j < 256; j++)
       {
-        symCount[j] = 1;
-        extraCount++;
+        if (symCount[j] == 0)
+        {
+          symCount[j] = 1;
+          extraCount++;
+        }
       }
     }
-  }
 
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex + extraCount, TotalSymbolCountBits);
 
-  while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
-  {
-    if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-      inputBlockTargetIndex -= MinBlockSizeX;
-    else
-      break;
-  }
+    while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+    {
+      if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+        inputBlockTargetIndex -= MinBlockSizeX;
+      else
+        break;
+    }
 
-  // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-  observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-  normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
-  blockBackPoint = length;
+    // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+    observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+    normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
+    blockBackPoint = length;
+  }
 
   // Init States.
   for (size_t i = 0; i < StateCount; i++)
@@ -201,28 +226,31 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint
   inputIndex &= ~(size_t)(StateCount - 1);
   inputIndex += StateCount;
 
-  for (int64_t j = StateCount - 1; j >= 0; j--)
+  if (numSymbols != 1)
   {
-    const uint8_t index = _Rans32x64_idx2idx[j];
-
-    if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+    for (int64_t j = StateCount - 1; j >= 0; j--)
     {
-      const uint8_t in = pInData[inputIndex - StateCount + index];
-      const uint32_t symbolCount = encodeState.hist.symbolCount[in];
-      const uint32_t max = EncodeEmitPoint * symbolCount;
+      const uint8_t index = _Rans32x64_idx2idx[j];
 
-      const size_t stateIndex = j;
+      if (inputIndex - (int64_t)StateCount + (int64_t)index < (int64_t)length)
+      {
+        const uint8_t in = pInData[inputIndex - StateCount + index];
+        const uint32_t symbolCount = encodeState.hist.symbolCount[in];
+        const uint32_t max = EncodeEmitPoint * symbolCount;
 
-      uint32_t state = encodeState.states[stateIndex];
+        const size_t stateIndex = j;
 
-      if (state >= max)
-      {
-        *encodeState.pStart = (uint16_t)(state & 0xFFFF);
-        encodeState.pStart--;
-        state >>= 16;
-      }
+        uint32_t state = encodeState.states[stateIndex];
 
-      encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+        if (state >= max)
+        {
+          *encodeState.pStart = (uint16_t)(state & 0xFFFF);
+          encodeState.pStart--;
+          state >>= 16;
+        }
+
+        encodeState.states[stateIndex] = ((state / symbolCount) << TotalSymbolCountBits) + (uint32_t)encodeState.hist.cumul[in] + (state % symbolCount);
+      }
     }
   }
 
@@ -230,27 +258,40 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint
 
   while (true)
   {
-    rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    if (numSymbols != 1)
+      rans32x64_16w_encoder<Impl>::template encode_section<TotalSymbolCountBits>(&encodeState, pInData, inputIndex, inputBlockTargetIndex);
+    
     inputIndex = inputBlockTargetIndex;
 
-    // Write hist & states.
+    // Write block info.
     {
       const uint64_t blockSize = blockBackPoint - inputBlockTargetIndex;
 
       encodeState.pStart++;
-      encodeState.pStart -= 256;
-      memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
 
-      encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
-      memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+      if (numSymbols != 1) // write hist & states.
+      {
+        encodeState.pStart -= 256;
+        memcpy(encodeState.pStart, encodeState.hist.symbolCount, sizeof(encodeState.hist.symbolCount));
+
+        encodeState.pStart -= sizeof(uint32_t) / sizeof(uint16_t) * StateCount;
+        memcpy(encodeState.pStart, encodeState.states, sizeof(uint32_t) * StateCount);
+
+        const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1);
 
-      const uint64_t writeHeadOffset = pBlockEnd - (encodeState.pStart + 1);
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &writeHeadOffset, sizeof(writeHeadOffset));
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+      }
+      else
+      {
+        const uint64_t singleSymbolIndicator = blockSize | ((uint64_t)1 << 63) | ((uint64_t)selectedSymbol << 54);
 
-      encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
-      memcpy(encodeState.pStart, &blockSize, sizeof(blockSize));
+        encodeState.pStart -= sizeof(uint64_t) / sizeof(uint16_t);
+        memcpy(encodeState.pStart, &singleSymbolIndicator, sizeof(singleSymbolIndicator));
+      }
 
       pBlockEnd = encodeState.pStart;
       encodeState.pStart--;
@@ -261,28 +302,60 @@ size_t mt_rANS32x64_16w_encode(const uint8_t *pInData, const size_t length, uint
 
     // Determine new histogram.
     {
-      inputBlockTargetIndex -= MinBlockSizeX;
+      inputBlockTargetIndex -= 1;
+      inputBlockTargetIndex &= ~(MinBlockSizeX - 1);
+
+      const size_t initialSize = inputIndex - inputBlockTargetIndex;
+
+      if (inputBlockTargetIndex > 0 && initialSize < MinBlockSizeX * 2 / 3)
+        inputBlockTargetIndex -= MinBlockSizeX;
+
+      observe_hist(symCount, pInData + inputBlockTargetIndex, inputIndex - inputBlockTargetIndex);
+
+      numSymbols = 0;
 
-      observe_hist(symCount, pInData + inputBlockTargetIndex, MinBlockSizeX);
+      for (size_t j = 0; j < 256; j++)
+      {
+        numSymbols += (size_t)!!symCount[j];
+        selectedSymbol = symCount[j] ? (uint8_t)j : selectedSymbol;
+      }
 
-      if constexpr (IsSafeHist)
-        for (size_t j = 0; j < 256; j++)
-          if (symCount[j] == 0)
-            symCount[j] = 1;
+      if (numSymbols == 1)
+      {
+        int64_t idx = inputBlockTargetIndex - 1;
 
-      normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+        for (; idx >= 0; idx--)
+          if (pInData[idx] != selectedSymbol)
+            break;
 
-      while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+        inputBlockTargetIndex = (size_t)(idx + 1);
+
+        // Align with `StateCount`.
+        inputBlockTargetIndex += StateCount - 1;
+        inputBlockTargetIndex &= ~(StateCount - 1);
+      }
+      else
       {
-        if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
-          inputBlockTargetIndex -= MinBlockSizeX;
-        else
-          break;
+        if constexpr (IsSafeHist)
+          for (size_t j = 0; j < 256; j++)
+            if (symCount[j] == 0)
+              symCount[j] = 1;
+
+        normalize_hist(&encodeState.hist, symCount, MinBlockSizeX, TotalSymbolCountBits);
+
+        while (inputBlockTargetIndex > 0 && blockBackPoint - inputBlockTargetIndex < MaxBlockSize)
+        {
+          if (_CanExtendHist<TotalSymbolCountBits>(pInData, inputBlockTargetIndex - MinBlockSizeX, MinBlockSizeX, &encodeState.hist, symCount))
+            inputBlockTargetIndex -= MinBlockSizeX;
+          else
+            break;
+        }
+
+        // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
+        observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
+        normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       }
 
-      // Performance of this could be improved by keeping the current counts around. (or simply using the original hist, if that was only good for one block)
-      observe_hist(symCount, pInData + inputBlockTargetIndex, blockBackPoint - inputBlockTargetIndex);
-      normalize_hist(&encodeState.hist, symCount, blockBackPoint - inputBlockTargetIndex, TotalSymbolCountBits);
       blockBackPoint = inputIndex;
     }
   }

From 49951641dd533d3aabba492940f64325a424b9a8 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 00:54:47 +0200
Subject: [PATCH 24/34] Adding more command line options

---
 src/main.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 123d47d..c51e684 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -53,6 +53,11 @@ static size_t _HistMax = 15;
 static size_t _HistMin = 10;
 static bool _Include32Block = false;
 static bool _IncludeRaw = false;
+static bool _IncludeMT = false;
+static bool _ExcludeBlock = false;
+static bool _Exclude32x16 = false;
+static bool _Exclude32x32 = false;
+static bool _Exclude32x64 = false;
 static size_t _RunCount = 8;
 static size_t _EncodeRunCount = 2;
 static size_t _DecodeRunCount = 16;
@@ -236,6 +241,11 @@ const char ArgumentHistMin[] = "--hist-min";
 const char ArgumentHistMax[] = "--hist-max";
 const char ArgumentInclude32Blk[] = "--include-32blk";
 const char ArgumentIncludeRaw[] = "--include-raw";
+const char ArgumentIncludeMT[] = "--include-mt";
+const char ArgumentExcludeBlock[] = "--exclude-block";
+const char ArgumentExclude16[] = "--exclude-16";
+const char ArgumentExclude32[] = "--exclude-32";
+const char ArgumentExclude64[] = "--exclude-64";
 const char ArgumentNoSleep[] = "--no-sleep";
 const char ArgumentCpuCore[] = "--cpu-core";
 const char ArgumentRuns[] = "--runs";
@@ -249,14 +259,18 @@ int32_t main(const int32_t argc, char **pArgv)
   if (argc == 1)
   {
     puts("Invalid Parameter.\n\nUsage: hsrans <filename>");
-    printf("\t%s \tRun all variants of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentRuns);
+    printf("\t%s \t\t\tRun all variants of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants);
     printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMin);
     printf("\t%s <10-15> \tRestrict codecs to a number of histogram bits\n", ArgumentHistMax);
-    printf("\t%s \tRun all implementations of the specified codecs, not just the ones that we'd expect to be fast\n", ArgumentAllVariants);
-    printf("\t%s \tRun the benchmark on a specific core\n", ArgumentCpuCore);
-    printf("\t%s \tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw);
+    printf("\t%s \t\tRun the (single-threaded) benchmark on a specific core\n", ArgumentCpuCore);
+    printf("\t%s \t\tInclude multi-threading optimized variants\n", ArgumentIncludeMT);
+    printf("\t%s \t\tInclude RAW variants with one only one histogram for the entire file\n", ArgumentIncludeRaw);
     printf("\t%s \tInclude 32 block variants (which are generally quite slow), requires '%s'\n", ArgumentInclude32Blk, ArgumentIncludeRaw);
-    printf("\t%s <uint>\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode);
+    printf("\t%s \tExclude the main (variable block size) variants form the benchmark\n", ArgumentExcludeBlock);
+    printf("\t%s \t\tExclude 16 state variants from the benchmark (only RAW)\n", ArgumentExclude16);
+    printf("\t%s \t\tExclude 32 state variants from the benchmark\n", ArgumentExclude32);
+    printf("\t%s \t\tExclude 64 state variants from the benchmark\n", ArgumentExclude64);
+    printf("\t%s <uint>\t\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode);
     printf("\t%s <uint>\tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode);
     printf("\t%s <uint>\tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode);
     printf("\t%s <uint>\tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep);
@@ -279,6 +293,12 @@ int32_t main(const int32_t argc, char **pArgv)
         argsRemaining--;
         _OnlyRelevantCodecs = false;
       }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeMT, sizeof(ArgumentIncludeMT)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _IncludeMT = true;
+      }
       else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentIncludeRaw, sizeof(ArgumentIncludeRaw)) == 0)
       {
         argIndex++;
@@ -291,6 +311,30 @@ int32_t main(const int32_t argc, char **pArgv)
         argsRemaining--;
         _Include32Block = true;
       }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExcludeBlock, sizeof(ArgumentExcludeBlock)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _ExcludeBlock = true;
+      }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude16, sizeof(ArgumentExclude16)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _Exclude32x16 = true;
+      }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude32, sizeof(ArgumentExclude32)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _Exclude32x32 = true;
+      }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentExclude64, sizeof(ArgumentExclude64)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _Exclude32x64 = true;
+      }
       else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentNoSleep, sizeof(ArgumentNoSleep)) == 0)
       {
         argIndex++;
@@ -518,8 +562,13 @@ int32_t main(const int32_t argc, char **pArgv)
     make_hist(&hist, pUncompressedData, fileSize, _Codecs[codecId].totalSymbolCountBits);
     bool skipCodec = false;
 
+    skipCodec |= (!_IncludeMT && strstr(_Codecs[codecId].name, " (independent blocks)") != nullptr);
     skipCodec |= (!_IncludeRaw && strstr(_Codecs[codecId].name, " (raw)") != nullptr);
     skipCodec |= (!_Include32Block && strstr(_Codecs[codecId].name, " 32blk ") != nullptr);
+    skipCodec |= (_ExcludeBlock && strstr(_Codecs[codecId].name, " (variable block size)") != nullptr);
+    skipCodec |= (_Exclude32x16 && strstr(_Codecs[codecId].name, "32x16") != nullptr);
+    skipCodec |= (_Exclude32x32 && strstr(_Codecs[codecId].name, "32x32") != nullptr);
+    skipCodec |= (_Exclude32x64 && strstr(_Codecs[codecId].name, "32x64") != nullptr);
     skipCodec |= _Codecs[codecId].totalSymbolCountBits > _HistMax;
     skipCodec |= _Codecs[codecId].totalSymbolCountBits < _HistMin;
 

From acabb9fd90cca0d83d4984d38336e809621a1aa0 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 02:51:00 +0200
Subject: [PATCH 25/34] Updating x-ray benchmark

---
 README.md       | 311 +++++++++++++++++++++++-------------------------
 docs/index.html |  94 +++++++++++++++
 2 files changed, 240 insertions(+), 165 deletions(-)

diff --git a/README.md b/README.md
index 3da0208..b1ed15b 100644
--- a/README.md
+++ b/README.md
@@ -29,181 +29,162 @@
 ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes)
 | Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w        10**               |  65.59 % |   12.83 clk/byte |   341.55 MiB/s |   1.43 clk/byte |  2989.66 MiB/s |
-| **rANS32x64 16w        11**               |  64.33 % |   12.34 clk/byte |   347.24 MiB/s |   1.44 clk/byte |  2973.71 MiB/s |
-| **rANS32x64 16w        12**               |  63.81 % |   12.51 clk/byte |   342.31 MiB/s |   1.44 clk/byte |  2967.92 MiB/s |
-| TurboANX 63                               |  63.4 %  |   -              |   981.79 MiB/s |  -              |  2964.02 MiB/s |
-| TurboANX 48                               |  63.3 %  |   -              |   969.72 MiB/s |  -              |  2917.59 MiB/s |
-| TurboANX 40                               |  63.2 %  |   -              |   964.45 MiB/s |  -              |  2883.45 MiB/s |
-| TurboANX 32                               |  66.4 %  |   -              |   951.53 MiB/s |  -              |  2856.26 MiB/s |
-| **rANS32x32 16w        11**               |  64.33 % |   12.86 clk/byte |   333.03 MiB/s |   1.50 clk/byte |  2856.20 MiB/s |
-| **rANS32x32 16w        10**               |  65.59 % |   12.80 clk/byte |   334.68 MiB/s |   1.51 clk/byte |  2845.56 MiB/s |
-| TurboANX 24                               |  63.0 %  |   -              |   936.12 MiB/s |  -              |  2765.31 MiB/s |
-| TurboANX 16                               |  62.8 %  |   -              |   902.32 MiB/s |  -              |  2631.85 MiB/s |
-| **rANS32x32 16w        12**               |  63.81 % |   12.83 clk/byte |   343.55 MiB/s |   1.54 clk/byte |  2784.13 MiB/s |
-| fsehuf                                    |  63.4 %  |   -              |  1581.32 MiB/s |  -              |  2515.23 MiB/s |
-| htscodecs_rans32avx2 0                    |  63.5 %  |   -              |  1041.93 MiB/s |  -              |  2374.04 MiB/s |
-| TurboANX 8                                |  62.7 %  |   -              |   823.76 MiB/s |  -              |  2347.10 MiB/s |
-| **rANS32x32 32blk 16w  12**               |  63.81 % |   12.62 clk/byte |   339.50 MiB/s |   1.85 clk/byte |  2312.10 MiB/s |
-| **rANS32x32 32blk 16w  11**               |  64.33 % |   12.67 clk/byte |   338.00 MiB/s |   1.86 clk/byte |  2299.31 MiB/s |
-| **rANS32x32 32blk 16w  10**               |  65.59 % |   12.91 clk/byte |   331.80 MiB/s |   1.87 clk/byte |  2289.10 MiB/s |
-| htscodecs_rans32avx512 0                  |  63.5 %  |   -              |   796.70 MiB/s |  -              |  2221.93 MiB/s |
-| **rANS32x32 32blk 8w   11**               |  64.33 % |   15.01 clk/byte |   285.45 MiB/s |   2.15 clk/byte |  1988.10 MiB/s |
-| **rANS32x32 32blk 8w   12**               |  63.82 % |   15.15 clk/byte |   282.80 MiB/s |   2.16 clk/byte |  1984.68 MiB/s |
-| **rANS32x32 32blk 8w   10**               |  65.60 % |   14.70 clk/byte |   291.41 MiB/s |   2.17 clk/byte |  1977.26 MiB/s |
-| htscodecs_rans32sse 0                     |  63.5 %  |   -              |   732.08 MiB/s |  -              |  1948.66 MiB/s |
-| TurboANX 4                                |  63.0 %  |   -              |   706.92 MiB/s |  -              |  1929.18 MiB/s |
-| **rANS32x64 16w        13**               |  63.61 % |   12.32 clk/byte |   348.13 MiB/s |   2.29 clk/byte |  1872.44 MiB/s |
-| **rANS32x64 16w        14**               |  63.55 % |   12.36 clk/byte |   346.57 MiB/s |   2.28 clk/byte |  1876.95 MiB/s |
-| **rANS32x64 16w        15**               |  63.57 % |   12.30 clk/byte |   350.49 MiB/s |   2.34 clk/byte |  1828.28 MiB/s |
-| **rANS32x32 16w        13**               |  63.61 % |   12.55 clk/byte |   341.20 MiB/s |   2.38 clk/byte |  1800.28 MiB/s |
-| **rANS32x32 16w        14**               |  63.55 % |   12.54 clk/byte |   341.70 MiB/s |   2.39 clk/byte |  1795.66 MiB/s |
-| **rANS32x16 16w        10**               |  65.59 % |   13.26 clk/byte |   323.07 MiB/s |   2.54 clk/byte |  1684.80 MiB/s |
-| **rANS32x16 16w        12**               |  63.81 % |   13.21 clk/byte |   324.24 MiB/s |   2.55 clk/byte |  1681.73 MiB/s |
-| **rANS32x16 16w        11**               |  64.33 % |   13.25 clk/byte |   323.17 MiB/s |   2.55 clk/byte |  1676.41 MiB/s |
-| **rANS32x32 16w        15**               |  63.57 % |   12.94 clk/byte |   342.60 MiB/s |   2.56 clk/byte |  1675.11 MiB/s |
-| **rANS32x32 32blk 16w  14**               |  63.55 % |   13.02 clk/byte |   329.08 MiB/s |   2.66 clk/byte |  1607.26 MiB/s |
-| **rANS32x32 32blk 16w  13**               |  63.61 % |   12.56 clk/byte |   341.16 MiB/s |   2.71 clk/byte |  1582.28 MiB/s |
-| **rANS32x32 32blk 16w  15**               |  63.57 % |   13.21 clk/byte |   324.33 MiB/s |   2.76 clk/byte |  1550.93 MiB/s |
-| **rANS32x32 32blk 8w   13**               |  63.60 % |   15.07 clk/byte |   284.24 MiB/s |   2.98 clk/byte |  1438.01 MiB/s |
-| **rANS32x32 32blk 8w   14**               |  63.53 % |   15.06 clk/byte |   284.45 MiB/s |   3.00 clk/byte |  1429.24 MiB/s |
-| TurboANX 2                                |  64.0 %  |   -              |   656.86 MiB/s |  -              |  1416.33 MiB/s |
-| **rANS32x32 32blk 8w   15**               |  63.51 % |   15.11 clk/byte |   283.41 MiB/s |   3.10 clk/byte |  1381.63 MiB/s |
-| **rANS32x16 16w        13**               |  63.61 % |   13.14 clk/byte |   325.92 MiB/s |   3.60 clk/byte |  1190.23 MiB/s |
-| **rANS32x16 16w        14**               |  63.55 % |   13.37 clk/byte |   320.41 MiB/s |   3.64 clk/byte |  1175.92 MiB/s |
-| **rANS32x16 16w        15**               |  63.57 % |   13.28 clk/byte |   322.51 MiB/s |   4.21 clk/byte |  1017.12 MiB/s |
-| fse                                       |  63.2 %  |   -              |   736.10 MiB/s |  -              |   966.58 MiB/s |
-| TurboANX 1                                |  66.4 %  |   -              |   522.13 MiB/s |  -              |   942.43 MiB/s |
-| htscodecs_rans32avx512 1                  |  51.6 %  |   -              |   168.22 MiB/s |  -              |   322.22 MiB/s |
-| htscodecs_rans32avx2 1                    |  51.6 %  |   -              |   177.36 MiB/s |  -              |   319.15 MiB/s |
-| FastHF                                    |  63.6 %  |   -              |   189.84 MiB/s |  -              |   151.62 MiB/s |
-| FastAC                                    |  63.2 %  |   -              |   223.06 MiB/s |  -              |    84.37 MiB/s |
-| htscodecs_arith_dyn 1                     |  47.8 %  |   -              |    89.60 MiB/s |  -              |    81.63 MiB/s |
-| htscodecs_arith_dyn 0                     |  62.0 %  |   -              |    88.09 MiB/s |  -              |    75.05 MiB/s |
-
-The following benchmarks demonstrate, apart from incredibly high decompression speeds, how terrible the histogram generation currently is:
+| **rANS32x64 16w        10**               |  65.59 % |   12.83 clk/byte |   341.55 MiB/s |   1.43 clk/byte |  2,989.66 MiB/s |
+| **rANS32x64 16w        11**               |  64.33 % |   12.34 clk/byte |   347.24 MiB/s |   1.44 clk/byte |  2,973.71 MiB/s |
+| **rANS32x64 16w        12**               |  63.81 % |   12.51 clk/byte |   342.31 MiB/s |   1.44 clk/byte |  2,967.92 MiB/s |
+| TurboANX 63                               |  63.4 %  |   -              |   981.79 MiB/s |  -              |  2,964.02 MiB/s |
+| TurboANX 48                               |  63.3 %  |   -              |   969.72 MiB/s |  -              |  2,917.59 MiB/s |
+| TurboANX 40                               |  63.2 %  |   -              |   964.45 MiB/s |  -              |  2,883.45 MiB/s |
+| TurboANX 32                               |  66.4 %  |   -              |   951.53 MiB/s |  -              |  2,856.26 MiB/s |
+| **rANS32x32 16w        11**               |  64.33 % |   12.86 clk/byte |   333.03 MiB/s |   1.50 clk/byte |  2,856.20 MiB/s |
+| **rANS32x32 16w        10**               |  65.59 % |   12.80 clk/byte |   334.68 MiB/s |   1.51 clk/byte |  2,845.56 MiB/s |
+| TurboANX 24                               |  63.0 %  |   -              |   936.12 MiB/s |  -              |  2,765.31 MiB/s |
+| TurboANX 16                               |  62.8 %  |   -              |   902.32 MiB/s |  -              |  2,631.85 MiB/s |
+| **rANS32x32 16w        12**               |  63.81 % |   12.83 clk/byte |   343.55 MiB/s |   1.54 clk/byte |  2,784.13 MiB/s |
+| fsehuf                                    |  63.4 %  |   -              | 1,581.32 MiB/s |  -              |  2,515.23 MiB/s |
+| htscodecs rans32avx2 0                    |  63.5 %  |   -              | 1,041.93 MiB/s |  -              |  2,374.04 MiB/s |
+| TurboANX 8                                |  62.7 %  |   -              |   823.76 MiB/s |  -              |  2,347.10 MiB/s |
+| **rANS32x32 32blk 16w  12**               |  63.81 % |   12.62 clk/byte |   339.50 MiB/s |   1.85 clk/byte |  2,312.10 MiB/s |
+| **rANS32x32 32blk 16w  11**               |  64.33 % |   12.67 clk/byte |   338.00 MiB/s |   1.86 clk/byte |  2,299.31 MiB/s |
+| **rANS32x32 32blk 16w  10**               |  65.59 % |   12.91 clk/byte |   331.80 MiB/s |   1.87 clk/byte |  2,289.10 MiB/s |
+| htscodecs rans32avx512 0                  |  63.5 %  |   -              |   796.70 MiB/s |  -              |  2,221.93 MiB/s |
+| **rANS32x32 32blk 8w   11**               |  64.33 % |   15.01 clk/byte |   285.45 MiB/s |   2.15 clk/byte |  1,988.10 MiB/s |
+| **rANS32x32 32blk 8w   12**               |  63.82 % |   15.15 clk/byte |   282.80 MiB/s |   2.16 clk/byte |  1,984.68 MiB/s |
+| **rANS32x32 32blk 8w   10**               |  65.60 % |   14.70 clk/byte |   291.41 MiB/s |   2.17 clk/byte |  1,977.26 MiB/s |
+| htscodecs rans32sse 0                     |  63.5 %  |   -              |   732.08 MiB/s |  -              |  1,948.66 MiB/s |
+| TurboANX 4                                |  63.0 %  |   -              |   706.92 MiB/s |  -              |  1,929.18 MiB/s |
+| **rANS32x64 16w        13**               |  63.61 % |   12.32 clk/byte |   348.13 MiB/s |   2.29 clk/byte |  1,872.44 MiB/s |
+| **rANS32x64 16w        14**               |  63.55 % |   12.36 clk/byte |   346.57 MiB/s |   2.28 clk/byte |  1,876.95 MiB/s |
+| **rANS32x64 16w        15**               |  63.57 % |   12.30 clk/byte |   350.49 MiB/s |   2.34 clk/byte |  1,828.28 MiB/s |
+| **rANS32x32 16w        13**               |  63.61 % |   12.55 clk/byte |   341.20 MiB/s |   2.38 clk/byte |  1,800.28 MiB/s |
+| **rANS32x32 16w        14**               |  63.55 % |   12.54 clk/byte |   341.70 MiB/s |   2.39 clk/byte |  1,795.66 MiB/s |
+| **rANS32x16 16w        10**               |  65.59 % |   13.26 clk/byte |   323.07 MiB/s |   2.54 clk/byte |  1,684.80 MiB/s |
+| **rANS32x16 16w        12**               |  63.81 % |   13.21 clk/byte |   324.24 MiB/s |   2.55 clk/byte |  1,681.73 MiB/s |
+| **rANS32x16 16w        11**               |  64.33 % |   13.25 clk/byte |   323.17 MiB/s |   2.55 clk/byte |  1,676.41 MiB/s |
+| **rANS32x32 16w        15**               |  63.57 % |   12.94 clk/byte |   342.60 MiB/s |   2.56 clk/byte |  1,675.11 MiB/s |
+| **rANS32x32 32blk 16w  14**               |  63.55 % |   13.02 clk/byte |   329.08 MiB/s |   2.66 clk/byte |  1,607.26 MiB/s |
+| **rANS32x32 32blk 16w  13**               |  63.61 % |   12.56 clk/byte |   341.16 MiB/s |   2.71 clk/byte |  1,582.28 MiB/s |
+| **rANS32x32 32blk 16w  15**               |  63.57 % |   13.21 clk/byte |   324.33 MiB/s |   2.76 clk/byte |  1,550.93 MiB/s |
+| **rANS32x32 32blk 8w   13**               |  63.60 % |   15.07 clk/byte |   284.24 MiB/s |   2.98 clk/byte |  1,438.01 MiB/s |
+| **rANS32x32 32blk 8w   14**               |  63.53 % |   15.06 clk/byte |   284.45 MiB/s |   3.00 clk/byte |  1,429.24 MiB/s |
+| TurboANX 2                                |  64.0 %  |   -              |   656.86 MiB/s |  -              |  1,416.33 MiB/s |
+| **rANS32x32 32blk 8w   15**               |  63.51 % |   15.11 clk/byte |   283.41 MiB/s |   3.10 clk/byte |  1,381.63 MiB/s |
+| **rANS32x16 16w        13**               |  63.61 % |   13.14 clk/byte |   325.92 MiB/s |   3.60 clk/byte |  1,190.23 MiB/s |
+| **rANS32x16 16w        14**               |  63.55 % |   13.37 clk/byte |   320.41 MiB/s |   3.64 clk/byte |  1,175.92 MiB/s |
+| **rANS32x16 16w        15**               |  63.57 % |   13.28 clk/byte |   322.51 MiB/s |   4.21 clk/byte |  1,017.12 MiB/s |
+| fse                                       |  63.2 %  |   -              |   736.10 MiB/s |  -              |    966.58 MiB/s |
+| TurboANX 1                                |  66.4 %  |   -              |   522.13 MiB/s |  -              |    942.43 MiB/s |
+| htscodecs rans32avx512 1                  |  51.6 %  |   -              |   168.22 MiB/s |  -              |    322.22 MiB/s |
+| htscodecs rans32avx2 1                    |  51.6 %  |   -              |   177.36 MiB/s |  -              |    319.15 MiB/s |
+| FastHF                                    |  63.6 %  |   -              |   189.84 MiB/s |  -              |    151.62 MiB/s |
+| FastAC                                    |  63.2 %  |   -              |   223.06 MiB/s |  -              |     84.37 MiB/s |
+| htscodecs arith_dyn 1                     |  47.8 %  |   -              |    89.60 MiB/s |  -              |     81.63 MiB/s |
+| htscodecs arith_dyn 0                     |  62.0 %  |   -              |    88.09 MiB/s |  -              |     75.05 MiB/s |
 
 ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus)
-| Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
+| Codec Type | License | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w                    11**   |  82.60 % |   13.75 clk/byte |   311.60 MiB/s |   1.39 clk/byte |  3079.98 MiB/s |
-| **rANS32x64 16w                    10**   |  82.66 % |   14.03 clk/byte |   305.22 MiB/s |   1.42 clk/byte |  3026.65 MiB/s |
-| TurboANX 63                               |  79.6  % |   -              |   989.68 MiB/s |   -             |  2966.83 MiB/s |
-| TurboANX 48                               |  79.6  % |   -              |   979.24 MiB/s |   -             |  2923.90 MiB/s |
-| TurboANX 40                               |  79.7  % |   -              |   982.57 MiB/s |   -             |  2904.99 MiB/s |
-| **rANS32x64 16w                    12**   |  82.57 % |   13.99 clk/byte |   306.19 MiB/s |   1.48 clk/byte |  2900.31 MiB/s |
-| TurboANX 32                               |  79.7  % |   -              |   973.82 MiB/s |   -             |  2860.76 MiB/s |
-| **rANS32x32 16w                    11**   |  82.60 % |   14.31 clk/byte |   299.31 MiB/s |   1.50 clk/byte |  2851.47 MiB/s |
-| **rANS32x32 16w                    10**   |  82.66 % |   13.82 clk/byte |   309.99 MiB/s |   1.52 clk/byte |  2822.97 MiB/s |
-| TurboANX 24                               |  79.8  % |   -              |   962.68 MiB/s |   -             |  2785.82 MiB/s |
-| **rANS32x32 16w                    12**   |  82.57 % |   13.95 clk/byte |   306.97 MiB/s |   1.59 clk/byte |  2693.99 MiB/s |
-| TurboANX 16                               |  79.9  % |   -              |   937.33 MiB/s |   -             |  2661.07 MiB/s |
-| TurboANX 8                                |  80.5  % |   -              |   864.63 MiB/s |   -             |  2360.30 MiB/s |
-| htscodecs_rans32avx2 0                    |  80.6  % |   -              |   966.58 MiB/s |   -             |  2244.87 MiB/s |
-| htscodecs_rans32avx512 0                  |  80.6  % |   -              |   739.14 MiB/s |   -             |  2139.47 MiB/s |
-| fsehuf                                    |  80.0  % |   -              |  1395.71 MiB/s |   -             |  1946.34 MiB/s |
-| htscodecs_rans32sse 0                     |  80.6  % |   -              |   723.48 MiB/s |   -             |  1914.15 MiB/s |
-| **rANS32x64 16w                    13**   |  82.57 % |   13.94 clk/byte |   307.28 MiB/s |   2.25 clk/byte |  1903.01 MiB/s |
-| TurboANX 4                                |  81.9  % |   -              |   677.08 MiB/s |   -             |  1883.40 MiB/s |
-| **rANS32x64 16w                    14**   |  82.58 % |   14.09 clk/byte |   304.01 MiB/s |   2.29 clk/byte |  1870.17 MiB/s |
-| **rANS32x32 16w                    13**   |  82.57 % |   13.97 clk/byte |   306.60 MiB/s |   2.31 clk/byte |  1855.99 MiB/s |
-| **rANS32x64 16w                    15**   |  82.63 % |   13.88 clk/byte |   308.52 MiB/s |   2.39 clk/byte |  1793.13 MiB/s |
-| **rANS32x32 16w                    14**   |  82.58 % |   13.91 clk/byte |   307.92 MiB/s |   2.45 clk/byte |  1749.16 MiB/s |
-| **rANS32x32 16w                    15**   |  82.63 % |   14.20 clk/byte |   301.70 MiB/s |   2.59 clk/byte |  1654.49 MiB/s |
-| TurboANX 2                                |  83.7  % |   -              |   600.46 MiB/s |   -             |  1292.65 MiB/s |
-| fse                                       |  80.3  % |   -              |   696.88 MiB/s |   -             |   990.39 MiB/s |
-| TurboANX 1                                |  85.1  % |   -              |   387.40 MiB/s |   -             |   719.84 MiB/s |
-| htscodecs_rans32avx2 1                    |  74.4  % |   -              |   114.89 MiB/s |   -             |   229.78 MiB/s |
-| htscodecs_rans32avx512 1                  |  74.4  % |   -              |   104.87 MiB/s |   -             |   220.91 MiB/s |
-| FastHF                                    |  80.0  % |   -              |   183.35 MiB/s |   -             |   144.30 MiB/s |
-| FastAC                                    |  79.7  % |   -              |   244.35 MiB/s |   -             |    77.33 MiB/s |
-| htscodecs_arith_dyn 1                     |  67.6  % |   -              |    45.13 MiB/s |   -             |    45.67 MiB/s |
-| htscodecs_arith_dyn 0                     |  79.6  % |   -              |    47.12 MiB/s |   -             |    45.40 MiB/s |
+| **rANS32x64 16w 11 (raw)**   | BSD-2  |  82.60 % |   311.60 MiB/s |   1.39 clk/byte |  3,079.98 MiB/s |
+| **rANS32x64 16w 12**         | BSD-2  |  80.17 % |   193.60 MiB/s |   1.41 clk/byte |  3,048.15 MiB/s |
+| **rANS32x64 16w 12 (raw)**   | BSD-2  |  82.57 % |   308.10 MiB/s |   1.41 clk/byte |  3,041.07 MiB/s |
+| **rANS32x64 16w 10**         | BSD-2  |  80.81 % |   193.28 MiB/s |   1.41 clk/byte |  3,040.97 MiB/s |
+| **rANS32x64 16w 10 (raw)**   | BSD-2  |  82.83 % |   305.96 MiB/s |   1.42 clk/byte |  3,027.01 MiB/s |
+| **rANS32x64 16w 11**         | BSD-2  |  80.24 % |   186.41 MiB/s |   1.42 clk/byte |  3,015.25 MiB/s |
+| TurboANX 63                  | -      |  79.6  % |   989.68 MiB/s |   -             |  2,966.83 MiB/s |
+| TurboANX 48                  | -      |  79.6  % |   979.24 MiB/s |   -             |  2,923.90 MiB/s |
+| TurboANX 40                  | -      |  79.7  % |   982.57 MiB/s |   -             |  2,904.99 MiB/s |
+| **rANS32x32 16w 11 (raw)**   | BSD-2  |  82.60 % |   303.34 MiB/s |   1.48 clk/byte |  2,886.18 MiB/s |
+| **rANS32x32 16w 10 (raw)**   | BSD-2  |  82.83 % |   301.23 MiB/s |   1.49 clk/byte |  2,881.42 MiB/s |
+| **rANS32x32 16w 12 (raw)**   | BSD-2  |  82.57 % |   307.10 MiB/s |   1.49 clk/byte |  2,872.78 MiB/s |
+| TurboANX 32                  | -      |  79.7  % |   973.82 MiB/s |   -             |  2,860.76 MiB/s |
+| **rANS32x32 16w 10**         | BSD-2  |  80.81 % |   192.99 MiB/s |   1.51 clk/byte |  2,841.71 MiB/s |
+| **rANS32x32 16w 11**         | BSD-2  |  80.24 % |   190.01 MiB/s |   1.51 clk/byte |  2,834.43 MiB/s |
+| **rANS32x32 16w 12**         | BSD-2  |  80.53 % |   195.09 MiB/s |   1.54 clk/byte |  2,787.94 MiB/s |
+| TurboANX 24                  | -      |  79.8  % |   962.68 MiB/s |   -             |  2,785.82 MiB/s |
+| TurboANX 16                  | -      |  79.9  % |   937.33 MiB/s |   -             |  2,661.07 MiB/s |
+| TurboANX 8                   | -      |  80.5  % |   864.63 MiB/s |   -             |  2,360.30 MiB/s |
+| htscodecs rans32avx2 0       | BSD-3  |  80.6  % |   966.58 MiB/s |   -             |  2,244.87 MiB/s |
+| htscodecs rans32avx512 0     | BSD-3  |  80.6  % |   739.14 MiB/s |   -             |  2,139.47 MiB/s |
+| FSE Huff0                    | BSD-2  |  80.0  % | 1,395.71 MiB/s |   -             |  1,946.34 MiB/s |
+| htscodecs rans32sse 0        | BSD-3  |  80.6  % |   723.48 MiB/s |   -             |  1,914.15 MiB/s |
+| **rANS32x64 16w 13 (raw)**   | BSD-2  |  82.57 % |   305.45 MiB/s |   2.24 clk/byte |  1,910.60 MiB/s |
+| **rANS32x64 16w 14 (raw)**   | BSD-2  |  82.58 % |   308.96 MiB/s |   2.25 clk/byte |  1,903.66 MiB/s |
+| **rANS32x64 16w 13**         | BSD-2  |  79.98 % |   191.74 MiB/s |   2.26 clk/byte |  1,892.64 MiB/s |
+| TurboANX 4                   | -      |  81.9  % |   677.08 MiB/s |   -             |  1,883.40 MiB/s |
+| **rANS32x32 16w 13 (raw)**   | BSD-2  |  82.57 % |   305.00 MiB/s |   2.29 clk/byte |  1,870.26 MiB/s |
+| **rANS32x64 16w 15 (raw)**   | BSD-2  |  82.63 % |   307.44 MiB/s |   2.30 clk/byte |  1,865.65 MiB/s |
+| **rANS32x32 16w 14 (raw)**   | BSD-2  |  82.58 % |   306.18 MiB/s |   2.30 clk/byte |  1,865.18 MiB/s |
+| **rANS32x64 16w 14**         | BSD-2  |  80.02 % |   192.71 MiB/s |   2.30 clk/byte |  1,861.42 MiB/s |
+| **rANS32x32 16w 13**         | BSD-2  |  80.01 % |   196.93 MiB/s |   2.37 clk/byte |  1,808.33 MiB/s |
+| **rANS32x64 16w 15**         | BSD-2  |  80.25 % |   193.85 MiB/s |   2.42 clk/byte |  1,773.42 MiB/s |
+| **rANS32x32 16w 14**         | BSD-2  |  80.06 % |   198.86 MiB/s |   2.42 clk/byte |  1,767.12 MiB/s |
+| **rANS32x32 16w 15 (raw)**   | BSD-2  |  82.63 % |   304.21 MiB/s |   2.44 clk/byte |  1,758.57 MiB/s |
+| **rANS32x32 16w 15**         | BSD-2  |  80.06 % |   191.91 MiB/s |   2.70 clk/byte |  1,585.77 MiB/s |
+| TurboANX 2                   | -      |  83.7  % |   600.46 MiB/s |   -             |  1,292.65 MiB/s |
+| FSE                          | BSD-2  |  80.3  % |   696.88 MiB/s |   -             |    990.39 MiB/s |
+| TurboANX 1                   | -      |  85.1  % |   387.40 MiB/s |   -             |    719.84 MiB/s |
+| htscodecs rans32avx2 1       | BSD-3  |  74.4  % |   114.89 MiB/s |   -             |    229.78 MiB/s |
+| htscodecs rans32avx512 1     | BSD-3  |  74.4  % |   104.87 MiB/s |   -             |    220.91 MiB/s |
+| FastHF                       | Custom |  80.0  % |   183.35 MiB/s |   -             |    144.30 MiB/s |
+| FastAC                       | Custom |  79.7  % |   244.35 MiB/s |   -             |     77.33 MiB/s |
+| htscodecs arith_dyn 1        | BSD-3  |  67.6  % |    45.13 MiB/s |   -             |     45.67 MiB/s |
+| htscodecs arith_dyn 0        | BSD-3  |  79.6  % |    47.12 MiB/s |   -             |     45.40 MiB/s |
 
 ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus)
 | Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w                    11**   |  77.82 % |   13.84 clk/byte |   309.39 MiB/s |   1.44 clk/byte |  2978.20 MiB/s |
-| **rANS32x64 16w                    10**   |  77.92 % |   14.16 clk/byte |   302.46 MiB/s |   1.44 clk/byte |  2968.99 MiB/s |
-| TurboANX 63                               |  70.1  % |   -              |   965.97 MiB/s |   -             |  2959.13 MiB/s |
-| **rANS32x64 16w                    12**   |  77.79 % |   14.21 clk/byte |   301.44 MiB/s |   1.45 clk/byte |  2946.52 MiB/s |
-| TurboANX 48                               |  69.6  % |   -              |   954.87 MiB/s |   -             |  2911.55 MiB/s |
-| **rANS32x32 16w                    10**   |  77.92 % |   13.97 clk/byte |   306.54 MiB/s |   1.49 clk/byte |  2878.05 MiB/s |
-| TurboANX 40                               |  69.3  % |   -              |   941.29 MiB/s |   -             |  2869.21 MiB/s |
-| **rANS32x32 16w                    11**   |  77.82 % |   14.34 clk/byte |   298.79 MiB/s |   1.49 clk/byte |  2867.33 MiB/s |
-| TurboANX 32                               |  68.9  % |   -              |   927.04 MiB/s |   -             |  2815.87 MiB/s |
-| **rANS32x32 16w                    12**   |  77.79 % |   14.25 clk/byte |   300.51 MiB/s |   1.54 clk/byte |  2782.35 MiB/s |
-| TurboANX 24                               |  68.4  % |   -              |   900.92 MiB/s |   -             |  2732.74 MiB/s |
-| TurboANX 16                               |  67.9  % |   -              |   854.34 MiB/s |   -             |  2582.05 MiB/s |
-| htscodecs_rans32avx2 0                    |  69.5  % |   -              |  1014.19 MiB/s |   -             |  2250.58 MiB/s |
-| TurboANX 8                                |  67.2  % |   -              |   748.14 MiB/s |   -             |  2183.29 MiB/s |
-| htscodecs_rans32avx512 0                  |  69.5  % |   -              |   760.33 MiB/s |   -             |  2115.31 MiB/s |
-| fsehuf                                    |  69.2  % |   -              |  1491.60 MiB/s |   -             |  2092.00 MiB/s |
-| **rANS32x32 16w                    14**   |  77.79 % |   14.02 clk/byte |   305.49 MiB/s |   2.37 clk/byte |  1804.10 MiB/s |
-| **rANS32x64 16w                    14**   |  77.79 % |   14.09 clk/byte |   303.97 MiB/s |   2.26 clk/byte |  1891.46 MiB/s |
-| htscodecs_rans32sse 0                     |  69.5  % |   -              |   724.39 MiB/s |   -             |  1884.40 MiB/s |
-| **rANS32x64 16w                    13**   |  77.79 % |   13.89 clk/byte |   308.28 MiB/s |   2.27 clk/byte |  1883.91 MiB/s |
-| **rANS32x64 16w                    15**   |  77.85 % |   13.86 clk/byte |   309.13 MiB/s |   2.31 clk/byte |  1855.74 MiB/s |
-| **rANS32x32 16w                    13**   |  77.78 % |   14.13 clk/byte |   303.23 MiB/s |   2.37 clk/byte |  1806.03 MiB/s |
-| **rANS32x32 16w                    15**   |  77.84 % |   14.29 clk/byte |   299.78 MiB/s |   2.46 clk/byte |  1743.60 MiB/s |
-| TurboANX 4                                |  67.3  % |   -              |   603.91 MiB/s |   -             |  1658.68 MiB/s |
-| TurboANX 2                                |  68.5  % |   -              |   556.95 MiB/s |   -             |  1106.06 MiB/s |
-| fse                                       |  69.3  % |   -              |   713.08 MiB/s |   -             |   973.71 MiB/s |
-| TurboANX 1                                |  71.6  % |   -              |   392.67 MiB/s |   -             |   677.10 MiB/s |
-| htscodecs_rans32avx512 1                  |  55.7  % |   -              |    81.02 MiB/s |   -             |   168.42 MiB/s |
-| htscodecs_rans32avx2 1                    |  55.7  % |   -              |    83.68 MiB/s |   -             |   167.19 MiB/s |
-| FastHF                                    |  71.8  % |   -              |   174.86 MiB/s |   -             |   130.78 MiB/s |
-| FastAC                                    |  70.7  % |   -              |   234.95 MiB/s |   -             |    81.01 MiB/s |
-| htscodecs_arith_dyn 1                     |  52.1  % |   -              |    62.87 MiB/s |   -             |    62.98 MiB/s |
-| htscodecs_arith_dyn 0                     |  66.4  % |   -              |    63.82 MiB/s |   -             |    59.92 MiB/s |
+| **rANS32x64 16w                    11**   |  77.82 % |   13.84 clk/byte |   309.39 MiB/s |   1.44 clk/byte |  2,978.20 MiB/s |
+| **rANS32x64 16w                    10**   |  77.92 % |   14.16 clk/byte |   302.46 MiB/s |   1.44 clk/byte |  2,968.99 MiB/s |
+| TurboANX 63                               |  70.1  % |   -              |   965.97 MiB/s |   -             |  2,959.13 MiB/s |
+| **rANS32x64 16w                    12**   |  77.79 % |   14.21 clk/byte |   301.44 MiB/s |   1.45 clk/byte |  2,946.52 MiB/s |
+| TurboANX 48                               |  69.6  % |   -              |   954.87 MiB/s |   -             |  2,911.55 MiB/s |
+| **rANS32x32 16w                    10**   |  77.92 % |   13.97 clk/byte |   306.54 MiB/s |   1.49 clk/byte |  2,878.05 MiB/s |
+| TurboANX 40                               |  69.3  % |   -              |   941.29 MiB/s |   -             |  2,869.21 MiB/s |
+| **rANS32x32 16w                    11**   |  77.82 % |   14.34 clk/byte |   298.79 MiB/s |   1.49 clk/byte |  2,867.33 MiB/s |
+| TurboANX 32                               |  68.9  % |   -              |   927.04 MiB/s |   -             |  2,815.87 MiB/s |
+| **rANS32x32 16w                    12**   |  77.79 % |   14.25 clk/byte |   300.51 MiB/s |   1.54 clk/byte |  2,782.35 MiB/s |
+| TurboANX 24                               |  68.4  % |   -              |   900.92 MiB/s |   -             |  2,732.74 MiB/s |
+| TurboANX 16                               |  67.9  % |   -              |   854.34 MiB/s |   -             |  2,582.05 MiB/s |
+| htscodecs_rans32avx2 0                    |  69.5  % |   -              | 1,014.19 MiB/s |   -             |  2,250.58 MiB/s |
+| TurboANX 8                                |  67.2  % |   -              |   748.14 MiB/s |   -             |  2,183.29 MiB/s |
+| htscodecs_rans32avx512 0                  |  69.5  % |   -              |   760.33 MiB/s |   -             |  2,115.31 MiB/s |
+| fsehuf                                    |  69.2  % |   -              | 1,491.60 MiB/s |   -             |  2,092.00 MiB/s |
+| **rANS32x32 16w                    14**   |  77.79 % |   14.02 clk/byte |   305.49 MiB/s |   2.37 clk/byte |  1,804.10 MiB/s |
+| **rANS32x64 16w                    14**   |  77.79 % |   14.09 clk/byte |   303.97 MiB/s |   2.26 clk/byte |  1,891.46 MiB/s |
+| htscodecs_rans32sse 0                     |  69.5  % |   -              |   724.39 MiB/s |   -             |  1,884.40 MiB/s |
+| **rANS32x64 16w                    13**   |  77.79 % |   13.89 clk/byte |   308.28 MiB/s |   2.27 clk/byte |  1,883.91 MiB/s |
+| **rANS32x64 16w                    15**   |  77.85 % |   13.86 clk/byte |   309.13 MiB/s |   2.31 clk/byte |  1,855.74 MiB/s |
+| **rANS32x32 16w                    13**   |  77.78 % |   14.13 clk/byte |   303.23 MiB/s |   2.37 clk/byte |  1,806.03 MiB/s |
+| **rANS32x32 16w                    15**   |  77.84 % |   14.29 clk/byte |   299.78 MiB/s |   2.46 clk/byte |  1,743.60 MiB/s |
+| TurboANX 4                                |  67.3  % |   -              |   603.91 MiB/s |   -             |  1,658.68 MiB/s |
+| TurboANX 2                                |  68.5  % |   -              |   556.95 MiB/s |   -             |  1,106.06 MiB/s |
+| fse                                       |  69.3  % |   -              |   713.08 MiB/s |   -             |    973.71 MiB/s |
+| TurboANX 1                                |  71.6  % |   -              |   392.67 MiB/s |   -             |    677.10 MiB/s |
+| htscodecs_rans32avx512 1                  |  55.7  % |   -              |    81.02 MiB/s |   -             |    168.42 MiB/s |
+| htscodecs_rans32avx2 1                    |  55.7  % |   -              |    83.68 MiB/s |   -             |    167.19 MiB/s |
+| FastHF                                    |  71.8  % |   -              |   174.86 MiB/s |   -             |    130.78 MiB/s |
+| FastAC                                    |  70.7  % |   -              |   234.95 MiB/s |   -             |     81.01 MiB/s |
+| htscodecs_arith_dyn 1                     |  52.1  % |   -              |    62.87 MiB/s |   -             |     62.98 MiB/s |
+| htscodecs_arith_dyn 0                     |  66.4  % |   -              |    63.82 MiB/s |   -             |     59.92 MiB/s |
 
-Thanks to [James Bonfield](https://github.com/jkbonfield) I also have benchmarks for `htscodecs` (MB/s converted to MiB/s) and `hypersonic-rANS` on an `Intel i7-1185G7` (Tiger Lake) via WSL1 compiled with GCC, where the AVX-512 versions of the 32x64 codecs seemed to be particularly fast:
+## Easy Multithreading
+hypersonic-rANS includes a variant that encodes blocks independently (at the expense of compression ratio), allowing for easy multithreading.
 
-### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes)
-| Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
+### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus)
+| Codec Type | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w                    10**   |  65.59 % |   12.40 clk/byte |   230.40 MiB/s |   0.96 clk/byte |  2976.35 MiB/s |
-| **rANS32x64 16w                    11**   |  64.33 % |   12.32 clk/byte |   231.88 MiB/s |   0.97 clk/byte |  2947.61 MiB/s |
-| **rANS32x64 16w                    12**   |  63.81 % |   12.22 clk/byte |   233.65 MiB/s |   0.98 clk/byte |  2924.03 MiB/s |
-| htscodecs r32x16  -o4 -c 0x0404           |  63.64 % |   -              |   956.44 MiB/s |   -             |  2513.31 MiB/s |
-| **rANS32x64 16w                    13**   |  63.61 % |   12.05 clk/byte |   236.96 MiB/s |   1.24 clk/byte |  2307.39 MiB/s |
-| **rANS32x64 16w                    14**   |  63.55 % |   12.05 clk/byte |   236.97 MiB/s |   1.25 clk/byte |  2292.33 MiB/s |
-| **rANS32x64 16w                    15**   |  63.57 % |   11.82 clk/byte |   241.75 MiB/s |   1.27 clk/byte |  2250.60 MiB/s |
-| **rANS32x32 32blk 16w              10**   |  65.59 % |   12.52 clk/byte |   228.14 MiB/s |   1.44 clk/byte |  1989.49 MiB/s |
-| **rANS32x32 16w                    12**   |  63.81 % |   12.31 clk/byte |   232.08 MiB/s |   1.44 clk/byte |  1982.58 MiB/s |
-| **rANS32x32 16w                    10**   |  65.59 % |   12.77 clk/byte |   223.69 MiB/s |   1.45 clk/byte |  1972.50 MiB/s |
-| **rANS32x32 32blk 16w              11**   |  64.33 % |   11.85 clk/byte |   240.96 MiB/s |   1.48 clk/byte |  1933.49 MiB/s |
-| **rANS32x32 16w                    11**   |  64.33 % |   12.38 clk/byte |   230.65 MiB/s |   1.49 clk/byte |  1921.87 MiB/s |
-| htscodecs r32x16  -o4 -c 0x0202           |  63.64 % |   -              |   820.64 MiB/s |   -             |  1906.11 MiB/s |
-| **rANS32x32 32blk 16w              12**   |  63.81 % |   12.27 clk/byte |   232.80 MiB/s |   1.50 clk/byte |  1901.61 MiB/s |
-| **rANS32x32 32blk 8w               10**   |  65.60 % |   14.13 clk/byte |   202.20 MiB/s |   1.84 clk/byte |  1552.18 MiB/s |
-| **rANS32x32 16w                    13**   |  63.61 % |   11.96 clk/byte |   238.82 MiB/s |   1.89 clk/byte |  1514.47 MiB/s |
-| **rANS32x32 32blk 8w               11**   |  64.33 % |   14.42 clk/byte |   198.05 MiB/s |   1.91 clk/byte |  1496.66 MiB/s |
-| **rANS32x32 32blk 8w               12**   |  63.82 % |   14.63 clk/byte |   195.21 MiB/s |   1.93 clk/byte |  1477.22 MiB/s |
-| **rANS32x32 16w                    15**   |  63.57 % |   12.25 clk/byte |   233.13 MiB/s |   1.94 clk/byte |  1468.86 MiB/s |
-| **rANS32x32 32blk 16w              15**   |  63.57 % |   11.99 clk/byte |   238.28 MiB/s |   1.95 clk/byte |  1467.98 MiB/s |
-| **rANS32x32 16w                    14**   |  63.55 % |   12.78 clk/byte |   223.47 MiB/s |   1.99 clk/byte |  1437.32 MiB/s |
-| **rANS32x32 32blk 16w              14**   |  63.55 % |   11.76 clk/byte |   242.82 MiB/s |   2.03 clk/byte |  1405.81 MiB/s |
-| **rANS32x32 32blk 16w              13**   |  63.61 % |   11.85 clk/byte |   241.08 MiB/s |   2.07 clk/byte |  1379.67 MiB/s |
-| **rANS32x32 32blk 8w               14**   |  63.53 % |   14.31 clk/byte |   199.62 MiB/s |   2.33 clk/byte |  1224.75 MiB/s |
-| htscodecs r32x16  -o5 -c 0x0404           |  48.81 % |   -              |   533.10 MiB/s |   -             |  1221.94 MiB/s |
-| **rANS32x32 32blk 8w               13**   |  63.60 % |   14.43 clk/byte |   197.94 MiB/s |   2.36 clk/byte |  1209.93 MiB/s |
-| htscodecs r32x16  -o4 -c 0x0101           |  63.64 % |   -              |   544.55 MiB/s |   -             |  1189.42 MiB/s |
-| **rANS32x32 32blk 8w               15**   |  63.51 % |   14.36 clk/byte |   198.92 MiB/s |   2.45 clk/byte |  1167.62 MiB/s |
-| htscodecs r32x16  -o5 -c 0x0202           |  48.81 % |   -              |   406.07 MiB/s |   -             |  1020.72 MiB/s |
-| **rANS32x16 16w                    11**   |  64.33 % |   14.21 clk/byte |   201.05 MiB/s |   2.85 clk/byte |  1001.90 MiB/s |
-| **rANS32x16 16w                    13**   |  63.61 % |   12.81 clk/byte |   223.01 MiB/s |   2.86 clk/byte |   997.44 MiB/s |
-| **rANS32x16 16w                    10**   |  65.59 % |   14.68 clk/byte |   194.52 MiB/s |   2.92 clk/byte |   976.98 MiB/s |
-| **rANS32x16 16w                    12**   |  63.81 % |   12.80 clk/byte |   223.21 MiB/s |   3.02 clk/byte |   946.74 MiB/s |
-| **rANS32x16 16w                    14**   |  63.55 % |   13.51 clk/byte |   211.37 MiB/s |   3.28 clk/byte |   870.14 MiB/s |
-| htscodecs r32x16  -o4 -c 0x0000           |  63.64 % |   -              |   548.17 MiB/s |   -             |   847.63 MiB/s |
-| **rANS32x16 16w                    15**   |  63.57 % |   12.67 clk/byte |   225.40 MiB/s |   3.55 clk/byte |   805.70 MiB/s |
-| htscodecs r4x16   -o0                     |  63.64 % |   -              |   543.40 MiB/s |   -             |   803.85 MiB/s |
-| htscodecs r32x16  -o5 -c 0x0101           |  48.81 % |   -              |   294.49 MiB/s |   -             |   688.74 MiB/s |
-| htscodecs r4x8    -o0                     |  63.64 % |   -              |   323.87 MiB/s |   -             |   506.50 MiB/s |
-| htscodecs r32x16  -o5 -c 0x0000           |  48.81 % |   -              |   290.49 MiB/s |   -             |   466.92 MiB/s |
-| htscodecs r4x16   -o1                     |  48.80 % |   -              |   328.54 MiB/s |   -             |   449.47 MiB/s |
-| htscodecs r4x8    -o1                     |  49.13 % |   -              |   324.06 MiB/s |   -             |   300.22 MiB/s |
+| rANS32x64 16w 10 mt |  80.23 % |   200.03 MiB/s |    0.24 clk/byte | 18,035.77 MiB/s |
+| rANS32x32 16w 10 mt |  80.17 % |   194.73 MiB/s |    0.25 clk/byte | 17,834.38 MiB/s |
+| rANS32x64 16w 11 mt |  80.08 % |   202.10 MiB/s |    0.26 clk/byte | 16,210.44 MiB/s |
+| rANS32x32 16w 11 mt |  80.02 % |   191.90 MiB/s |    0.27 clk/byte | 15,630.58 MiB/s |
+| rANS32x64 16w 12 mt |  80.05 % |   197.62 MiB/s |    0.34 clk/byte | 13,207.00 MiB/s |
+| rANS32x32 16w 12 mt |  79.99 % |   197.21 MiB/s |    0.36 clk/byte | 12,358.57 MiB/s |
+| rANS32x64 16w 13 mt |  80.04 % |   199.94 MiB/s |    0.37 clk/byte | 11,938.77 MiB/s |
+| rANS32x32 16w 13 mt |  79.99 % |   195.00 MiB/s |    0.37 clk/byte | 11,497.36 MiB/s |
+| rANS32x64 16w 14 mt |  80.05 % |   199.87 MiB/s |    0.42 clk/byte | 10,318.01 MiB/s |
+| rANS32x32 16w 14 mt |  80.01 % |   190.94 MiB/s |    0.42 clk/byte | 10,134.59 MiB/s |
+| rANS32x64 16w 15 mt |  80.09 % |   200.59 MiB/s |    0.59 clk/byte |  7,308.43 MiB/s |
+| rANS32x32 16w 15 mt |  80.03 % |   192.28 MiB/s |    0.62 clk/byte |  7,024.69 MiB/s |
 
 ## Building
 ### On Linux/WSL
diff --git a/docs/index.html b/docs/index.html
index 47bea27..8aa291c 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -693,6 +693,100 @@ <h3>enwik8 (wikipedia extract)</h3>
             obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("enwik8")); });
         </script>
       </div>
+      <div class="graph_container">
+        <h3>x-ray (medical x-ray image)</h3>
+        <p>Order 1 codecs are excluded: they're not particularly fast, but they compress a lot better, which would make the rest of the graph much harder to read.</p>
+        <!--
+        <div class="graph_metric_container">
+          <a class="graph_metricY r" onclick="javascript:setYMetric('entropy')">entropy</a>
+          <a class="graph_metricY l active" onclick="javascript:setYMetric('ratio')">ratio</a>
+        </div>
+        <div class="graph_metric_container">
+          <a class="graph_metricX r" onclick="javascript:setXMetric('encode')">encode</a>
+          <a class="graph_metricX l active" onclick="javascript:setXMetric('decode')">decode</a>
+        </div>
+        -->
+        <canvas id="xray" class="chart"></canvas>
+        <script>
+            var obj = document.getElementById("xray");
+            var ctx = obj.getContext("2d");
+            ctx.scale(pixelRatio(), pixelRatio());
+
+            init_obj(obj);
+
+            add_point_no_entropy(obj, 1758.57, 82.63,  304.21, "rANS32x32 16w 15 (raw)"  );
+            add_point_no_entropy(obj, 1865.18, 82.58,  306.18, "rANS32x32 16w 14 (raw)"  );
+            add_point_no_entropy(obj, 1870.26, 82.57,  305.00, "rANS32x32 16w 13 (raw)"  );
+            add_point_no_entropy(obj, 2872.78, 82.57,  307.10, "rANS32x32 16w 12 (raw)"  );
+            add_point_no_entropy(obj, 2886.18, 82.60,  303.34, "rANS32x32 16w 11 (raw)"  );
+            add_point_no_entropy(obj, 2881.42, 82.83,  301.23, "rANS32x32 16w 10 (raw)"  );
+
+            add_point_no_entropy(obj, 1865.65, 82.63,  307.44, "rANS32x64 16w 15 (raw)"  );
+            add_point_no_entropy(obj, 1903.66, 82.58,  308.96, "rANS32x64 16w 14 (raw)"  );
+            add_point_no_entropy(obj, 1910.60, 82.57,  305.45, "rANS32x64 16w 13 (raw)"  );
+            add_point_no_entropy(obj, 3041.07, 82.57,  308.10, "rANS32x64 16w 12 (raw)"  );
+            add_point_no_entropy(obj, 3079.98, 82.60,  311.60, "rANS32x64 16w 11 (raw)"  );
+            add_point_no_entropy(obj, 3027.01, 82.83,  305.96, "rANS32x64 16w 10 (raw)"  );
+
+            add_point_no_entropy(obj, 1585.77, 80.06,  191.91, "rANS32x32 16w 15"        );
+            add_point_no_entropy(obj, 1767.12, 80.06,  198.86, "rANS32x32 16w 14"        );
+            add_point_no_entropy(obj, 1808.33, 80.01,  196.93, "rANS32x32 16w 13"        );
+            add_point_no_entropy(obj, 2787.94, 80.53,  195.09, "rANS32x32 16w 12"        );
+            add_point_no_entropy(obj, 2834.43, 80.24,  190.01, "rANS32x32 16w 11"        );
+            add_point_no_entropy(obj, 2841.71, 80.81,  192.99, "rANS32x32 16w 10"        );
+
+            add_point_no_entropy(obj, 1773.42, 80.25,  193.85, "rANS32x64 16w 15"        );
+            add_point_no_entropy(obj, 1861.42, 80.02,  192.71, "rANS32x64 16w 14"        );
+            add_point_no_entropy(obj, 1892.64, 79.98,  191.74, "rANS32x64 16w 13"        );
+            add_point_no_entropy(obj, 3048.15, 80.17,  193.60, "rANS32x64 16w 12"        );
+            add_point_no_entropy(obj, 3015.25, 80.24,  186.41, "rANS32x64 16w 11"        );
+            add_point_no_entropy(obj, 3040.97, 80.81,  193.28, "rANS32x64 16w 10"        );
+
+            add_point_no_entropy(obj, 2966.83, 79.6,  989.68, "TurboANX 63 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2923.90, 79.6,  979.24, "TurboANX 48 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2904.99, 79.7,  982.57, "TurboANX 40 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2860.76, 79.7,  973.82, "TurboANX 32 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2785.82, 79.8,  962.68, "TurboANX 24 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2661.07, 79.9,  937.33, "TurboANX 16 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2360.30, 80.5,  864.63, "TurboANX 8  (Native Windows Build)", true);
+            add_point_no_entropy(obj, 1883.40, 81.9,  677.08, "TurboANX 4  (Native Windows Build)", true);
+            add_point_no_entropy(obj, 1292.65, 83.7,  600.46, "TurboANX 2  (Native Windows Build)", true);
+            add_point_no_entropy(obj,  719.84, 85.1,  387.40, "TurboANX 1  (Native Windows Build)", true);
+
+            add_point_no_entropy(obj, 1914.15, 80.6,  723.48, "htscodecs rans32sse 0"    , true);
+            add_point_no_entropy(obj, 2244.87, 80.6,  966.58, "htscodecs rans32avx2 0"   , true);
+            add_point_no_entropy(obj, 2139.47, 80.6,  739.14, "htscodecs rans32avx512 0" , true);
+            //add_point_no_entropy(obj,  229.78, 74.4,  114.89, "htscodecs rans32avx2 1"   , true);
+            //add_point_no_entropy(obj,  220.91, 74.4,  104.87, "htscodecs rans32avx512 1" , true);
+
+            add_point_no_entropy(obj,   45.40, 79.6,   47.12, "htscodecs arith_dyn 0"    , true);
+            //add_point_no_entropy(obj,   45.67, 67.6,   45.13, "htscodecs arith_dyn 1"    , true);
+
+            add_point_no_entropy(obj, 1946.34, 80.0, 1395.71, "FSE Huff0"                , true);
+            add_point_no_entropy(obj,  990.39, 80.3,  696.88, "FSE"                      , true);
+
+            add_point_no_entropy(obj,  144.30, 80.0,  183.35, "FastHF"                   , true);
+            add_point_no_entropy(obj,   77.33, 79.7,  244.35, "FastAC"                   , true);
+
+            add_line(obj, ["rANS32x32 16w 15", "rANS32x32 16w 14", "rANS32x32 16w 13", "rANS32x32 16w 12", "rANS32x32 16w 11", "rANS32x32 16w 10"], getColor());
+            add_line(obj, ["rANS32x64 16w 15", "rANS32x64 16w 14", "rANS32x64 16w 13", "rANS32x64 16w 12", "rANS32x64 16w 11", "rANS32x64 16w 10"], getColor());
+
+            add_line(obj, ["rANS32x32 16w 15 (raw)", "rANS32x32 16w 14 (raw)", "rANS32x32 16w 13 (raw)", "rANS32x32 16w 12 (raw)", "rANS32x32 16w 11 (raw)", "rANS32x32 16w 10 (raw)"], getColor());
+            add_line(obj, ["rANS32x64 16w 15 (raw)", "rANS32x64 16w 14 (raw)", "rANS32x64 16w 13 (raw)", "rANS32x64 16w 12 (raw)", "rANS32x64 16w 11 (raw)", "rANS32x64 16w 10 (raw)"], getColor());
+            
+            add_line(obj, ["FSE", "FSE Huff0" ], "#55555588");
+            add_line(obj, ["htscodecs rans32sse 0", "htscodecs rans32avx2 0", "htscodecs rans32avx512 0" ], "#55555588");
+            add_line(obj, ["FastAC", "FastHF"], "#55555588");
+            add_line(obj, ["TurboANX 1  (Native Windows Build)", "TurboANX 2  (Native Windows Build)", "TurboANX 4  (Native Windows Build)", "TurboANX 8  (Native Windows Build)", "TurboANX 16 (Native Windows Build)", "TurboANX 24 (Native Windows Build)", "TurboANX 32 (Native Windows Build)", "TurboANX 40 (Native Windows Build)", "TurboANX 48 (Native Windows Build)", "TurboANX 63 (Native Windows Build)"], "#55555588");
+
+            calc_pareto_line(obj);
+            calc_pareto_line2(obj);
+
+            window.addEventListener("resize", () => { draw_obj(document.getElementById("xray").getContext("2d"), document.getElementById("xray")); });
+            window.addEventListener("load", () => { draw_obj(document.getElementById("xray").getContext("2d"), document.getElementById("xray")); });
+            obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("xray")); });
+        </script>
+      </div>
     </div>
   </body>
 </html>
\ No newline at end of file

From bcf9678a4723de0dd5f5917afc8ba0fb3bcc87fb Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 16:41:48 +0200
Subject: [PATCH 26/34] Updating mozilla benchmark

---
 README.md       | 296 ++++++++++++++++++++++++------------------------
 docs/index.html | 208 ++++++++++++++++------------------
 src/main.cpp    |  26 ++---
 3 files changed, 264 insertions(+), 266 deletions(-)

diff --git a/README.md b/README.md
index b1ed15b..5fa70cb 100644
--- a/README.md
+++ b/README.md
@@ -27,164 +27,170 @@
 - Every best performing decoder variant requires AVX2. (The AVX-512 variants for 32x64 can be faster in rare circumstances, but they weren't in this benchmark)
 
 ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes)
-| Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
+| Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w        10**               |  65.59 % |   12.83 clk/byte |   341.55 MiB/s |   1.43 clk/byte |  2,989.66 MiB/s |
-| **rANS32x64 16w        11**               |  64.33 % |   12.34 clk/byte |   347.24 MiB/s |   1.44 clk/byte |  2,973.71 MiB/s |
-| **rANS32x64 16w        12**               |  63.81 % |   12.51 clk/byte |   342.31 MiB/s |   1.44 clk/byte |  2,967.92 MiB/s |
-| TurboANX 63                               |  63.4 %  |   -              |   981.79 MiB/s |  -              |  2,964.02 MiB/s |
-| TurboANX 48                               |  63.3 %  |   -              |   969.72 MiB/s |  -              |  2,917.59 MiB/s |
-| TurboANX 40                               |  63.2 %  |   -              |   964.45 MiB/s |  -              |  2,883.45 MiB/s |
-| TurboANX 32                               |  66.4 %  |   -              |   951.53 MiB/s |  -              |  2,856.26 MiB/s |
-| **rANS32x32 16w        11**               |  64.33 % |   12.86 clk/byte |   333.03 MiB/s |   1.50 clk/byte |  2,856.20 MiB/s |
-| **rANS32x32 16w        10**               |  65.59 % |   12.80 clk/byte |   334.68 MiB/s |   1.51 clk/byte |  2,845.56 MiB/s |
-| TurboANX 24                               |  63.0 %  |   -              |   936.12 MiB/s |  -              |  2,765.31 MiB/s |
-| TurboANX 16                               |  62.8 %  |   -              |   902.32 MiB/s |  -              |  2,631.85 MiB/s |
-| **rANS32x32 16w        12**               |  63.81 % |   12.83 clk/byte |   343.55 MiB/s |   1.54 clk/byte |  2,784.13 MiB/s |
-| fsehuf                                    |  63.4 %  |   -              | 1,581.32 MiB/s |  -              |  2,515.23 MiB/s |
-| htscodecs rans32avx2 0                    |  63.5 %  |   -              | 1,041.93 MiB/s |  -              |  2,374.04 MiB/s |
-| TurboANX 8                                |  62.7 %  |   -              |   823.76 MiB/s |  -              |  2,347.10 MiB/s |
-| **rANS32x32 32blk 16w  12**               |  63.81 % |   12.62 clk/byte |   339.50 MiB/s |   1.85 clk/byte |  2,312.10 MiB/s |
-| **rANS32x32 32blk 16w  11**               |  64.33 % |   12.67 clk/byte |   338.00 MiB/s |   1.86 clk/byte |  2,299.31 MiB/s |
-| **rANS32x32 32blk 16w  10**               |  65.59 % |   12.91 clk/byte |   331.80 MiB/s |   1.87 clk/byte |  2,289.10 MiB/s |
-| htscodecs rans32avx512 0                  |  63.5 %  |   -              |   796.70 MiB/s |  -              |  2,221.93 MiB/s |
-| **rANS32x32 32blk 8w   11**               |  64.33 % |   15.01 clk/byte |   285.45 MiB/s |   2.15 clk/byte |  1,988.10 MiB/s |
-| **rANS32x32 32blk 8w   12**               |  63.82 % |   15.15 clk/byte |   282.80 MiB/s |   2.16 clk/byte |  1,984.68 MiB/s |
-| **rANS32x32 32blk 8w   10**               |  65.60 % |   14.70 clk/byte |   291.41 MiB/s |   2.17 clk/byte |  1,977.26 MiB/s |
-| htscodecs rans32sse 0                     |  63.5 %  |   -              |   732.08 MiB/s |  -              |  1,948.66 MiB/s |
-| TurboANX 4                                |  63.0 %  |   -              |   706.92 MiB/s |  -              |  1,929.18 MiB/s |
-| **rANS32x64 16w        13**               |  63.61 % |   12.32 clk/byte |   348.13 MiB/s |   2.29 clk/byte |  1,872.44 MiB/s |
-| **rANS32x64 16w        14**               |  63.55 % |   12.36 clk/byte |   346.57 MiB/s |   2.28 clk/byte |  1,876.95 MiB/s |
-| **rANS32x64 16w        15**               |  63.57 % |   12.30 clk/byte |   350.49 MiB/s |   2.34 clk/byte |  1,828.28 MiB/s |
-| **rANS32x32 16w        13**               |  63.61 % |   12.55 clk/byte |   341.20 MiB/s |   2.38 clk/byte |  1,800.28 MiB/s |
-| **rANS32x32 16w        14**               |  63.55 % |   12.54 clk/byte |   341.70 MiB/s |   2.39 clk/byte |  1,795.66 MiB/s |
-| **rANS32x16 16w        10**               |  65.59 % |   13.26 clk/byte |   323.07 MiB/s |   2.54 clk/byte |  1,684.80 MiB/s |
-| **rANS32x16 16w        12**               |  63.81 % |   13.21 clk/byte |   324.24 MiB/s |   2.55 clk/byte |  1,681.73 MiB/s |
-| **rANS32x16 16w        11**               |  64.33 % |   13.25 clk/byte |   323.17 MiB/s |   2.55 clk/byte |  1,676.41 MiB/s |
-| **rANS32x32 16w        15**               |  63.57 % |   12.94 clk/byte |   342.60 MiB/s |   2.56 clk/byte |  1,675.11 MiB/s |
-| **rANS32x32 32blk 16w  14**               |  63.55 % |   13.02 clk/byte |   329.08 MiB/s |   2.66 clk/byte |  1,607.26 MiB/s |
-| **rANS32x32 32blk 16w  13**               |  63.61 % |   12.56 clk/byte |   341.16 MiB/s |   2.71 clk/byte |  1,582.28 MiB/s |
-| **rANS32x32 32blk 16w  15**               |  63.57 % |   13.21 clk/byte |   324.33 MiB/s |   2.76 clk/byte |  1,550.93 MiB/s |
-| **rANS32x32 32blk 8w   13**               |  63.60 % |   15.07 clk/byte |   284.24 MiB/s |   2.98 clk/byte |  1,438.01 MiB/s |
-| **rANS32x32 32blk 8w   14**               |  63.53 % |   15.06 clk/byte |   284.45 MiB/s |   3.00 clk/byte |  1,429.24 MiB/s |
-| TurboANX 2                                |  64.0 %  |   -              |   656.86 MiB/s |  -              |  1,416.33 MiB/s |
-| **rANS32x32 32blk 8w   15**               |  63.51 % |   15.11 clk/byte |   283.41 MiB/s |   3.10 clk/byte |  1,381.63 MiB/s |
-| **rANS32x16 16w        13**               |  63.61 % |   13.14 clk/byte |   325.92 MiB/s |   3.60 clk/byte |  1,190.23 MiB/s |
-| **rANS32x16 16w        14**               |  63.55 % |   13.37 clk/byte |   320.41 MiB/s |   3.64 clk/byte |  1,175.92 MiB/s |
-| **rANS32x16 16w        15**               |  63.57 % |   13.28 clk/byte |   322.51 MiB/s |   4.21 clk/byte |  1,017.12 MiB/s |
-| fse                                       |  63.2 %  |   -              |   736.10 MiB/s |  -              |    966.58 MiB/s |
-| TurboANX 1                                |  66.4 %  |   -              |   522.13 MiB/s |  -              |    942.43 MiB/s |
-| htscodecs rans32avx512 1                  |  51.6 %  |   -              |   168.22 MiB/s |  -              |    322.22 MiB/s |
-| htscodecs rans32avx2 1                    |  51.6 %  |   -              |   177.36 MiB/s |  -              |    319.15 MiB/s |
-| FastHF                                    |  63.6 %  |   -              |   189.84 MiB/s |  -              |    151.62 MiB/s |
-| FastAC                                    |  63.2 %  |   -              |   223.06 MiB/s |  -              |     84.37 MiB/s |
-| htscodecs arith_dyn 1                     |  47.8 %  |   -              |    89.60 MiB/s |  -              |     81.63 MiB/s |
-| htscodecs arith_dyn 0                     |  62.0 %  |   -              |    88.09 MiB/s |  -              |     75.05 MiB/s |
+| **rANS32x64 16w 11 (raw)**                | ✔️ |  64.48 % |   336.81 MiB/s |   1.42 clk/byte |  3,018.02 MiB/s |
+| **rANS32x64 16w 10 (raw)**                | ✔️ |  65.97 % |   335.28 MiB/s |   1.42 clk/byte |  3,013.45 MiB/s |
+| **rANS32x64 16w 12 (raw)**                | ✔️ |  63.83 % |   347.90 MiB/s |   1.42 clk/byte |  3,009.18 MiB/s |
+| TurboANX 63                               | ❌ |  63.4 %  |   981.79 MiB/s |  -              |  2,964.02 MiB/s |
+| **rANS32x64 16w 10**                      | ✔️ |  65.56 % |   239.77 MiB/s |   1.46 clk/byte |  2,934.64 MiB/s |
+| TurboANX 48                               | ❌ |  63.3 %  |   969.72 MiB/s |  -              |  2,917.59 MiB/s |
+| **rANS32x64 16w 11**                      | ✔️ |  64.30 % |   225.35 MiB/s |   1.47 clk/byte |  2,907.73 MiB/s |
+| TurboANX 40                               | ❌ |  63.2 %  |   964.45 MiB/s |  -              |  2,883.45 MiB/s |
+| **rANS32x64 16w 12**                      | ✔️ |  63.73 % |   230.37 MiB/s |   1.50 clk/byte |  2,856.76 MiB/s |
+| TurboANX 32                               | ❌ |  66.4 %  |   951.53 MiB/s |  -              |  2,856.26 MiB/s |
+| **rANS32x32 16w 10 (raw)**                | ✔️ |  65.97 % |   328.77 MiB/s |   1.52 clk/byte |  2,822.60 MiB/s |
+| **rANS32x32 16w 11 (raw)**                | ✔️ |  64.48 % |   332.10 MiB/s |   1.52 clk/byte |  2,817.60 MiB/s |
+| **rANS32x32 16w 12 (raw)**                | ✔️ |  63.83 % |   341.70 MiB/s |   1.53 clk/byte |  2,800.63 MiB/s |
+| TurboANX 24                               | ❌ |  63.0 %  |   936.12 MiB/s |  -              |  2,765.31 MiB/s |
+| **rANS32x32 16w 10**                      | ✔️ |  65.56 % |   237.21 MiB/s |   1.55 clk/byte |  2,765.18 MiB/s |
+| **rANS32x32 16w 11**                      | ✔️ |  64.30 % |   238.29 MiB/s |   1.57 clk/byte |  2,735.12 MiB/s |
+| **rANS32x32 16w 12**                      | ✔️ |  63.71 % |   243.00 MiB/s |   1.62 clk/byte |  2,642.01 MiB/s |
+| TurboANX 16                               | ❌ |  62.8 %  |   902.32 MiB/s |  -              |  2,631.85 MiB/s |
+| FSE Huff0                                 | ✔️ |  63.4 %  | 1,581.32 MiB/s |  -              |  2,515.23 MiB/s |
+| htscodecs rans32avx2 0                    | ✔️ |  63.5 %  | 1,041.93 MiB/s |  -              |  2,374.04 MiB/s |
+| TurboANX 8                                | ❌ |  62.7 %  |   823.76 MiB/s |  -              |  2,347.10 MiB/s |
+| htscodecs rans32avx512 0                  | ✔️ |  63.5 %  |   796.70 MiB/s |  -              |  2,221.93 MiB/s |
+| htscodecs rans32sse 0                     | ✔️ |  63.5 %  |   732.08 MiB/s |  -              |  1,948.66 MiB/s |
+| TurboANX 4                                | ❌ |  63.0 %  |   706.92 MiB/s |  -              |  1,929.18 MiB/s |
+| **rANS32x64 16w 14 (raw)**                | ✔️ |  63.55 % |   350.13 MiB/s |   2.22 clk/byte |  1,926.82 MiB/s |
+| **rANS32x64 16w 13 (raw)**                | ✔️ |  63.61 % |   345.16 MiB/s |   2.23 clk/byte |  1,924.81 MiB/s |
+| **rANS32x64 16w 15 (raw)**                | ✔️ |  63.57 % |   340.96 MiB/s |   2.30 clk/byte |  1,861.57 MiB/s |
+| **rANS32x64 16w 13**                      | ✔️ |  63.53 % |   232.05 MiB/s |   2.32 clk/byte |  1,846.34 MiB/s |
+| **rANS32x64 16w 14**                      | ✔️ |  63.47 % |   235.14 MiB/s |   2.33 clk/byte |  1,837.19 MiB/s |
+| **rANS32x32 16w 13 (raw)**                | ✔️ |  63.61 % |   344.26 MiB/s |   2.35 clk/byte |  1,818.86 MiB/s |
+| **rANS32x32 16w 14 (raw)**                | ✔️ |  63.55 % |   324.44 MiB/s |   2.37 clk/byte |  1,810.24 MiB/s |
+| **rANS32x32 16w 14**                      | ✔️ |  63.45 % |   252.28 MiB/s |   2.42 clk/byte |  1,772.88 MiB/s |
+| **rANS32x32 16w 13**                      | ✔️ |  63.52 % |   249.07 MiB/s |   2.42 clk/byte |  1,772.30 MiB/s |
+| **rANS32x64 16w 15**                      | ✔️ |  63.48 % |   235.02 MiB/s |   2.46 clk/byte |  1,744.39 MiB/s |
+| **rANS32x32 16w 15 (raw)**                | ✔️ |  63.57 % |   336.51 MiB/s |   2.55 clk/byte |  1,679.08 MiB/s |
+| **rANS32x32 16w 15**                      | ✔️ |  63.50 % |   250.86 MiB/s |   2.64 clk/byte |  1,622.75 MiB/s |
+| TurboANX 2                                | ❌ |  64.0 %  |   656.86 MiB/s |  -              |  1,416.33 MiB/s |
+| FSE                                       | ✔️ |  63.2 %  |   736.10 MiB/s |  -              |    966.58 MiB/s |
+| TurboANX 1                                | ❌ |  66.4 %  |   522.13 MiB/s |  -              |    942.43 MiB/s |
+| htscodecs rans32avx512 1                  | ✔️ |  51.6 %  |   168.22 MiB/s |  -              |    322.22 MiB/s |
+| htscodecs rans32avx2 1                    | ✔️ |  51.6 %  |   177.36 MiB/s |  -              |    319.15 MiB/s |
+| FastHF                                    | ✔️ |  63.6 %  |   189.84 MiB/s |  -              |    151.62 MiB/s |
+| FastAC                                    | ✔️ |  63.2 %  |   223.06 MiB/s |  -              |     84.37 MiB/s |
+| htscodecs arith_dyn 1                     | ✔️ |  47.8 %  |    89.60 MiB/s |  -              |     81.63 MiB/s |
+| htscodecs arith_dyn 0                     | ✔️ |  62.0 %  |    88.09 MiB/s |  -              |     75.05 MiB/s |
 
 ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus)
-| Codec Type | License | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
+| Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w 11 (raw)**   | BSD-2  |  82.60 % |   311.60 MiB/s |   1.39 clk/byte |  3,079.98 MiB/s |
-| **rANS32x64 16w 12**         | BSD-2  |  80.17 % |   193.60 MiB/s |   1.41 clk/byte |  3,048.15 MiB/s |
-| **rANS32x64 16w 12 (raw)**   | BSD-2  |  82.57 % |   308.10 MiB/s |   1.41 clk/byte |  3,041.07 MiB/s |
-| **rANS32x64 16w 10**         | BSD-2  |  80.81 % |   193.28 MiB/s |   1.41 clk/byte |  3,040.97 MiB/s |
-| **rANS32x64 16w 10 (raw)**   | BSD-2  |  82.83 % |   305.96 MiB/s |   1.42 clk/byte |  3,027.01 MiB/s |
-| **rANS32x64 16w 11**         | BSD-2  |  80.24 % |   186.41 MiB/s |   1.42 clk/byte |  3,015.25 MiB/s |
-| TurboANX 63                  | -      |  79.6  % |   989.68 MiB/s |   -             |  2,966.83 MiB/s |
-| TurboANX 48                  | -      |  79.6  % |   979.24 MiB/s |   -             |  2,923.90 MiB/s |
-| TurboANX 40                  | -      |  79.7  % |   982.57 MiB/s |   -             |  2,904.99 MiB/s |
-| **rANS32x32 16w 11 (raw)**   | BSD-2  |  82.60 % |   303.34 MiB/s |   1.48 clk/byte |  2,886.18 MiB/s |
-| **rANS32x32 16w 10 (raw)**   | BSD-2  |  82.83 % |   301.23 MiB/s |   1.49 clk/byte |  2,881.42 MiB/s |
-| **rANS32x32 16w 12 (raw)**   | BSD-2  |  82.57 % |   307.10 MiB/s |   1.49 clk/byte |  2,872.78 MiB/s |
-| TurboANX 32                  | -      |  79.7  % |   973.82 MiB/s |   -             |  2,860.76 MiB/s |
-| **rANS32x32 16w 10**         | BSD-2  |  80.81 % |   192.99 MiB/s |   1.51 clk/byte |  2,841.71 MiB/s |
-| **rANS32x32 16w 11**         | BSD-2  |  80.24 % |   190.01 MiB/s |   1.51 clk/byte |  2,834.43 MiB/s |
-| **rANS32x32 16w 12**         | BSD-2  |  80.53 % |   195.09 MiB/s |   1.54 clk/byte |  2,787.94 MiB/s |
-| TurboANX 24                  | -      |  79.8  % |   962.68 MiB/s |   -             |  2,785.82 MiB/s |
-| TurboANX 16                  | -      |  79.9  % |   937.33 MiB/s |   -             |  2,661.07 MiB/s |
-| TurboANX 8                   | -      |  80.5  % |   864.63 MiB/s |   -             |  2,360.30 MiB/s |
-| htscodecs rans32avx2 0       | BSD-3  |  80.6  % |   966.58 MiB/s |   -             |  2,244.87 MiB/s |
-| htscodecs rans32avx512 0     | BSD-3  |  80.6  % |   739.14 MiB/s |   -             |  2,139.47 MiB/s |
-| FSE Huff0                    | BSD-2  |  80.0  % | 1,395.71 MiB/s |   -             |  1,946.34 MiB/s |
-| htscodecs rans32sse 0        | BSD-3  |  80.6  % |   723.48 MiB/s |   -             |  1,914.15 MiB/s |
-| **rANS32x64 16w 13 (raw)**   | BSD-2  |  82.57 % |   305.45 MiB/s |   2.24 clk/byte |  1,910.60 MiB/s |
-| **rANS32x64 16w 14 (raw)**   | BSD-2  |  82.58 % |   308.96 MiB/s |   2.25 clk/byte |  1,903.66 MiB/s |
-| **rANS32x64 16w 13**         | BSD-2  |  79.98 % |   191.74 MiB/s |   2.26 clk/byte |  1,892.64 MiB/s |
-| TurboANX 4                   | -      |  81.9  % |   677.08 MiB/s |   -             |  1,883.40 MiB/s |
-| **rANS32x32 16w 13 (raw)**   | BSD-2  |  82.57 % |   305.00 MiB/s |   2.29 clk/byte |  1,870.26 MiB/s |
-| **rANS32x64 16w 15 (raw)**   | BSD-2  |  82.63 % |   307.44 MiB/s |   2.30 clk/byte |  1,865.65 MiB/s |
-| **rANS32x32 16w 14 (raw)**   | BSD-2  |  82.58 % |   306.18 MiB/s |   2.30 clk/byte |  1,865.18 MiB/s |
-| **rANS32x64 16w 14**         | BSD-2  |  80.02 % |   192.71 MiB/s |   2.30 clk/byte |  1,861.42 MiB/s |
-| **rANS32x32 16w 13**         | BSD-2  |  80.01 % |   196.93 MiB/s |   2.37 clk/byte |  1,808.33 MiB/s |
-| **rANS32x64 16w 15**         | BSD-2  |  80.25 % |   193.85 MiB/s |   2.42 clk/byte |  1,773.42 MiB/s |
-| **rANS32x32 16w 14**         | BSD-2  |  80.06 % |   198.86 MiB/s |   2.42 clk/byte |  1,767.12 MiB/s |
-| **rANS32x32 16w 15 (raw)**   | BSD-2  |  82.63 % |   304.21 MiB/s |   2.44 clk/byte |  1,758.57 MiB/s |
-| **rANS32x32 16w 15**         | BSD-2  |  80.06 % |   191.91 MiB/s |   2.70 clk/byte |  1,585.77 MiB/s |
-| TurboANX 2                   | -      |  83.7  % |   600.46 MiB/s |   -             |  1,292.65 MiB/s |
-| FSE                          | BSD-2  |  80.3  % |   696.88 MiB/s |   -             |    990.39 MiB/s |
-| TurboANX 1                   | -      |  85.1  % |   387.40 MiB/s |   -             |    719.84 MiB/s |
-| htscodecs rans32avx2 1       | BSD-3  |  74.4  % |   114.89 MiB/s |   -             |    229.78 MiB/s |
-| htscodecs rans32avx512 1     | BSD-3  |  74.4  % |   104.87 MiB/s |   -             |    220.91 MiB/s |
-| FastHF                       | Custom |  80.0  % |   183.35 MiB/s |   -             |    144.30 MiB/s |
-| FastAC                       | Custom |  79.7  % |   244.35 MiB/s |   -             |     77.33 MiB/s |
-| htscodecs arith_dyn 1        | BSD-3  |  67.6  % |    45.13 MiB/s |   -             |     45.67 MiB/s |
-| htscodecs arith_dyn 0        | BSD-3  |  79.6  % |    47.12 MiB/s |   -             |     45.40 MiB/s |
+| **rANS32x64 16w 11 (raw)**   | ✔️ |  82.60 % |   311.60 MiB/s |   1.39 clk/byte |  3,079.98 MiB/s |
+| **rANS32x64 16w 12**         | ✔️ |  80.17 % |   193.60 MiB/s |   1.41 clk/byte |  3,048.15 MiB/s |
+| **rANS32x64 16w 12 (raw)**   | ✔️ |  82.57 % |   308.10 MiB/s |   1.41 clk/byte |  3,041.07 MiB/s |
+| **rANS32x64 16w 10**         | ✔️ |  80.81 % |   193.28 MiB/s |   1.41 clk/byte |  3,040.97 MiB/s |
+| **rANS32x64 16w 10 (raw)**   | ✔️ |  82.83 % |   305.96 MiB/s |   1.42 clk/byte |  3,027.01 MiB/s |
+| **rANS32x64 16w 11**         | ✔️ |  80.24 % |   186.41 MiB/s |   1.42 clk/byte |  3,015.25 MiB/s |
+| TurboANX 63                  | ❌ |  79.6  % |   989.68 MiB/s |   -             |  2,966.83 MiB/s |
+| TurboANX 48                  | ❌ |  79.6  % |   979.24 MiB/s |   -             |  2,923.90 MiB/s |
+| TurboANX 40                  | ❌ |  79.7  % |   982.57 MiB/s |   -             |  2,904.99 MiB/s |
+| **rANS32x32 16w 11 (raw)**   | ✔️ |  82.60 % |   303.34 MiB/s |   1.48 clk/byte |  2,886.18 MiB/s |
+| **rANS32x32 16w 10 (raw)**   | ✔️ |  82.83 % |   301.23 MiB/s |   1.49 clk/byte |  2,881.42 MiB/s |
+| **rANS32x32 16w 12 (raw)**   | ✔️ |  82.57 % |   307.10 MiB/s |   1.49 clk/byte |  2,872.78 MiB/s |
+| TurboANX 32                  | ❌ |  79.7  % |   973.82 MiB/s |   -             |  2,860.76 MiB/s |
+| **rANS32x32 16w 10**         | ✔️ |  80.81 % |   192.99 MiB/s |   1.51 clk/byte |  2,841.71 MiB/s |
+| **rANS32x32 16w 11**         | ✔️ |  80.24 % |   190.01 MiB/s |   1.51 clk/byte |  2,834.43 MiB/s |
+| **rANS32x32 16w 12**         | ✔️ |  80.53 % |   195.09 MiB/s |   1.54 clk/byte |  2,787.94 MiB/s |
+| TurboANX 24                  | ❌ |  79.8  % |   962.68 MiB/s |   -             |  2,785.82 MiB/s |
+| TurboANX 16                  | ❌ |  79.9  % |   937.33 MiB/s |   -             |  2,661.07 MiB/s |
+| TurboANX 8                   | ❌ |  80.5  % |   864.63 MiB/s |   -             |  2,360.30 MiB/s |
+| htscodecs rans32avx2 0       | ✔️ |  80.6  % |   966.58 MiB/s |   -             |  2,244.87 MiB/s |
+| htscodecs rans32avx512 0     | ✔️ |  80.6  % |   739.14 MiB/s |   -             |  2,139.47 MiB/s |
+| FSE Huff0                    | ✔️ |  80.0  % | 1,395.71 MiB/s |   -             |  1,946.34 MiB/s |
+| htscodecs rans32sse 0        | ✔️ |  80.6  % |   723.48 MiB/s |   -             |  1,914.15 MiB/s |
+| **rANS32x64 16w 13 (raw)**   | ✔️ |  82.57 % |   305.45 MiB/s |   2.24 clk/byte |  1,910.60 MiB/s |
+| **rANS32x64 16w 14 (raw)**   | ✔️ |  82.58 % |   308.96 MiB/s |   2.25 clk/byte |  1,903.66 MiB/s |
+| **rANS32x64 16w 13**         | ✔️ |  79.98 % |   191.74 MiB/s |   2.26 clk/byte |  1,892.64 MiB/s |
+| TurboANX 4                   | ❌ |  81.9  % |   677.08 MiB/s |   -             |  1,883.40 MiB/s |
+| **rANS32x32 16w 13 (raw)**   | ✔️ |  82.57 % |   305.00 MiB/s |   2.29 clk/byte |  1,870.26 MiB/s |
+| **rANS32x64 16w 15 (raw)**   | ✔️ |  82.63 % |   307.44 MiB/s |   2.30 clk/byte |  1,865.65 MiB/s |
+| **rANS32x32 16w 14 (raw)**   | ✔️ |  82.58 % |   306.18 MiB/s |   2.30 clk/byte |  1,865.18 MiB/s |
+| **rANS32x64 16w 14**         | ✔️ |  80.02 % |   192.71 MiB/s |   2.30 clk/byte |  1,861.42 MiB/s |
+| **rANS32x32 16w 13**         | ✔️ |  80.01 % |   196.93 MiB/s |   2.37 clk/byte |  1,808.33 MiB/s |
+| **rANS32x64 16w 15**         | ✔️ |  80.25 % |   193.85 MiB/s |   2.42 clk/byte |  1,773.42 MiB/s |
+| **rANS32x32 16w 14**         | ✔️ |  80.06 % |   198.86 MiB/s |   2.42 clk/byte |  1,767.12 MiB/s |
+| **rANS32x32 16w 15 (raw)**   | ✔️ |  82.63 % |   304.21 MiB/s |   2.44 clk/byte |  1,758.57 MiB/s |
+| **rANS32x32 16w 15**         | ✔️ |  80.06 % |   191.91 MiB/s |   2.70 clk/byte |  1,585.77 MiB/s |
+| TurboANX 2                   | ❌ |  83.7  % |   600.46 MiB/s |   -             |  1,292.65 MiB/s |
+| FSE                          | ✔️ |  80.3  % |   696.88 MiB/s |   -             |    990.39 MiB/s |
+| TurboANX 1                   | ❌ |  85.1  % |   387.40 MiB/s |   -             |    719.84 MiB/s |
+| htscodecs rans32avx2 1       | ✔️ |  74.4  % |   114.89 MiB/s |   -             |    229.78 MiB/s |
+| htscodecs rans32avx512 1     | ✔️ |  74.4  % |   104.87 MiB/s |   -             |    220.91 MiB/s |
+| FastHF                       | ✔️ |  80.0  % |   183.35 MiB/s |   -             |    144.30 MiB/s |
+| FastAC                       | ✔️ |  79.7  % |   244.35 MiB/s |   -             |     77.33 MiB/s |
+| htscodecs arith_dyn 1        | ✔️ |  67.6  % |    45.13 MiB/s |   -             |     45.67 MiB/s |
+| htscodecs arith_dyn 0        | ✔️ |  79.6  % |    47.12 MiB/s |   -             |     45.40 MiB/s |
 
 ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus)
-| Codec Type | Ratio | Encoder<br/>Clocks/Byte | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
+| Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
 | -- | --: | --: | --: | --: | --: |
-| **rANS32x64 16w                    11**   |  77.82 % |   13.84 clk/byte |   309.39 MiB/s |   1.44 clk/byte |  2,978.20 MiB/s |
-| **rANS32x64 16w                    10**   |  77.92 % |   14.16 clk/byte |   302.46 MiB/s |   1.44 clk/byte |  2,968.99 MiB/s |
-| TurboANX 63                               |  70.1  % |   -              |   965.97 MiB/s |   -             |  2,959.13 MiB/s |
-| **rANS32x64 16w                    12**   |  77.79 % |   14.21 clk/byte |   301.44 MiB/s |   1.45 clk/byte |  2,946.52 MiB/s |
-| TurboANX 48                               |  69.6  % |   -              |   954.87 MiB/s |   -             |  2,911.55 MiB/s |
-| **rANS32x32 16w                    10**   |  77.92 % |   13.97 clk/byte |   306.54 MiB/s |   1.49 clk/byte |  2,878.05 MiB/s |
-| TurboANX 40                               |  69.3  % |   -              |   941.29 MiB/s |   -             |  2,869.21 MiB/s |
-| **rANS32x32 16w                    11**   |  77.82 % |   14.34 clk/byte |   298.79 MiB/s |   1.49 clk/byte |  2,867.33 MiB/s |
-| TurboANX 32                               |  68.9  % |   -              |   927.04 MiB/s |   -             |  2,815.87 MiB/s |
-| **rANS32x32 16w                    12**   |  77.79 % |   14.25 clk/byte |   300.51 MiB/s |   1.54 clk/byte |  2,782.35 MiB/s |
-| TurboANX 24                               |  68.4  % |   -              |   900.92 MiB/s |   -             |  2,732.74 MiB/s |
-| TurboANX 16                               |  67.9  % |   -              |   854.34 MiB/s |   -             |  2,582.05 MiB/s |
-| htscodecs_rans32avx2 0                    |  69.5  % |   -              | 1,014.19 MiB/s |   -             |  2,250.58 MiB/s |
-| TurboANX 8                                |  67.2  % |   -              |   748.14 MiB/s |   -             |  2,183.29 MiB/s |
-| htscodecs_rans32avx512 0                  |  69.5  % |   -              |   760.33 MiB/s |   -             |  2,115.31 MiB/s |
-| fsehuf                                    |  69.2  % |   -              | 1,491.60 MiB/s |   -             |  2,092.00 MiB/s |
-| **rANS32x32 16w                    14**   |  77.79 % |   14.02 clk/byte |   305.49 MiB/s |   2.37 clk/byte |  1,804.10 MiB/s |
-| **rANS32x64 16w                    14**   |  77.79 % |   14.09 clk/byte |   303.97 MiB/s |   2.26 clk/byte |  1,891.46 MiB/s |
-| htscodecs_rans32sse 0                     |  69.5  % |   -              |   724.39 MiB/s |   -             |  1,884.40 MiB/s |
-| **rANS32x64 16w                    13**   |  77.79 % |   13.89 clk/byte |   308.28 MiB/s |   2.27 clk/byte |  1,883.91 MiB/s |
-| **rANS32x64 16w                    15**   |  77.85 % |   13.86 clk/byte |   309.13 MiB/s |   2.31 clk/byte |  1,855.74 MiB/s |
-| **rANS32x32 16w                    13**   |  77.78 % |   14.13 clk/byte |   303.23 MiB/s |   2.37 clk/byte |  1,806.03 MiB/s |
-| **rANS32x32 16w                    15**   |  77.84 % |   14.29 clk/byte |   299.78 MiB/s |   2.46 clk/byte |  1,743.60 MiB/s |
-| TurboANX 4                                |  67.3  % |   -              |   603.91 MiB/s |   -             |  1,658.68 MiB/s |
-| TurboANX 2                                |  68.5  % |   -              |   556.95 MiB/s |   -             |  1,106.06 MiB/s |
-| fse                                       |  69.3  % |   -              |   713.08 MiB/s |   -             |    973.71 MiB/s |
-| TurboANX 1                                |  71.6  % |   -              |   392.67 MiB/s |   -             |    677.10 MiB/s |
-| htscodecs_rans32avx512 1                  |  55.7  % |   -              |    81.02 MiB/s |   -             |    168.42 MiB/s |
-| htscodecs_rans32avx2 1                    |  55.7  % |   -              |    83.68 MiB/s |   -             |    167.19 MiB/s |
-| FastHF                                    |  71.8  % |   -              |   174.86 MiB/s |   -             |    130.78 MiB/s |
-| FastAC                                    |  70.7  % |   -              |   234.95 MiB/s |   -             |     81.01 MiB/s |
-| htscodecs_arith_dyn 1                     |  52.1  % |   -              |    62.87 MiB/s |   -             |     62.98 MiB/s |
-| htscodecs_arith_dyn 0                     |  66.4  % |   -              |    63.82 MiB/s |   -             |     59.92 MiB/s |
+| **rANS32x64 16w 11 (raw)**                | ✔️ |  77.82 % |   309.39 MiB/s |   1.44 clk/byte |  2,978.20 MiB/s |
+| TurboANX 63                               | ❌ |  70.1  % |   965.97 MiB/s |   -             |  2,959.13 MiB/s |
+| **rANS32x64 16w 12 (raw)**                | ✔️ |  77.79 % |   308.29 MiB/s |   1.45 clk/byte |  2,946.52 MiB/s |
+| **rANS32x64 16w 10**                      | ✔️ |  76.51 % |   206.82 MiB/s |   1.46 clk/byte |  2,927.30 MiB/s |
+| **rANS32x64 16w 10 (raw)**                | ✔️ |  77.93 % |   302.67 MiB/s |   1.47 clk/byte |  2,916.71 MiB/s |
+| TurboANX 48                               | ❌ |  69.6  % |   954.87 MiB/s |   -             |  2,911.55 MiB/s |
+| **rANS32x64 16w 11**                      | ✔️ |  75.36 % |   201.83 MiB/s |   1.48 clk/byte |  2,894.53 MiB/s |
+| TurboANX 40                               | ❌ |  69.3  % |   941.29 MiB/s |   -             |  2,869.21 MiB/s |
+| **rANS32x32 16w 11 (raw)**                | ✔️ |  77.82 % |   298.79 MiB/s |   1.49 clk/byte |  2,867.33 MiB/s |
+| **rANS32x32 16w 10 (raw)**                | ✔️ |  77.93 % |   300.28 MiB/s |   1.52 clk/byte |  2,826.16 MiB/s |
+| TurboANX 32                               | ❌ |  68.9  % |   927.04 MiB/s |   -             |  2,815.87 MiB/s |
+| **rANS32x32 16w 12 (raw)**                | ✔️ |  77.79 % |   305.16 MiB/s |   1.54 clk/byte |  2,782.36 MiB/s |
+| **rANS32x32 16w 10**                      | ✔️ |  76.51 % |   205.76 MiB/s |   1.55 clk/byte |  2,757.67 MiB/s |
+| **rANS32x32 16w 11**                      | ✔️ |  75.36 % |   205.29 MiB/s |   1.57 clk/byte |  2,733.71 MiB/s |
+| TurboANX 24                               | ❌ |  68.4  % |   900.92 MiB/s |   -             |  2,732.74 MiB/s |
+| **rANS32x64 16w 12**                      | ✔️ |  72.16 % |   198.68 MiB/s |   1.63 clk/byte |  2,631.17 MiB/s |
+| TurboANX 16                               | ❌ |  67.9  % |   854.34 MiB/s |   -             |  2,582.05 MiB/s |
+| **rANS32x32 16w 12**                      | ✔️ |  71.21 % |   202.45 MiB/s |   1.85 clk/byte |  2,319.35 MiB/s |
+| htscodecs rans32avx2 0                    | ✔️ |  69.5  % | 1,014.19 MiB/s |   -             |  2,250.58 MiB/s |
+| TurboANX 8                                | ❌ |  67.2  % |   748.14 MiB/s |   -             |  2,183.29 MiB/s |
+| htscodecs rans32avx512 0                  | ✔️ |  69.5  % |   760.33 MiB/s |   -             |  2,115.31 MiB/s |
+| FSE Huff0                                 | ✔️ |  69.2  % | 1,491.60 MiB/s |   -             |  2,092.00 MiB/s |
+| **rANS32x64 16w 14 (raw)**                | ✔️ |  77.79 % |   307.05 MiB/s |   2.26 clk/byte |  1,891.46 MiB/s |
+| htscodecs rans32sse 0                     | ✔️ |  69.5  % |   724.39 MiB/s |   -             |  1,884.40 MiB/s |
+| **rANS32x64 16w 13 (raw)**                | ✔️ |  77.79 % |   308.28 MiB/s |   2.27 clk/byte |  1,883.91 MiB/s |
+| **rANS32x64 16w 15 (raw)**                | ✔️ |  77.85 % |   309.13 MiB/s |   2.31 clk/byte |  1,855.74 MiB/s |
+| **rANS32x32 16w 13 (raw)**                | ✔️ |  77.78 % |   306.95 MiB/s |   2.35 clk/byte |  1,824.85 MiB/s |
+| **rANS32x32 16w 14 (raw)**                | ✔️ |  77.79 % |   302.09 MiB/s |   2.35 clk/byte |  1,818.82 MiB/s |
+| **rANS32x64 16w 13**                      | ✔️ |  73.22 % |   199.90 MiB/s |   2.43 clk/byte |  1,763.01 MiB/s |
+| **rANS32x32 16w 15 (raw)**                | ✔️ |  77.84 % |   301.06 MiB/s |   2.44 clk/byte |  1,758.41 MiB/s |
+| **rANS32x32 16w 13**                      | ✔️ |  73.24 % |   204.45 MiB/s |   2.54 clk/byte |  1,688.64 MiB/s |
+| **rANS32x64 16w 14**                      | ✔️ |  73.23 % |   199.48 MiB/s |   2.56 clk/byte |  1,672.95 MiB/s |
+| TurboANX 4                                | ❌ |  67.3  % |   603.91 MiB/s |   -             |  1,658.68 MiB/s |
+| **rANS32x32 16w 14**                      | ✔️ |  73.27 % |   204.91 MiB/s |   2.66 clk/byte |  1,611.11 MiB/s |
+| **rANS32x32 16w 15**                      | ✔️ |  74.38 % |   204.20 MiB/s |   2.78 clk/byte |  1,543.54 MiB/s |
+| **rANS32x64 16w 15**                      | ✔️ |  72.21 % |   198.42 MiB/s |   3.18 clk/byte |  1,345.59 MiB/s |
+| TurboANX 2                                | ❌ |  68.5  % |   556.95 MiB/s |   -             |  1,106.06 MiB/s |
+| FSE                                       | ✔️ |  69.3  % |   713.08 MiB/s |   -             |    973.71 MiB/s |
+| TurboANX 1                                | ❌ |  71.6  % |   392.67 MiB/s |   -             |    677.10 MiB/s |
+| htscodecs rans32avx512 1                  | ✔️ |  55.7  % |    81.02 MiB/s |   -             |    168.42 MiB/s |
+| htscodecs rans32avx2 1                    | ✔️ |  55.7  % |    83.68 MiB/s |   -             |    167.19 MiB/s |
+| FastHF                                    | ✔️ |  71.8  % |   174.86 MiB/s |   -             |    130.78 MiB/s |
+| FastAC                                    | ✔️ |  70.7  % |   234.95 MiB/s |   -             |     81.01 MiB/s |
+| htscodecs arith_dyn 1                     | ✔️ |  52.1  % |    62.87 MiB/s |   -             |     62.98 MiB/s |
+| htscodecs arith_dyn 0                     | ✔️ |  66.4  % |    63.82 MiB/s |   -             |     59.92 MiB/s |
 
 ## Easy Multithreading
 hypersonic-rANS includes a variant that's encodes blocks independently (at the expense of compression ratio) allowing for easy multithreading.
 
 ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus)
 | Codec Type | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
-| -- | --: | --: | --: | --: | --: |
-| rANS32x64 16w 10 mt |  80.23 % |   200.03 MiB/s |    0.24 clk/byte | 18,035.77 MiB/s |
-| rANS32x32 16w 10 mt |  80.17 % |   194.73 MiB/s |    0.25 clk/byte | 17,834.38 MiB/s |
-| rANS32x64 16w 11 mt |  80.08 % |   202.10 MiB/s |    0.26 clk/byte | 16,210.44 MiB/s |
-| rANS32x32 16w 11 mt |  80.02 % |   191.90 MiB/s |    0.27 clk/byte | 15,630.58 MiB/s |
-| rANS32x64 16w 12 mt |  80.05 % |   197.62 MiB/s |    0.34 clk/byte | 13,207.00 MiB/s |
-| rANS32x32 16w 12 mt |  79.99 % |   197.21 MiB/s |    0.36 clk/byte | 12,358.57 MiB/s |
-| rANS32x64 16w 13 mt |  80.04 % |   199.94 MiB/s |    0.37 clk/byte | 11,938.77 MiB/s |
-| rANS32x32 16w 13 mt |  79.99 % |   195.00 MiB/s |    0.37 clk/byte | 11,497.36 MiB/s |
-| rANS32x64 16w 14 mt |  80.05 % |   199.87 MiB/s |    0.42 clk/byte | 10,318.01 MiB/s |
-| rANS32x32 16w 14 mt |  80.01 % |   190.94 MiB/s |    0.42 clk/byte | 10,134.59 MiB/s |
-| rANS32x64 16w 15 mt |  80.09 % |   200.59 MiB/s |    0.59 clk/byte |  7,308.43 MiB/s |
-| rANS32x32 16w 15 mt |  80.03 % |   192.28 MiB/s |    0.62 clk/byte |  7,024.69 MiB/s |
+| -- | --: | --: | --: | --: |
+| rANS32x64 16w 10 mt |  80.23 % |   200.03 MiB/s |    0.24 clk/byte | **18,035.77 MiB/s** |
+| rANS32x32 16w 10 mt |  80.17 % |   194.73 MiB/s |    0.25 clk/byte | **17,834.38 MiB/s** |
+| rANS32x64 16w 11 mt |  80.08 % |   202.10 MiB/s |    0.26 clk/byte | **16,210.44 MiB/s** |
+| rANS32x32 16w 11 mt |  80.02 % |   191.90 MiB/s |    0.27 clk/byte | **15,630.58 MiB/s** |
+| rANS32x64 16w 12 mt |  80.05 % |   197.62 MiB/s |    0.34 clk/byte | **13,207.00 MiB/s** |
+| rANS32x32 16w 12 mt |  79.99 % |   197.21 MiB/s |    0.36 clk/byte | **12,358.57 MiB/s** |
+| rANS32x64 16w 13 mt |  80.04 % |   199.94 MiB/s |    0.37 clk/byte | **11,938.77 MiB/s** |
+| rANS32x32 16w 13 mt |  79.99 % |   195.00 MiB/s |    0.37 clk/byte | **11,497.36 MiB/s** |
+| rANS32x64 16w 14 mt |  80.05 % |   199.87 MiB/s |    0.42 clk/byte | **10,318.01 MiB/s** |
+| rANS32x32 16w 14 mt |  80.01 % |   190.94 MiB/s |    0.42 clk/byte | **10,134.59 MiB/s** |
+| rANS32x64 16w 15 mt |  80.09 % |   200.59 MiB/s |    0.59 clk/byte |  **7,308.43 MiB/s** |
+| rANS32x32 16w 15 mt |  80.03 % |   192.28 MiB/s |    0.62 clk/byte |  **7,024.69 MiB/s** |
 
 ## Building
 ### On Linux/WSL
diff --git a/docs/index.html b/docs/index.html
index 8aa291c..ba30a47 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -596,103 +596,6 @@ <h2>some of the fastest decoding range-based asymetric numeral systems (rANS) co
       </script>
       <hr/>
       <h2>benchmarks</h2>
-      <div class="graph_container">
-        <h3>enwik8 (wikipedia extract)</h3>
-        <p>Order 1 codecs excluded, they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.</p>
-        <!--
-        <div class="graph_metric_container">
-          <a class="graph_metricY r" onclick="javascript:setYMetric('entropy')">entropy</a>
-          <a class="graph_metricY l active" onclick="javascript:setYMetric('ratio')">ratio</a>
-        </div>
-        <div class="graph_metric_container">
-          <a class="graph_metricX r" onclick="javascript:setXMetric('encode')">encode</a>
-          <a class="graph_metricX l active" onclick="javascript:setXMetric('decode')">decode</a>
-        </div>
-        -->
-        <canvas id="enwik8" class="chart"></canvas>
-        <script>
-            var obj = document.getElementById("enwik8");
-            var ctx = obj.getContext("2d");
-            ctx.scale(pixelRatio(), pixelRatio());
-
-            init_obj(obj);
-
-            add_point_no_entropy(obj, 1675.11, 63.57,  342.60, "rANS32x32 16w 15"                     );
-            add_point_no_entropy(obj, 1795.66, 63.55,  341.70, "rANS32x32 16w 14"                     );
-            add_point_no_entropy(obj, 1800.28, 63.61,  341.20, "rANS32x32 16w 13"                     );
-            add_point_no_entropy(obj, 2784.13, 63.81,  343.55, "rANS32x32 16w 12"                     );
-            add_point_no_entropy(obj, 2856.20, 64.33,  333.03, "rANS32x32 16w 11"                     );
-            add_point_no_entropy(obj, 2845.56, 65.59,  334.68, "rANS32x32 16w 10"                     );
-
-            add_point_no_entropy(obj, 1828.28, 63.57,  350.49, "rANS32x64 16w 15"                     );
-            add_point_no_entropy(obj, 1876.95, 63.55,  346.57, "rANS32x64 16w 14"                     );
-            add_point_no_entropy(obj, 1872.44, 63.61,  348.13, "rANS32x64 16w 13"                     );
-            add_point_no_entropy(obj, 2967.92, 63.81,  342.31, "rANS32x64 16w 12"                     );
-            add_point_no_entropy(obj, 2973.71, 64.33,  347.24, "rANS32x64 16w 11"                     );
-            add_point_no_entropy(obj, 2989.66, 65.59,  341.55, "rANS32x64 16w 10"                     );
-
-            add_point_no_entropy(obj, 1550.93, 63.57,  324.33, "rANS32x32 32blk 16w 15"               );
-            add_point_no_entropy(obj, 1607.26, 63.55,  329.08, "rANS32x32 32blk 16w 14"               );
-            add_point_no_entropy(obj, 1582.28, 63.61,  341.16, "rANS32x32 32blk 16w 13"               );
-            add_point_no_entropy(obj, 2312.10, 63.81,  339.50, "rANS32x32 32blk 16w 12"               );
-            add_point_no_entropy(obj, 2299.31, 64.33,  338.00, "rANS32x32 32blk 16w 11"               );
-            add_point_no_entropy(obj, 2289.10, 65.59,  331.80, "rANS32x32 32blk 16w 10"               );
-
-            add_point_no_entropy(obj, 1381.63, 63.51,  283.41, "rANS32x32 32blk 8w 15"                );
-            add_point_no_entropy(obj, 1429.24, 63.53,  284.45, "rANS32x32 32blk 8w 14"                );
-            add_point_no_entropy(obj, 1438.01, 63.60,  284.24, "rANS32x32 32blk 8w 13"                );
-            add_point_no_entropy(obj, 1984.68, 63.82,  282.80, "rANS32x32 32blk 8w 12"                );
-            add_point_no_entropy(obj, 1988.10, 64.33,  285.45, "rANS32x32 32blk 8w 11"                );
-            add_point_no_entropy(obj, 1977.26, 65.60,  291.41, "rANS32x32 32blk 8w 10"                );
-
-            add_point_no_entropy(obj, 1017.12, 63.57,  322.51, "rANS32x16 16w 15"                     );
-            add_point_no_entropy(obj, 1175.92, 63.55,  320.41, "rANS32x16 16w 14"                     );
-            add_point_no_entropy(obj, 1190.23, 63.61,  325.92, "rANS32x16 16w 13"                     );
-            add_point_no_entropy(obj, 1681.73, 63.81,  324.24, "rANS32x16 16w 12"                     );
-            add_point_no_entropy(obj, 1676.41, 64.33,  323.17, "rANS32x16 16w 11"                     );
-            add_point_no_entropy(obj, 1684.80, 65.59,  323.07, "rANS32x16 16w 10"                     );
-
-            //add_point_no_entropy(obj,   81.63, 47.8 ,   89.60, "htscodecs_arith_dyn 1"              , true);
-            //add_point_no_entropy(obj,  322.22, 51.6 ,  168.22, "htscodecs_rans32avx512 1"           , true);
-            //add_point_no_entropy(obj,  319.15, 51.6 ,  177.36, "htscodecs_rans32avx2 1"             , true);
-            add_point_no_entropy(obj,   75.05, 62.0 ,   88.09, "htscodecs_arith_dyn 0"                , true);
-            add_point_no_entropy(obj,   84.37, 63.2 ,  223.06, "FastAC"                               , true);
-            add_point_no_entropy(obj,  966.58, 63.2 ,  736.10, "fse"                                  , true);
-            add_point_no_entropy(obj, 2515.23, 63.4 , 1581.32, "fsehuf"                               , true);
-            add_point_no_entropy(obj, 2374.04, 63.5 , 1041.93, "htscodecs_rans32avx2 0"               , true);
-            add_point_no_entropy(obj, 2221.93, 63.5 ,  796.70, "htscodecs_rans32avx512 0"             , true);
-            add_point_no_entropy(obj, 1948.66, 63.5 ,  732.08, "htscodecs_rans32sse 0"                , true);
-            add_point_no_entropy(obj,  151.62, 63.6 ,  189.84, "FastHF"                               , true);
-            add_point_no_entropy(obj,  942.43, 66.4 ,  522.13, "TurboANX 1  (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 1416.33, 64.0 ,  656.86, "TurboANX 2  (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 1929.18, 63.0 ,  706.92, "TurboANX 4  (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2347.10, 62.7 ,  823.76, "TurboANX 8  (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2631.85, 62.8 ,  902.32, "TurboANX 16 (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2765.31, 63.0 ,  936.12, "TurboANX 24 (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2856.26, 63.1 ,  951.53, "TurboANX 32 (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2883.45, 63.2 ,  964.45, "TurboANX 40 (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2917.59, 63.3 ,  969.72, "TurboANX 48 (Native Windows Build)"   , true);
-            add_point_no_entropy(obj, 2964.02, 63.4 ,  981.79, "TurboANX 63 (Native Windows Build)"   , true);
-
-            add_line(obj, ["rANS32x32 16w 15", "rANS32x32 16w 14", "rANS32x32 16w 13", "rANS32x32 16w 12", "rANS32x32 16w 11", "rANS32x32 16w 10"], getColor());
-            add_line(obj, ["rANS32x32 32blk 16w 15", "rANS32x32 32blk 16w 14", "rANS32x32 32blk 16w 13", "rANS32x32 32blk 16w 12", "rANS32x32 32blk 16w 11", "rANS32x32 32blk 16w 10"], getColor());
-            add_line(obj, ["rANS32x32 32blk 8w 15", "rANS32x32 32blk 8w 14", "rANS32x32 32blk 8w 13", "rANS32x32 32blk 8w 12", "rANS32x32 32blk 8w 11", "rANS32x32 32blk 8w 10"], getColor());
-            add_line(obj, ["rANS32x64 16w 15", "rANS32x64 16w 14", "rANS32x64 16w 13", "rANS32x64 16w 12", "rANS32x64 16w 11", "rANS32x64 16w 10"], getColor());
-            add_line(obj, ["rANS32x16 16w 15", "rANS32x16 16w 14", "rANS32x16 16w 13", "rANS32x16 16w 12", "rANS32x16 16w 11", "rANS32x16 16w 10"], getColor());
-            
-            add_line(obj, ["fse", "fsehuf" ], "#55555588");
-            add_line(obj, ["htscodecs_rans32sse 0", "htscodecs_rans32avx2 0", "htscodecs_rans32avx512 0" ], "#55555588");
-            add_line(obj, ["FastAC", "FastHF"], "#55555588");
-            add_line(obj, ["TurboANX 1  (Native Windows Build)", "TurboANX 2  (Native Windows Build)", "TurboANX 4  (Native Windows Build)", "TurboANX 8  (Native Windows Build)", "TurboANX 16 (Native Windows Build)", "TurboANX 24 (Native Windows Build)", "TurboANX 32 (Native Windows Build)", "TurboANX 40 (Native Windows Build)", "TurboANX 48 (Native Windows Build)", "TurboANX 63 (Native Windows Build)"], "#55555588");
-
-            calc_pareto_line(obj);
-            calc_pareto_line2(obj);
-
-            window.addEventListener("resize", () => { draw_obj(document.getElementById("enwik8").getContext("2d"), document.getElementById("enwik8")); });
-            window.addEventListener("load", () => { draw_obj(document.getElementById("enwik8").getContext("2d"), document.getElementById("enwik8")); });
-            obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("enwik8")); });
-        </script>
-      </div>
       <div class="graph_container">
         <h3>x-ray (medical x-ray image)</h3>
         <p>Order 1 codecs excluded, they're not particularly fast, but they compress a lot better, which makes visibility a lot worse.</p>
@@ -742,17 +645,6 @@ <h3>x-ray (medical x-ray image)</h3>
             add_point_no_entropy(obj, 3015.25, 80.24,  186.41, "rANS32x64 16w 11"        );
             add_point_no_entropy(obj, 3040.97, 80.81,  193.28, "rANS32x64 16w 10"        );
 
-            add_point_no_entropy(obj, 2966.83, 79.6,  989.68, "TurboANX 63 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2923.90, 79.6,  979.24, "TurboANX 48 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2904.99, 79.7,  982.57, "TurboANX 40 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2860.76, 79.7,  973.82, "TurboANX 32 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2785.82, 79.8,  962.68, "TurboANX 24 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2661.07, 79.9,  937.33, "TurboANX 16 (Native Windows Build)", true);
-            add_point_no_entropy(obj, 2360.30, 80.5,  864.63, "TurboANX 8  (Native Windows Build)", true);
-            add_point_no_entropy(obj, 1883.40, 81.9,  677.08, "TurboANX 4  (Native Windows Build)", true);
-            add_point_no_entropy(obj, 1292.65, 83.7,  600.46, "TurboANX 2  (Native Windows Build)", true);
-            add_point_no_entropy(obj,  719.84, 85.1,  387.40, "TurboANX 1  (Native Windows Build)", true);
-
             add_point_no_entropy(obj, 1914.15, 80.6,  723.48, "htscodecs rans32sse 0"    , true);
             add_point_no_entropy(obj, 2244.87, 80.6,  966.58, "htscodecs rans32avx2 0"   , true);
             add_point_no_entropy(obj, 2139.47, 80.6,  739.14, "htscodecs rans32avx512 0" , true);
@@ -768,6 +660,17 @@ <h3>x-ray (medical x-ray image)</h3>
             add_point_no_entropy(obj,  144.30, 80.0,  183.35, "FastHF"                   , true);
             add_point_no_entropy(obj,   77.33, 79.7,  244.35, "FastAC"                   , true);
 
+            add_point_no_entropy(obj, 2966.83, 79.6,  989.68, "TurboANX 63 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2923.90, 79.6,  979.24, "TurboANX 48 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2904.99, 79.7,  982.57, "TurboANX 40 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2860.76, 79.7,  973.82, "TurboANX 32 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2785.82, 79.8,  962.68, "TurboANX 24 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2661.07, 79.9,  937.33, "TurboANX 16 (Native Windows Build)", true);
+            add_point_no_entropy(obj, 2360.30, 80.5,  864.63, "TurboANX 8  (Native Windows Build)", true);
+            add_point_no_entropy(obj, 1883.40, 81.9,  677.08, "TurboANX 4  (Native Windows Build)", true);
+            add_point_no_entropy(obj, 1292.65, 83.7,  600.46, "TurboANX 2  (Native Windows Build)", true);
+            add_point_no_entropy(obj,  719.84, 85.1,  387.40, "TurboANX 1  (Native Windows Build)", true);
+
             add_line(obj, ["rANS32x32 16w 15", "rANS32x32 16w 14", "rANS32x32 16w 13", "rANS32x32 16w 12", "rANS32x32 16w 11", "rANS32x32 16w 10"], getColor());
             add_line(obj, ["rANS32x64 16w 15", "rANS32x64 16w 14", "rANS32x64 16w 13", "rANS32x64 16w 12", "rANS32x64 16w 11", "rANS32x64 16w 10"], getColor());
 
@@ -787,6 +690,95 @@ <h3>x-ray (medical x-ray image)</h3>
             obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("xray")); });
         </script>
       </div>
+      <div class="graph_container">
+        <h3>enwik8 (wikipedia extract)</h3>
+        <p>Order 1 codecs are excluded: they're not particularly fast, but they compress a lot better, which would make the rest of the chart a lot harder to read.</p>
+        <!--
+        <div class="graph_metric_container">
+          <a class="graph_metricY r" onclick="javascript:setYMetric('entropy')">entropy</a>
+          <a class="graph_metricY l active" onclick="javascript:setYMetric('ratio')">ratio</a>
+        </div>
+        <div class="graph_metric_container">
+          <a class="graph_metricX r" onclick="javascript:setXMetric('encode')">encode</a>
+          <a class="graph_metricX l active" onclick="javascript:setXMetric('decode')">decode</a>
+        </div>
+        -->
+        <canvas id="enwik8" class="chart"></canvas>
+        <script>
+            var obj = document.getElementById("enwik8");
+            var ctx = obj.getContext("2d");
+            ctx.scale(pixelRatio(), pixelRatio());
+
+            init_obj(obj);
+
+            add_point_no_entropy(obj, 1622.75, 63.50,  250.86, "rANS32x32 16w 15"        );
+            add_point_no_entropy(obj, 1772.88, 63.45,  252.28, "rANS32x32 16w 14"        );
+            add_point_no_entropy(obj, 1772.30, 63.52,  249.07, "rANS32x32 16w 13"        );
+            add_point_no_entropy(obj, 2642.01, 63.71,  243.00, "rANS32x32 16w 12"        );
+            add_point_no_entropy(obj, 2735.12, 64.30,  238.29, "rANS32x32 16w 11"        );
+            add_point_no_entropy(obj, 2765.18, 65.56,  237.21, "rANS32x32 16w 10"        );
+
+            add_point_no_entropy(obj, 1744.39, 63.48,  235.02, "rANS32x64 16w 15"        );
+            add_point_no_entropy(obj, 1837.19, 63.47,  235.14, "rANS32x64 16w 14"        );
+            add_point_no_entropy(obj, 1846.34, 63.53,  232.05, "rANS32x64 16w 13"        );
+            add_point_no_entropy(obj, 2856.76, 63.73,  230.37, "rANS32x64 16w 12"        );
+            add_point_no_entropy(obj, 2907.73, 64.30,  225.35, "rANS32x64 16w 11"        );
+            add_point_no_entropy(obj, 2934.64, 65.56,  239.77, "rANS32x64 16w 10"        );
+
+            add_point_no_entropy(obj, 1679.08, 63.57,  336.51, "rANS32x32 16w 15 (raw)"  );
+            add_point_no_entropy(obj, 1810.24, 63.55,  324.44, "rANS32x32 16w 14 (raw)"  );
+            add_point_no_entropy(obj, 1818.86, 63.61,  344.26, "rANS32x32 16w 13 (raw)"  );
+            add_point_no_entropy(obj, 2800.63, 63.83,  341.70, "rANS32x32 16w 12 (raw)"  );
+            add_point_no_entropy(obj, 2817.60, 64.48,  332.10, "rANS32x32 16w 11 (raw)"  );
+            add_point_no_entropy(obj, 2822.60, 65.97,  328.77, "rANS32x32 16w 10 (raw)"  );
+
+            add_point_no_entropy(obj, 1861.57, 63.57,  340.96, "rANS32x64 16w 15 (raw)"  );
+            add_point_no_entropy(obj, 1926.82, 63.55,  350.13, "rANS32x64 16w 14 (raw)"  );
+            add_point_no_entropy(obj, 1924.81, 63.61,  345.16, "rANS32x64 16w 13 (raw)"  );
+            add_point_no_entropy(obj, 3009.18, 63.83,  347.90, "rANS32x64 16w 12 (raw)"  );
+            add_point_no_entropy(obj, 3018.02, 64.48,  336.81, "rANS32x64 16w 11 (raw)"  );
+            add_point_no_entropy(obj, 3013.45, 65.97,  335.28, "rANS32x64 16w 10 (raw)"  );
+
+            //add_point_no_entropy(obj,   81.63, 47.8 ,   89.60, "htscodecs arith_dyn 1"              , true);
+            //add_point_no_entropy(obj,  322.22, 51.6 ,  168.22, "htscodecs rans32avx512 1"           , true);
+            //add_point_no_entropy(obj,  319.15, 51.6 ,  177.36, "htscodecs rans32avx2 1"             , true);
+            add_point_no_entropy(obj,   75.05, 62.0 ,   88.09, "htscodecs arith_dyn 0"                , true);
+            add_point_no_entropy(obj,   84.37, 63.2 ,  223.06, "FastAC"                               , true);
+            add_point_no_entropy(obj,  966.58, 63.2 ,  736.10, "FSE"                                  , true);
+            add_point_no_entropy(obj, 2515.23, 63.4 , 1581.32, "FSE Huff0"                            , true);
+            add_point_no_entropy(obj, 2374.04, 63.5 , 1041.93, "htscodecs rans32avx2 0"               , true);
+            add_point_no_entropy(obj, 2221.93, 63.5 ,  796.70, "htscodecs rans32avx512 0"             , true);
+            add_point_no_entropy(obj, 1948.66, 63.5 ,  732.08, "htscodecs rans32sse 0"                , true);
+            add_point_no_entropy(obj,  151.62, 63.6 ,  189.84, "FastHF"                               , true);
+            add_point_no_entropy(obj,  942.43, 66.4 ,  522.13, "TurboANX 1  (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 1416.33, 64.0 ,  656.86, "TurboANX 2  (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 1929.18, 63.0 ,  706.92, "TurboANX 4  (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2347.10, 62.7 ,  823.76, "TurboANX 8  (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2631.85, 62.8 ,  902.32, "TurboANX 16 (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2765.31, 63.0 ,  936.12, "TurboANX 24 (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2856.26, 63.1 ,  951.53, "TurboANX 32 (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2883.45, 63.2 ,  964.45, "TurboANX 40 (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2917.59, 63.3 ,  969.72, "TurboANX 48 (Native Windows Build)"   , true);
+            add_point_no_entropy(obj, 2964.02, 63.4 ,  981.79, "TurboANX 63 (Native Windows Build)"   , true);
+
+            add_line(obj, ["rANS32x32 16w 15", "rANS32x32 16w 14", "rANS32x32 16w 13", "rANS32x32 16w 12", "rANS32x32 16w 11", "rANS32x32 16w 10"], getColor());
+            add_line(obj, ["rANS32x64 16w 15", "rANS32x64 16w 14", "rANS32x64 16w 13", "rANS32x64 16w 12", "rANS32x64 16w 11", "rANS32x64 16w 10"], getColor());
+            add_line(obj, ["rANS32x32 16w 15 (raw)", "rANS32x32 16w 14 (raw)", "rANS32x32 16w 13 (raw)", "rANS32x32 16w 12 (raw)", "rANS32x32 16w 11 (raw)", "rANS32x32 16w 10 (raw)"], getColor());
+            add_line(obj, ["rANS32x64 16w 15 (raw)", "rANS32x64 16w 14 (raw)", "rANS32x64 16w 13 (raw)", "rANS32x64 16w 12 (raw)", "rANS32x64 16w 11 (raw)", "rANS32x64 16w 10 (raw)"], getColor());
+            
+            add_line(obj, ["FSE", "FSE Huff0" ], "#55555588");
+            add_line(obj, ["htscodecs rans32sse 0", "htscodecs rans32avx2 0", "htscodecs rans32avx512 0" ], "#55555588");
+            add_line(obj, ["FastAC", "FastHF"], "#55555588");
+            add_line(obj, ["TurboANX 1  (Native Windows Build)", "TurboANX 2  (Native Windows Build)", "TurboANX 4  (Native Windows Build)", "TurboANX 8  (Native Windows Build)", "TurboANX 16 (Native Windows Build)", "TurboANX 24 (Native Windows Build)", "TurboANX 32 (Native Windows Build)", "TurboANX 40 (Native Windows Build)", "TurboANX 48 (Native Windows Build)", "TurboANX 63 (Native Windows Build)"], "#55555588");
+
+            calc_pareto_line(obj);
+            calc_pareto_line2(obj);
+
+            window.addEventListener("resize", () => { draw_obj(document.getElementById("enwik8").getContext("2d"), document.getElementById("enwik8")); });
+            window.addEventListener("load", () => { draw_obj(document.getElementById("enwik8").getContext("2d"), document.getElementById("enwik8")); });
+            obj.addEventListener("mousemove", (e) => { handle_mouseover(e, document.getElementById("enwik8")); });
+        </script>
+      </div>
     </div>
   </body>
 </html>
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index c51e684..7d28257 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -198,19 +198,19 @@ static codec_info_t _Codecs[] =
   { "rANS32x64 16w (independent blocks)", 11, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_11>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_11, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_11>, true }, {}}},
   { "rANS32x64 16w (independent blocks)", 10, {{ "encode", encode_no_hist_wrapper<mt_rANS32x64_16w_encode_10>, true }, {}}, {{ "decode (single thread)", mt_rANS32x64_16w_decode_10, true }, { "decode (multi threaded)", decode_with_thread_pool_wrapper<mt_rANS32x64_16w_decode_mt_10>, true }, {}}},
   
-  { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
-  { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
-  { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
-  { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
-  { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
-  { "rANS32x32 16w (raw)", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
-
-  { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}},
-  { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}},
-  { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13, true }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}},
-  { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
-  { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
-  { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
+  { "rANS32x32 16w (raw)", 15, {{ "enc scalar", rANS32x32_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_15 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
+  { "rANS32x32 16w (raw)", 14, {{ "enc scalar", rANS32x32_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_14 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_14 }, {}}},
+  { "rANS32x32 16w (raw)", 13, {{ "enc scalar", rANS32x32_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_13 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_13 }, {}}},
+  { "rANS32x32 16w (raw)", 12, {{ "enc scalar", rANS32x32_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_12 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_12 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_12 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_12 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
+  { "rANS32x32 16w (raw)", 11, {{ "enc scalar", rANS32x32_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_11 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_11 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_11 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_11 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
+  { "rANS32x32 16w (raw)", 10, {{ "enc scalar", rANS32x32_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x32_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x32_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl, 1x pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (ymm perm, 1x pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_pregather_varC_10 }, { "dec avx2 (xmm shfl, 1x s pre-gthr)", rANS32x32_xmmShfl_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl2, 1x s pre-gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (ymm perm, 1x s pre-gthr)", rANS32x32_ymmPerm_16w_decode_avx2_scalarpregather_varC_10 }, { "dec avx2 (xmm shfl, 1x erly gthr)", rANS32x32_xmmShfl_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl2, 1x erly gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (ymm perm, 1x erly gthr)", rANS32x32_ymmPerm_16w_decode_avx2_earlygather_varC_10 }, { "dec avx2 (xmm shfl, 1x pref gthr)", rANS32x32_xmmShfl_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (xmm shfl2, 1x pref gthr)", rANS32x32_xmmShfl2_16w_decode_avx2_prefetch_varC_10 }, { "dec avx2 (ymm perm, 1x pref 
gthr)", rANS32x32_ymmPerm_16w_decode_avx2_prefetch_varC_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x32_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x32_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x32_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x32_xmmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x32_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x32_ymmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x32_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
+
+  { "rANS32x64 16w (raw)", 15, {{ "enc scalar", rANS32x64_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_15 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_15 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_15 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_15 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_15 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_15 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_15, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_15 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_15 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_15 }, {}}},
+  { "rANS32x64 16w (raw)", 14, {{ "enc scalar", rANS32x64_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_14 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_14 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_14 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_14 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_14 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_14 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_14, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_14 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_14 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_14 }, {}}},
+  { "rANS32x64 16w (raw)", 13, {{ "enc scalar", rANS32x64_16w_encode_scalar_13, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_13 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_13 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_13 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_13, true }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_13 }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_13 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_13 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_13 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_13 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_13, true }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_13 }, { "dec avx512 (zmm perm, sym dep gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varA_13 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_13 }, {}}},
+  { "rANS32x64 16w (raw)", 12, {{ "enc scalar", rANS32x64_16w_encode_scalar_12, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_12 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_12 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_12 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_12 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_12, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_12 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_12 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_12 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_12 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_12 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_12 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_12, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_12 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_12 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_12 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_12 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_12, true }, {}}},
+  { "rANS32x64 16w (raw)", 11, {{ "enc scalar", rANS32x64_16w_encode_scalar_11, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_11 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_11 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_11 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_11 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_11, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_11 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_11 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_11 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_11 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_11 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_11 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_11, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_11 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_11 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_11 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_11 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_11, true }, {}}},
+  { "rANS32x64 16w (raw)", 10, {{ "enc scalar", rANS32x64_16w_encode_scalar_10, true }, {}}, {{ "dec scalar", rANS32x64_16w_decode_scalar_10 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx2_varC_10 }, { "dec avx2 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varA_10 }, { "dec avx2 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varB_10 }, { "dec avx2 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx2_varC_10, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varA_10 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varB_10 }, { "dec avx2 (ymm perm, 1x gthr)", rANS32x64_ymmPerm_16w_decode_avx2_varC_10 }, { "dec avx512 (xmm shfl, sym dep gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl, sym idp gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl, 1x gthr)", rANS32x64_xmmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl2, sym dep gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (xmm shfl2, sym idp gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (xmm shfl2, 1x gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (ymm shfl, sym dep gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl, sym idp gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl, 1x gthr)", rANS32x64_ymmShfl_16w_decode_avx512_varC_10 }, { "dec avx512 (ymm shfl2, sym dep gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varA_10 }, { "dec avx512 (ymm shfl2, sym idp gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varB_10 }, { "dec avx512 (ymm shfl2, 1x gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_varC_10, true }, { "dec avx512 (zmm perm, sym dep gthr)", 
rANS32x64_zmmPerm_16w_decode_avx512_varA_10 }, { "dec avx512 (zmm perm, sym idp gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varB_10 }, { "dec avx512 (zmm perm, 1x gthr)", rANS32x64_zmmPerm_16w_decode_avx512_varC_10 }, { "dec avx512 (xmm shfl, 1x ymm gthr)", rANS32x64_xmmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (xmm shfl2, 1x ymm gthr)", rANS32x64_xmmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, { "dec avx512 (ymm shfl, 1x ymm gthr)", rANS32x64_ymmShfl_16w_decode_avx512_ymmGthr_varC_10 }, { "dec avx512 (ymm shfl2, 1x ymm gthr)", rANS32x64_ymmShfl2_16w_decode_avx512_ymmGthr_varC_10, true }, {}}},
   
   { "rANS32x16 16w (raw)", 15, {{ "enc scalar", rANS32x16_16w_encode_scalar_15, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_15 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_15, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_15, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_15 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_15 }, {}}},
   { "rANS32x16 16w (raw)", 14, {{ "enc scalar", rANS32x16_16w_encode_scalar_14, true }, {}}, {{ "dec scalar", rANS32x16_16w_decode_scalar_14 }, { "dec avx2 (xmm shfl, sym dep gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varA_14, true }, { "dec avx2 (xmm shfl, sym idp gthr)", rANS32x16_xmmShfl_16w_decode_avx2_varB_14, true }, { "dec avx2 (ymm perm, sym dep gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varA_14 }, { "dec avx2 (ymm perm, sym idp gthr)", rANS32x16_ymmPerm_16w_decode_avx2_varB_14 }, {}}},

From 199816c669afb9ec076136372be592f5996087d7 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:13:41 +0200
Subject: [PATCH 27/34] Trying to support clang-cl, but that is apparently not
 possible at the moment

---
 premake5.lua        |  4 ++--
 project.lua         | 13 ++++++++++---
 src/block_codec32.h |  5 +++--
 src/block_codec64.h |  9 +++++----
 src/simd_platform.h |  2 +-
 5 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/premake5.lua b/premake5.lua
index bd29293..8fe4c7c 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -7,13 +7,13 @@ solution "hsrans"
     configurations { "Release", "Debug", "ReleaseClang", "DebugClang" }
     linkgroups "On"
     filter { "configurations:*Clang" }
-      toolset "clang"
+    toolset "clang"
     filter { }
   elseif os.target() == "macosx" then
     configurations { "Release", "Debug" }
     toolset "clang"
   else
-    configurations { "Debug", "Release" }
+    configurations { "Release", "Debug", "ReleaseClang", "DebugClang" }
   end
 
   dofile "project.lua"
diff --git a/project.lua b/project.lua
index fca25fe..63eb160 100644
--- a/project.lua
+++ b/project.lua
@@ -7,15 +7,22 @@ project(ProjectName)
   staticruntime "On"
 
   filter { "system:windows" }
-    buildoptions { '/Gm-' }
-    buildoptions { '/MP' }
     ignoredefaultlibraries { "msvcrt" }
-    buildoptions { '/std:c++20' }
   filter { "system:linux" }
     buildoptions { "-mxsave" }
     linkoptions { "-pthread" }
     cppdialect "C++20"
   filter { }
+
+  filter { "system:windows", "configurations:not *Clang" }
+    buildoptions { '/std:c++20' }
+    buildoptions { '/Gm-' }
+    buildoptions { '/MP' }
+
+  filter { "system:windows", "configurations:*Clang" }
+    toolset("clang")
+    cppdialect "C++17"
+    defines { "__llvm__" }
   
   filter { "configurations:Release" }
     flags { "LinkTimeOptimization" }
diff --git a/src/block_codec32.h b/src/block_codec32.h
index 1366b5e..71159b4 100644
--- a/src/block_codec32.h
+++ b/src/block_codec32.h
@@ -2,6 +2,7 @@
 #define block_codec32_h__
 
 #include "hist.h"
+#include "simd_platform.h"
 
 #include <string.h>
 
@@ -205,7 +206,7 @@ struct rans32x32_16w_decoder<r32x32_dt_scalar, TotalSymbolCountBits, hist_dec_t<
 };
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 __attribute__((target("avx2")))
 #endif
 static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
@@ -441,7 +442,7 @@ static size_t _block_rans32x32_decode_section_avx2_varA(_rans_decode_state32_t<h
 }
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 __attribute__((target("avx2")))
 #endif
 static size_t _block_rans32x32_decode_section_avx2_varC(_rans_decode_state32_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
diff --git a/src/block_codec64.h b/src/block_codec64.h
index 4ea95f3..42aaa32 100644
--- a/src/block_codec64.h
+++ b/src/block_codec64.h
@@ -2,6 +2,7 @@
 #define block_codec64_h__
 
 #include "hist.h"
+#include "simd_platform.h"
 
 #include <string.h>
 
@@ -216,7 +217,7 @@ struct rans32x64_16w_decoder<r32x64_dt_scalar, TotalSymbolCountBits, hist_dec_t<
 };
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 __attribute__((target("avx2")))
 #endif
 static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t<hist_dec2_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
@@ -601,7 +602,7 @@ static size_t _block_rans32x64_decode_section_avx2_varA(_rans_decode_state64_t<h
 }
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool WriteAligned32 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 __attribute__((target("avx2")))
 #endif
 static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t<hist_dec_pack_t<TotalSymbolCountBits>> *pState, uint8_t *pOutData, const size_t startIndex, const size_t endIndex)
@@ -976,7 +977,7 @@ static size_t _block_rans32x64_decode_section_avx2_varC(_rans_decode_state64_t<h
 }
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 #ifdef __llvm__
 __attribute__((target("avx512bw")))
 #else
@@ -1330,7 +1331,7 @@ static size_t _block_rans32x64_decode_section_avx512_varA(_rans_decode_state64_t
 }
 
 template <uint32_t TotalSymbolCountBits, bool ShuffleMask16, bool YmmShuffle, bool WriteAligned64 = false>
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__llvm__)
 #ifdef __llvm__
 __attribute__((target("avx512bw")))
 #else
diff --git a/src/simd_platform.h b/src/simd_platform.h
index 905eba1..d1efac0 100644
--- a/src/simd_platform.h
+++ b/src/simd_platform.h
@@ -4,7 +4,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__llvm__)
 #include <intrin.h>
 #define __builtin_popcount __popcnt
 #else

From e7c14bc62cd890c67b269d7d8db1fc6e8317bb6b Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:35:14 +0200
Subject: [PATCH 28/34] Adding tests, simd restrictions

---
 .github/workflows/clang.yml   |  61 +++++++++
 .github/workflows/gcc.yml     |  61 +++++++++
 .github/workflows/msbuild.yml |  62 +++++++++
 src/main.cpp                  | 247 ++++++++++++++++++++++++++++++----
 4 files changed, 406 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index c287fc6..40c9951 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -24,3 +24,64 @@ jobs:
       with:
         name: build-clang64
         path: builds/bin/hsrans
+
+  test:
+    runs-on: ubuntu-latest
+    needs: build
+
+    steps:
+    - uses: actions/download-artifact@v3
+      with:
+        name: build-clang64
+    
+    - name: Download Samples
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: git clone https://github.com/rainerzufalldererste/rle_samples.git
+
+    - name: Test sample0 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample0 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample0 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample0 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample1 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample1 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample1 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample1 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample2 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample2 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample2 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample2 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
\ No newline at end of file
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index bc38be7..01d3de9 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -25,3 +25,64 @@ jobs:
       with:
         name: build-gcc64
         path: builds/bin/hsrans
+
+  test:
+    runs-on: ubuntu-latest
+    needs: build
+
+    steps:
+    - uses: actions/download-artifact@v3
+      with:
+        name: build-gcc64
+    
+    - name: Download Samples
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: git clone https://github.com/rainerzufalldererste/rle_samples.git
+
+    - name: Test sample0 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample0 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample0 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample0 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample1 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample1 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample1 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample1 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample2 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample2 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample2 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample2 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml
index be996b5..4b58ad4 100644
--- a/.github/workflows/msbuild.yml
+++ b/.github/workflows/msbuild.yml
@@ -36,3 +36,65 @@ jobs:
         name: build-win64
         path: builds/bin/hsrans.exe
 
+  test:
+    runs-on: windows-latest
+    needs: build
+
+    steps:
+    - uses: actions/download-artifact@v3
+      with:
+        name: build-win64
+    
+    - name: Download Samples
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: git clone https://github.com/rainerzufalldererste/rle_samples.git
+
+    - name: Test sample0 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample0 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample0 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample0 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample1 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample1 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample1 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample1 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+
+    - name: Test sample2 AVX512BW
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+
+    - name: Test sample2 AVX2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      
+    - name: Test sample2 SSE4.2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      
+    - name: Test sample2 No-SIMD
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
+
+
diff --git a/src/main.cpp b/src/main.cpp
index 7d28257..e2e67f3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -58,6 +58,7 @@ static bool _ExcludeBlock = false;
 static bool _Exclude32x16 = false;
 static bool _Exclude32x32 = false;
 static bool _Exclude32x64 = false;
+static bool _IsTest = false;
 static size_t _RunCount = 8;
 static size_t _EncodeRunCount = 2;
 static size_t _DecodeRunCount = 16;
@@ -168,7 +169,7 @@ size_t decode_with_thread_pool_wrapper(const uint8_t *pInData, const size_t inLe
   return func(pInData, inLength, pOutData, outCapacity, _pGlobalThreadPool);
 }
 
-static codec_info_t _Codecs[] =
+static const codec_info_t _Codecs[] =
 {
   { "rANS32x32 16w (variable block size)", 15, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_15>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_15, true }, {}}},
   { "rANS32x32 16w (variable block size)", 14, {{ "encode", encode_no_hist_wrapper<block_rANS32x32_16w_encode_14>, true }, {}}, {{ "decode", block_rANS32x32_16w_decode_14, true }, {}}},
@@ -251,6 +252,18 @@ const char ArgumentCpuCore[] = "--cpu-core";
 const char ArgumentRuns[] = "--runs";
 const char ArgumentRunsEncode[] = "--runs-enc";
 const char ArgumentRunsDecode[] = "--runs-dec";
+const char ArgumentTest[] = "--test";
+const char ArgumentMaxSimd[] = "--max-simd";
+const char ArgumentMaxSimdAVX512BW[] = "avx512bw";
+const char ArgumentMaxSimdAVX512F[] = "avx512f";
+const char ArgumentMaxSimdAVX2[] = "avx2";
+const char ArgumentMaxSimdAVX[] = "avx";
+const char ArgumentMaxSimdSSE42[] = "sse4.2";
+const char ArgumentMaxSimdSSE41[] = "sse4.1";
+const char ArgumentMaxSimdSSSE3[] = "ssse3";
+const char ArgumentMaxSimdSSE3[] = "sse3";
+const char ArgumentMaxSimdSSE2[] = "sse2";
+const char ArgumentMaxSimdNone[] = "none";
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -273,7 +286,9 @@ int32_t main(const int32_t argc, char **pArgv)
     printf("\t%s <uint>\t\tRun the benchmark for a specified amount of times (default: 2 encode, 16 decode; will override '%s'/'%s')\n", ArgumentRuns, ArgumentRunsEncode, ArgumentRunsDecode);
     printf("\t%s <uint>\tWhen Encoding: Run the benchmark for a specified amount of times (default: 2)\n", ArgumentRunsEncode);
     printf("\t%s <uint>\tWhen Decoding: Run the benchmark for a specified amount of times (default: 16)\n", ArgumentRunsDecode);
-    printf("\t%s <uint>\tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep);
+    printf("\t%s\t\tPrevent sleeping between runs/codecs (may lead to thermal throttling)\n", ArgumentNoSleep);
+    printf("\t%s\t\t\tRun as test scenario, fail on error, call codecs\n", ArgumentTest);
+    printf("\t%s <%s / %s / %s / %s / %s / %s / %s / %s / %s / %s>\n\t\t\t\tRestrict SIMD functions to specific instruction set\n", ArgumentMaxSimd, ArgumentMaxSimdAVX512BW, ArgumentMaxSimdAVX512F, ArgumentMaxSimdAVX2, ArgumentMaxSimdAVX, ArgumentMaxSimdSSE42, ArgumentMaxSimdSSE41, ArgumentMaxSimdSSSE3, ArgumentMaxSimdSSE3, ArgumentMaxSimdSSE2, ArgumentMaxSimdNone);
     return 1;
   }
 
@@ -341,6 +356,19 @@ int32_t main(const int32_t argc, char **pArgv)
         argsRemaining--;
         _DisableSleep = true;
       }
+      else if (argsRemaining >= 1 && strncmp(pArgv[argIndex], ArgumentTest, sizeof(ArgumentTest)) == 0)
+      {
+        argIndex++;
+        argsRemaining--;
+        _IsTest = true;
+        _DisableSleep = true;
+        _EncodeRunCount = 1;
+        _DecodeRunCount = 1;
+        _Include32Block = true;
+        _IncludeRaw = true;
+        _IncludeMT = true;
+        _OnlyRelevantCodecs = false;
+      }
       else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentRuns, sizeof(ArgumentRuns)) == 0)
       {
         _RunCount = strtoull(pArgv[argIndex + 1], nullptr, 10);
@@ -432,6 +460,162 @@ int32_t main(const int32_t argc, char **pArgv)
         pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
 #endif
       }
+      else if (argsRemaining >= 2 && strncmp(pArgv[argIndex], ArgumentMaxSimd, sizeof(ArgumentMaxSimd)) == 0)
+      {
+        _DetectCPUFeatures();
+
+        do
+        {
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX512BW, sizeof(ArgumentMaxSimdAVX512BW)) == 0)
+          {
+            if (!avx512BWSupported)
+            {
+              puts("AVX512BW is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            // In future versions with other simd flavours better than avx512 supported, disable them here.
+
+            break;
+          }
+
+          avx512PFSupported = false;
+          avx512ERSupported = false;
+          avx512CDSupported = false;
+          avx512BWSupported = false;
+          avx512DQSupported = false;
+          avx512VLSupported = false;
+          avx512IFMASupported = false;
+          avx512VBMISupported = false;
+          avx512VNNISupported = false;
+          avx512VBMI2Supported = false;
+          avx512POPCNTDQSupported = false;
+          avx512BITALGSupported = false;
+          avx5124VNNIWSupported = false;
+          avx5124FMAPSSupported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX512F, sizeof(ArgumentMaxSimdAVX512F)) == 0)
+          {
+            if (!avx512FSupported)
+            {
+              puts("AVX512F is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            // In future versions with other simd flavours better than avx512 supported, disable them here.
+
+            break;
+          }
+
+          avx512FSupported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX2, sizeof(ArgumentMaxSimdAVX2)) == 0)
+          {
+            if (!avx2Supported)
+            {
+              puts("AVX2 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          avx2Supported = false;
+          fma3Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdAVX, sizeof(ArgumentMaxSimdAVX)) == 0)
+          {
+            if (!avxSupported)
+            {
+              puts("AVX is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          avxSupported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE42, sizeof(ArgumentMaxSimdSSE42)) == 0)
+          {
+            if (!sse42Supported)
+            {
+              puts("SSE4.2 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          sse42Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE41, sizeof(ArgumentMaxSimdSSE41)) == 0)
+          {
+            if (!sse41Supported)
+            {
+              puts("SSE4.1 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          sse41Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSSE3, sizeof(ArgumentMaxSimdSSSE3)) == 0)
+          {
+            if (!ssse3Supported)
+            {
+              puts("SSSE3 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          ssse3Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE3, sizeof(ArgumentMaxSimdSSE3)) == 0)
+          {
+            if (!sse3Supported)
+            {
+              puts("SSE3 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          sse3Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdSSE2, sizeof(ArgumentMaxSimdSSE2)) == 0)
+          {
+            if (!sse2Supported)
+            {
+              puts("SSE2 is not supported by this platform. Aborting.");
+              return 1;
+            }
+
+            break;
+          }
+
+          sse2Supported = false;
+
+          if (strncmp(pArgv[argIndex + 1], ArgumentMaxSimdNone, sizeof(ArgumentMaxSimdNone)) == 0)
+          {
+            printf("%s %s is only intended for testing purposes and will only restrict some codecs to no SIMD\n", ArgumentMaxSimd, ArgumentMaxSimdNone);
+
+            break;
+          }
+
+          printf("Invalid SIMD Variant '%s' specified.", pArgv[argIndex + 1]);
+          return 1;
+
+        } while (false);
+
+        argIndex += 2;
+        argsRemaining -= 2;
+      }
       else
       {
         printf("Invalid Parameter '%s'. Aborting.", pArgv[argIndex]);
@@ -580,22 +764,22 @@ int32_t main(const int32_t argc, char **pArgv)
     size_t encodedSize = 0;
     _RunCount = _EncodeRunCount;
 
-    for (size_t i = 0; i < MaxEncoderCount; i++)
+    for (size_t codecFuncIndex = 0; codecFuncIndex < MaxEncoderCount; codecFuncIndex++)
     {
-      if (_Codecs[codecId].encoders[i].name == nullptr)
+      if (_Codecs[codecId].encoders[codecFuncIndex].name == nullptr)
         break;
 
-      if (_OnlyRelevantCodecs && !_Codecs[codecId].encoders[i].candidateForFastest)
+      if (_OnlyRelevantCodecs && !_Codecs[codecId].encoders[codecFuncIndex].candidateForFastest)
           continue;
 
-      if (strstr(_Codecs[codecId].encoders[i].name, " avx2 ") != nullptr && !avx2Supported)
+      if (strstr(_Codecs[codecId].encoders[codecFuncIndex].name, " avx2 ") != nullptr && !avx2Supported)
       {
-        printf("  %-38s |          | (Skipped; No AVX2 available)\n", _Codecs[codecId].encoders[i].name);
+        printf("  %-38s |          | (Skipped; No AVX2 available)\n", _Codecs[codecId].encoders[codecFuncIndex].name);
         continue;
       }
-      else if (strstr(_Codecs[codecId].encoders[i].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported))
+      else if (strstr(_Codecs[codecId].encoders[codecFuncIndex].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported))
       {
-        printf("  %-38s |          | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].encoders[i].name);
+        printf("  %-38s |          | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].encoders[codecFuncIndex].name);
         continue;
       }
 
@@ -604,7 +788,7 @@ int32_t main(const int32_t argc, char **pArgv)
       if (_RunCount > 1)
       {
         printf("\r  (dry run)");
-        encodedSize = _Codecs[codecId].encoders[i].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist);
+        encodedSize = _Codecs[codecId].encoders[codecFuncIndex].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist);
       }
 
       SleepNs(2500ULL * 1000 * 1000);
@@ -613,7 +797,7 @@ int32_t main(const int32_t argc, char **pArgv)
       {
         const uint64_t startTick = GetCurrentTimeTicks();
         const uint64_t startClock = __rdtsc();
-        encodedSize = _Codecs[codecId].encoders[i].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist);
+        encodedSize = _Codecs[codecId].encoders[codecFuncIndex].func(pUncompressedData, fileSize, pCompressedData, compressedDataCapacity, &hist);
         const uint64_t endClock = __rdtsc();
         const uint64_t endTick = GetCurrentTimeTicks();
 
@@ -622,12 +806,12 @@ int32_t main(const int32_t argc, char **pArgv)
         _NsPerRun[run] = TicksToNs(endTick - startTick);
         _ClocksPerRun[run] = endClock - startClock;
 
-        printf("\r  %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
+        printf("\r  %-38s | %6.2f %% | compressed to %" PRIu64 " bytes (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].encoders[codecFuncIndex].name, encodedSize / (double)fileSize * 100.0, encodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
 
         SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000));
       }
 
-      printf("\r  %-38s | %6.2f %% ", _Codecs[codecId].encoders[i].name, encodedSize / (double)fileSize * 100.0);
+      printf("\r  %-38s | %6.2f %% ", _Codecs[codecId].encoders[codecFuncIndex].name, encodedSize / (double)fileSize * 100.0);
       print_perf_info(fileSize);
 
       if (_Codecs[codecId].decoders[0].func != nullptr)
@@ -635,33 +819,41 @@ int32_t main(const int32_t argc, char **pArgv)
         const size_t decodedSize = _Codecs[codecId].decoders[0].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
 
         if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize))
+        {
           puts("Failed to validate.");
+
+          if (_IsTest)
+            return 1;
+        }
       }
       else
       {
         puts("Unable to validate, no decoder available.");
+
+        if (_IsTest)
+          return 2;
       }
     }
 
     size_t decodedSize = 0;
     _RunCount = _DecodeRunCount;
 
-    for (size_t i = 0; i < MaxDecoderCount; i++)
+    for (size_t codecFuncIndex = 0; codecFuncIndex < MaxDecoderCount; codecFuncIndex++)
     {
-      if (_Codecs[codecId].decoders[i].name == nullptr)
+      if (_Codecs[codecId].decoders[codecFuncIndex].name == nullptr)
         break;
 
-      if (_OnlyRelevantCodecs && !_Codecs[codecId].decoders[i].candidateForFastest)
-          continue;
+      if (_OnlyRelevantCodecs && !_Codecs[codecId].decoders[codecFuncIndex].candidateForFastest)
+        continue;
 
-      if (strstr(_Codecs[codecId].decoders[i].name, " avx2 ") != nullptr && !avx2Supported)
+      if (strstr(_Codecs[codecId].decoders[codecFuncIndex].name, " avx2 ") != nullptr && !avx2Supported)
       {
-        printf("  %-38s |          | (Skipped; No AVX2 available)\n", _Codecs[codecId].decoders[i].name);
+        printf("  %-38s |          | (Skipped; No AVX2 available)\n", _Codecs[codecId].decoders[codecFuncIndex].name);
         continue;
       }
-      else if (strstr(_Codecs[codecId].decoders[i].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported))
+      else if (strstr(_Codecs[codecId].decoders[codecFuncIndex].name, " avx512 ") != nullptr && (!avx512FSupported || !avx512DQSupported || !avx512BWSupported))
       {
-        printf("  %-38s |          | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].decoders[i].name);
+        printf("  %-38s |          | (Skipped, No AVX-512 F/DQ/BW available)\n", _Codecs[codecId].decoders[codecFuncIndex].name);
         continue;
       }
 
@@ -670,7 +862,7 @@ int32_t main(const int32_t argc, char **pArgv)
       if (_RunCount > 1)
       {
         printf("\r(dry run)");
-        decodedSize = _Codecs[codecId].decoders[i].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
+        decodedSize = _Codecs[codecId].decoders[codecFuncIndex].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
       }
 
       SleepNs(2500ULL * 1000 * 1000);
@@ -679,7 +871,7 @@ int32_t main(const int32_t argc, char **pArgv)
       {
         const uint64_t startTick = GetCurrentTimeTicks();
         const uint64_t startClock = __rdtsc();
-        decodedSize = _Codecs[codecId].decoders[i].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
+        decodedSize = _Codecs[codecId].decoders[codecFuncIndex].func(pCompressedData, encodedSize, pDecompressedData, fileSize);
         const uint64_t endClock = __rdtsc();
         const uint64_t endTick = GetCurrentTimeTicks();
 
@@ -688,16 +880,21 @@ int32_t main(const int32_t argc, char **pArgv)
         _NsPerRun[run] = TicksToNs(endTick - startTick);
         _ClocksPerRun[run] = endClock - startClock;
 
-        printf("\r  %-38s |          | decompressed to %" PRIu64 " bytes. (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].decoders[i].name, decodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
+        printf("\r  %-38s |          | decompressed to %" PRIu64 " bytes. (%6.3f clocks/byte, %5.2f MiB/s)", _Codecs[codecId].decoders[codecFuncIndex].name, decodedSize, (endClock - startClock) / (double)fileSize, (fileSize / (1024.0 * 1024.0)) / (TicksToNs(endTick - startTick) * 1e-9));
 
         SleepNs(rans_min(_NsPerRun[run] * 2ULL, 500ULL * 1000 * 1000));
       }
 
-      printf("\r  %-38s |          ", _Codecs[codecId].decoders[i].name);
+      printf("\r  %-38s |          ", _Codecs[codecId].decoders[codecFuncIndex].name);
       print_perf_info(fileSize);
 
       if (decodedSize != fileSize || !Validate(pDecompressedData, pUncompressedData, fileSize))
+      {
         puts("\nFailed to validate.");
+
+        if (_IsTest)
+          return 1;
+      }
     }
   }
 

From 290150d7b2a1d68f0e5a6154888fca2ba2d67d47 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:37:52 +0200
Subject: [PATCH 29/34] Mark downloaded hsrans binary as executable in clang/gcc workflows

---
 .github/workflows/clang.yml | 4 ++++
 .github/workflows/gcc.yml   | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index 40c9951..f48b2ec 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -33,6 +33,10 @@ jobs:
     - uses: actions/download-artifact@v3
       with:
         name: build-clang64
+
+    - name: Mark as Executable
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: chmod +x hsrans
     
     - name: Download Samples
       working-directory: ${{env.GITHUB_WORKSPACE}}
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index 01d3de9..dd534c7 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -34,6 +34,10 @@ jobs:
     - uses: actions/download-artifact@v3
       with:
         name: build-gcc64
+
+    - name: Mark as Executable
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: chmod +x hsrans
     
     - name: Download Samples
       working-directory: ${{env.GITHUB_WORKSPACE}}

From 4674302c01baed1ac92717232496b0b530194ce5 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:40:17 +0200
Subject: [PATCH 30/34] Remove --min-time/--runs test arguments from clang/gcc
 workflows (they belong to a different application)

---
 .github/workflows/clang.yml | 24 ++++++++++++------------
 .github/workflows/gcc.yml   | 24 ++++++++++++------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index f48b2ec..f83692c 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -44,48 +44,48 @@ jobs:
 
     - name: Test sample0 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2
       
     - name: Test sample0 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2
       
     - name: Test sample0 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
     - name: Test sample1 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2
       
     - name: Test sample1 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2
       
     - name: Test sample1 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
     - name: Test sample2 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2
       
     - name: Test sample2 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2
       
     - name: Test sample2 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
\ No newline at end of file
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
\ No newline at end of file
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index dd534c7..1d8aa74 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -45,48 +45,48 @@ jobs:
 
     - name: Test sample0 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2
       
     - name: Test sample0 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2
       
     - name: Test sample0 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
     - name: Test sample1 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2
       
     - name: Test sample1 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2
       
     - name: Test sample1 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
     - name: Test sample2 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2
       
     - name: Test sample2 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2
       
     - name: Test sample2 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd none

From 14c4a70f463838cd422dc2a3b769d22eaa9138d5 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:48:01 +0200
Subject: [PATCH 31/34] Remove --min-time/--runs test arguments from msbuild workflow as well

---
 .github/workflows/msbuild.yml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml
index 4b58ad4..6a0bb17 100644
--- a/.github/workflows/msbuild.yml
+++ b/.github/workflows/msbuild.yml
@@ -51,50 +51,50 @@ jobs:
 
     - name: Test sample0 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2
       
     - name: Test sample0 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2
       
     - name: Test sample0 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
     - name: Test sample1 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2
       
     - name: Test sample1 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2
       
     - name: Test sample1 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
     - name: Test sample2 AVX512BW
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx512bw
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd avx2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2
       
     - name: Test sample2 SSE4.2
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd sse4.2
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2
       
     - name: Test sample2 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --min-time 0 --runs 0 --max-simd none
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
 
 

From ff42f64868f25636b79d77c5c1ed111f24261944 Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:51:24 +0200
Subject: [PATCH 32/34] excluding avx-512 bw, as that's not supported on the
 build servers (sometimes?)

---
 .github/workflows/clang.yml   | 20 ++++++++++----------
 .github/workflows/gcc.yml     | 18 +++++++++---------
 .github/workflows/msbuild.yml | 20 +++++++++-----------
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index f83692c..251f38a 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -42,9 +42,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: git clone https://github.com/rainerzufalldererste/rle_samples.git
 
-    - name: Test sample0 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
+    #- name: Test sample0 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -58,9 +58,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
-    - name: Test sample1 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
+    #- name: Test sample1 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -74,9 +74,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
-    - name: Test sample2 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
+    #- name: Test sample2 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -88,4 +88,4 @@ jobs:
       
     - name: Test sample2 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
\ No newline at end of file
+      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index 1d8aa74..633f12b 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -43,9 +43,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: git clone https://github.com/rainerzufalldererste/rle_samples.git
 
-    - name: Test sample0 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
+    #- name: Test sample0 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -59,9 +59,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
-    - name: Test sample1 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
+    #- name: Test sample1 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -75,9 +75,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
-    - name: Test sample2 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
+    #- name: Test sample2 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml
index 6a0bb17..7aaeb68 100644
--- a/.github/workflows/msbuild.yml
+++ b/.github/workflows/msbuild.yml
@@ -49,9 +49,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: git clone https://github.com/rainerzufalldererste/rle_samples.git
 
-    - name: Test sample0 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
+    #- name: Test sample0 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -65,9 +65,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
-    - name: Test sample1 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
+    #- name: Test sample1 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -81,9 +81,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
-    - name: Test sample2 AVX512BW
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
+    #- name: Test sample2 AVX512BW
+    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -96,5 +96,3 @@ jobs:
     - name: Test sample2 No-SIMD
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample2.bin --test --max-simd none
-
-

From 62a93571eb43d40c29703c4b558c74a5b3b83c7a Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 17:57:58 +0200
Subject: [PATCH 33/34] Removing GCC tests because they're apparently just
 hanging??? adding native variant in case AVX-512 BW _is_ available

---
 .github/workflows/clang.yml   |  18 ++---
 .github/workflows/gcc.yml     | 128 +++++++++++++++++-----------------
 .github/workflows/msbuild.yml |  18 ++---
 3 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index 251f38a..4bca8f3 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -42,9 +42,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: git clone https://github.com/rainerzufalldererste/rle_samples.git
 
-    #- name: Test sample0 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
+    - name: Test sample0
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -58,9 +58,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
-    #- name: Test sample1 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
+    - name: Test sample1
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -74,9 +74,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
-    #- name: Test sample2 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
+    - name: Test sample2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index 633f12b..29f1eba 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -26,67 +26,67 @@ jobs:
         name: build-gcc64
         path: builds/bin/hsrans
 
-  test:
-    runs-on: ubuntu-latest
-    needs: build
-
-    steps:
-    - uses: actions/download-artifact@v3
-      with:
-        name: build-gcc64
-
-    - name: Mark as Executable
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: chmod +x hsrans
-    
-    - name: Download Samples
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: git clone https://github.com/rainerzufalldererste/rle_samples.git
-
-    #- name: Test sample0 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
-
-    - name: Test sample0 AVX2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2
-      
-    - name: Test sample0 SSE4.2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2
-      
-    - name: Test sample0 No-SIMD
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample0.bin --test --max-simd none
-
-    #- name: Test sample1 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
-
-    - name: Test sample1 AVX2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2
-      
-    - name: Test sample1 SSE4.2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2
-      
-    - name: Test sample1 No-SIMD
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample1.bin --test --max-simd none
-
-    #- name: Test sample2 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
-
-    - name: Test sample2 AVX2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2
-      
-    - name: Test sample2 SSE4.2
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2
-      
-    - name: Test sample2 No-SIMD
-      working-directory: ${{env.GITHUB_WORKSPACE}}
-      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
+#  test:
+#    runs-on: ubuntu-latest
+#    needs: build
+#
+#    steps:
+#    - uses: actions/download-artifact@v3
+#      with:
+#        name: build-gcc64
+#
+#    - name: Mark as Executable
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: chmod +x hsrans
+#    
+#    - name: Download Samples
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: git clone https://github.com/rainerzufalldererste/rle_samples.git
+#
+#    #- name: Test sample0
+#    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+#    #  run: ./hsrans rle_samples/sample0.bin --test
+#
+#    - name: Test sample0 AVX2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample0.bin --test --max-simd avx2
+#      
+#    - name: Test sample0 SSE4.2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample0.bin --test --max-simd sse4.2
+#      
+#    - name: Test sample0 No-SIMD
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample0.bin --test --max-simd none
+#
+#    #- name: Test sample1
+#    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+#    #  run: ./hsrans rle_samples/sample1.bin --test
+#
+#    - name: Test sample1 AVX2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample1.bin --test --max-simd avx2
+#      
+#    - name: Test sample1 SSE4.2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample1.bin --test --max-simd sse4.2
+#      
+#    - name: Test sample1 No-SIMD
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample1.bin --test --max-simd none
+#
+#    #- name: Test sample2
+#    #  working-directory: ${{env.GITHUB_WORKSPACE}}
+#    #  run: ./hsrans rle_samples/sample2.bin --test
+#
+#    - name: Test sample2 AVX2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample2.bin --test --max-simd avx2
+#      
+#    - name: Test sample2 SSE4.2
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample2.bin --test --max-simd sse4.2
+#      
+#    - name: Test sample2 No-SIMD
+#      working-directory: ${{env.GITHUB_WORKSPACE}}
+#      run: ./hsrans rle_samples/sample2.bin --test --max-simd none
diff --git a/.github/workflows/msbuild.yml b/.github/workflows/msbuild.yml
index 7aaeb68..3bd60fa 100644
--- a/.github/workflows/msbuild.yml
+++ b/.github/workflows/msbuild.yml
@@ -49,9 +49,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: git clone https://github.com/rainerzufalldererste/rle_samples.git
 
-    #- name: Test sample0 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample0.bin --test --max-simd avx512bw
+    - name: Test sample0
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample0.bin --test
 
     - name: Test sample0 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -65,9 +65,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample0.bin --test --max-simd none
 
-    #- name: Test sample1 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample1.bin --test --max-simd avx512bw
+    - name: Test sample1
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample1.bin --test
 
     - name: Test sample1 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}
@@ -81,9 +81,9 @@ jobs:
       working-directory: ${{env.GITHUB_WORKSPACE}}
       run: ./hsrans rle_samples/sample1.bin --test --max-simd none
 
-    #- name: Test sample2 AVX512BW
-    #  working-directory: ${{env.GITHUB_WORKSPACE}}
-    #  run: ./hsrans rle_samples/sample2.bin --test --max-simd avx512bw
+    - name: Test sample2
+      working-directory: ${{env.GITHUB_WORKSPACE}}
+      run: ./hsrans rle_samples/sample2.bin --test
 
     - name: Test sample2 AVX2
       working-directory: ${{env.GITHUB_WORKSPACE}}

From a6a054f5f46e742722a14e2ebc94af3a00386b7c Mon Sep 17 00:00:00 2001
From: Christoph Stiller <c.stiller@live.de>
Date: Sat, 8 Jul 2023 18:07:39 +0200
Subject: [PATCH 34/34] Center aligning checkmarks

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5fa70cb..29ebdcc 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@
 
 ### [enwik8](http://mattmahoney.net/dc/textdata.html) (Wikipedia Extract, 100,000,000 Bytes)
 | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
-| -- | --: | --: | --: | --: | --: |
+| -- | :-: | --: | --: | --: | --: |
 | **rANS32x64 16w 11 (raw)**                | ✔️ |  64.48 % |   336.81 MiB/s |   1.42 clk/byte |  3,018.02 MiB/s |
 | **rANS32x64 16w 10 (raw)**                | ✔️ |  65.97 % |   335.28 MiB/s |   1.42 clk/byte |  3,013.45 MiB/s |
 | **rANS32x64 16w 12 (raw)**                | ✔️ |  63.83 % |   347.90 MiB/s |   1.42 clk/byte |  3,009.18 MiB/s |
@@ -77,7 +77,7 @@
 
 ### [x-ray](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (X-Ray Medical Image, Part of the Silesia Corpus)
 | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
-| -- | --: | --: | --: | --: | --: |
+| -- | :-: | --: | --: | --: | --: |
 | **rANS32x64 16w 11 (raw)**   | ✔️ |  82.60 % |   311.60 MiB/s |   1.39 clk/byte |  3,079.98 MiB/s |
 | **rANS32x64 16w 12**         | ✔️ |  80.17 % |   193.60 MiB/s |   1.41 clk/byte |  3,048.15 MiB/s |
 | **rANS32x64 16w 12 (raw)**   | ✔️ |  82.57 % |   308.10 MiB/s |   1.41 clk/byte |  3,041.07 MiB/s |
@@ -126,7 +126,7 @@
 
 ### [mozilla](https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia) (Tarred executables of Mozilla 1.0, Part of the Silesia Corpus)
 | Codec Type | Open-<br/>Source | Ratio | Encoder<br/>Throughput | Decoder<br/>Clocks/Byte | Decoder<br/>Throughput |
-| -- | --: | --: | --: | --: | --: |
+| -- | :-: | --: | --: | --: | --: |
 | **rANS32x64 16w 11 (raw)**                | ✔️ |  77.82 % |   309.39 MiB/s |   1.44 clk/byte |  2,978.20 MiB/s |
 | TurboANX 63                               | ❌ |  70.1  % |   965.97 MiB/s |   -             |  2,959.13 MiB/s |
 | **rANS32x64 16w 12 (raw)**                | ✔️ |  77.79 % |   308.29 MiB/s |   1.45 clk/byte |  2,946.52 MiB/s |