diff --git a/samples/99_matrixexperiments/matrix_helpers.cl b/samples/99_matrixexperiments/matrix_helpers.cl
index 97b2ce5..1164c24 100644
--- a/samples/99_matrixexperiments/matrix_helpers.cl
+++ b/samples/99_matrixexperiments/matrix_helpers.cl
@@ -601,6 +601,8 @@ enum LSC_LDCC {
 typedef ushort __attribute__((ext_vector_type(32))) ushort32;
 typedef ushort __attribute__((ext_vector_type(64))) ushort64;
 
+typedef uint __attribute__((ext_vector_type(32))) uint32; // vector of 32 uint elements (not a 32-bit scalar)
+
 // Define block reads, prefetches, and writes.  These are supported by the hardware but are not in the headers:
 
 ushort   __builtin_IB_subgroup_block_read_flat_u16_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
@@ -616,6 +618,11 @@ ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(long baseoffset, int
 uint8  __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
 uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
 
+uint16  __builtin_IB_subgroup_block_read_flat_transform_u16_k32(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
+// The v2 variants return twice as many elements as the matching single-block read; presumably two blocks per call - TODO confirm.
+uint16  __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
+uint32  __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
+
 
 void __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
 void __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
@@ -693,43 +700,85 @@ uint16 intel_subgroup_block_read_u32_m16k16(const __global void* base_address, i
     return __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
 }
 
+// Each block is K rows x N columns, where the K rows have been VNNI transformed.
+int8 intel_subgroup_block_read_transform_u16_k16n16(__global void *base_address, int width, int height, int pitch, int2 coord)
+{
+    // Note: the wrapped function is in the headers, but is named confusingly and returns unsigned integers rather than signed integers, so its result is reinterpreted with as_int8:
+    return as_int8(intel_subgroup_block_read_transform_u16_k16(base_address, width, height, pitch, coord));
+}
+int16 intel_subgroup_block_read_transform_u16_k32n16(const __global void *base_address, int width, int height, int pitch, int2 coord)
+{   // Forwards to the flat builtin, which takes the address as a long and width/height/pitch minus one; its uint16 result is reinterpreted as int16.
+    return as_int16(__builtin_IB_subgroup_block_read_flat_transform_u16_k32(as_long(base_address), width - 1, height - 1, pitch - 1, coord));
+}
+int16 intel_subgroup_block_read_transform_u16_k16n16v2(const __global void *base_address, int width, int height, int pitch, int2 coord)
+{   // Forwards to the flat v2 builtin, which takes the address as a long and width/height/pitch minus one; its uint16 result is reinterpreted as int16.
+    return as_int16(__builtin_IB_subgroup_block_read_flat_transform_u16_k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord));
+}
+void intel_subgroup_block_read_transform_u16_k32n16v2(const __global void *base_address, int width, int height, int pitch, int2 coord, int8 dst[2][2])
+{   // Reads 32 uints through the flat builtin (address as a long; width/height/pitch minus one) and writes them to dst as four int8 pieces.
+    uint32 tmp = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
+    dst[0][0] = as_int8(tmp.lo.lo); // elements 0-7 of the result
+    dst[0][1] = as_int8(tmp.lo.hi); // elements 8-15
+    dst[1][0] = as_int8(tmp.hi.lo); // elements 16-23
+    dst[1][1] = as_int8(tmp.hi.hi); // elements 24-31
+}
+
+
 #define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
 
 void intel_subgroup_block_prefetch_u16_m1k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m2k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m4k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m8k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m8k16v2(__global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // prefetch only when PREFETCH_DEFAULT is defined, else empty body. NOTE(review): base_address is not const here, unlike the sibling prefetch helpers.
     __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m16k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m32k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m16k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 void intel_subgroup_block_prefetch_u16_m32k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord)
 {
+#if defined(PREFETCH_DEFAULT) // the prefetch is only issued when PREFETCH_DEFAULT is defined; otherwise this helper has an empty body
     __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
+#endif // defined(PREFETCH_DEFAULT)
 }
 
 
diff --git a/samples/99_matrixexperiments/matrix_kernel_tiled.cl b/samples/99_matrixexperiments/matrix_kernel_tiled.cl
index f1bb189..fda52ca 100644
--- a/samples/99_matrixexperiments/matrix_kernel_tiled.cl
+++ b/samples/99_matrixexperiments/matrix_kernel_tiled.cl
@@ -453,11 +453,42 @@ void HELPER_NAME(atile_load_blockread_rowmajor, MM, NN)(global ushort* A, int tM
     }
 }
 
+// Fills bData[KK][NN] from B with transforming block reads, picking the widest read the parity of KK and NN allows.  TODO: consider swapping KK and NN order!
 void HELPER_NAME(btile_load_blockread_rowmajor, MM, NN)(global ushort* B, int tN, int K, int N, int k, int n, int8 bData[KK][NN])
 {
-    for (int kk = 0; kk < KK; kk++) {
-        for (int nn = 0; nn < NN; nn++) {
-            bData[kk][nn] = as_int8(intel_subgroup_block_read_transform_u16_k16(B, N * sizeof(ushort), K, N * sizeof(ushort), (int2)(n + nn * tN, k + kk * tK)));
+    if (KK % 2 == 0 && NN % 2 == 0) { // KK and NN both even: each read fills a 2x2 group of bData entries
+        for (int kk = 0; kk < KK; kk+=2) {
+            for (int nn = 0; nn < NN; nn+=2) {
+                int8 tmp[2][2]; // indexed [tnn][tkk] in the copy below, i.e. transposed relative to bData[kk][nn]
+                intel_subgroup_block_read_transform_u16_k32n16v2(B, N * sizeof(ushort), K, N * sizeof(ushort), (int2)(n + nn * tN, k + kk * tK), tmp);
+                for (int tnn = 0; tnn < 2; tnn++) {
+                    for (int tkk = 0; tkk < 2; tkk++) {
+                        bData[kk + tkk][nn + tnn] = tmp[tnn][tkk];
+                    }
+                }
+            }
+        }
+    } else if (NN % 2 == 0) { // only NN even: each read fills entries nn and nn + 1 of one kk row
+        for (int kk = 0; kk < KK; kk++) {
+            for (int nn = 0; nn < NN; nn+=2) {
+                int16 bTemp = intel_subgroup_block_read_transform_u16_k16n16v2(B, N * sizeof(ushort), K, N * sizeof(ushort), (int2)(n + nn * tN, k + kk * tK));
+                bData[kk][nn + 0] = bTemp.lo;
+                bData[kk][nn + 1] = bTemp.hi;
+            }
+        }
+    } else if (KK % 2 == 0) { // only KK even: each read fills entries kk and kk + 1 of one nn column
+        for (int kk = 0; kk < KK; kk+=2) {
+            for (int nn = 0; nn < NN; nn++) {
+                int16 bTemp = intel_subgroup_block_read_transform_u16_k32n16(B, N * sizeof(ushort), K, N * sizeof(ushort), (int2)(n + nn * tN, k + kk * tK));
+                bData[kk + 0][nn] = bTemp.lo;
+                bData[kk + 1][nn] = bTemp.hi;
+            }
+        }
+    } else { // KK and NN both odd: one read per bData entry
+        for (int kk = 0; kk < KK; kk++) {
+            for (int nn = 0; nn < NN; nn++) {
+                bData[kk][nn] = intel_subgroup_block_read_transform_u16_k16n16(B, N * sizeof(ushort), K, N * sizeof(ushort), (int2)(n + nn * tN, k + kk * tK));
+            }
         }
     }
 }