Skip to content

Commit

Permalink
tell the compiler K is always greater than zero
Browse files Browse the repository at this point in the history
This probably won't affect performance, but it does enable more
concise code, because the compiler will know the K loop will always
execute at least once.
  • Loading branch information
bashbaug committed Mar 7, 2024
1 parent 8d23a8c commit 133fb58
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
7 changes: 7 additions & 0 deletions samples/99_matrixexperiments/matrix_helpers.cl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ float8 activation(float8 f)
return res;
}

// Portability shim: the kernels in this file call __builtin_assume(K > 0)
// to tell the compiler the K loop executes at least once.
// __has_builtin is a Clang extension (also in GCC 10+); define it to 0 on
// compilers that lack it so the probe below is well-formed.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
// If the compiler has no __builtin_assume (e.g. GCC, which only provides
// __builtin_expect), stub it out as a no-op so the assume calls still compile.
// NOTE: the probe must name __builtin_assume -- the builtin actually used --
// not __builtin_expect; and an empty expansion is only valid for assume
// (__builtin_expect takes two arguments and must yield its first).
#if __has_builtin(__builtin_assume) == 0
#define __builtin_assume(x)
#endif

#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short)

typedef global ushort* global_aligned_ushort_ptr __attribute__((align_value(4)));
Expand Down
6 changes: 6 additions & 0 deletions samples/99_matrixexperiments/matrix_kernel_tiled.cl
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ void HELPER_NAME(atile_load_rowmajor_sg8, MM, NN)(global ushort* A, int tM, int
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -173,6 +174,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global fl
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -282,6 +284,7 @@ void HELPER_NAME(atile_load_rowmajor, MM, NN)(global ushort* A, int tM, int K, i
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -343,6 +346,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global f
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -599,6 +603,7 @@ __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_si
kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)

{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM * MM;
Expand Down Expand Up @@ -658,6 +663,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM * MM;
Expand Down
24 changes: 24 additions & 0 deletions samples/99_matrixexperiments/matrix_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ kernel void bfloat16_naive(global float* C, global ushort* A, global ushort* B,
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -59,6 +60,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -79,6 +81,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -99,6 +102,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -121,6 +125,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -141,6 +146,7 @@ kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -161,6 +167,7 @@ kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -181,6 +188,7 @@ kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -205,6 +213,7 @@ kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -225,6 +234,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -245,6 +255,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -265,6 +276,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -287,6 +299,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -307,6 +320,7 @@ kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -327,6 +341,7 @@ kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -347,6 +362,7 @@ kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -369,6 +385,7 @@ kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int M = get_global_size(1);
Expand All @@ -390,6 +407,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -411,6 +429,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -432,6 +451,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -453,6 +473,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -474,6 +495,7 @@ kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -495,6 +517,7 @@ kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -516,6 +539,7 @@ kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand Down

0 comments on commit 133fb58

Please sign in to comment.