From 133fb58a22d9a1a2d2e1f59b99976b29a4737bc5 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh
Date: Thu, 7 Mar 2024 14:59:55 -0800
Subject: [PATCH] tell the compiler K is always greater than zero

This probably won't affect performance, but it does enable more
concise code, because the compiler will know the K loop will always
execute at least once.
---
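Note (illustration only; this sketch and the kernel name below are
hypothetical, not part of the patch): asserting K > 0 lets the compiler
drop the pre-loop bounds check and bottom-test the loop, e.g.:

    // Hypothetical reduction kernel, for illustration only.
    kernel void sum_rows(global float* dst, global const float* src, int K)
    {
        __builtin_assume(K > 0);    // Undefined behavior if K <= 0, so
                                    // only assert what the host guarantees.

        // Since at least one iteration is now guaranteed, the compiler
        // may rotate this for loop into a do-while with no initial
        // K > 0 test, saving a compare-and-branch before the loop body.
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += src[get_global_id(0) * K + k];
        }
        dst[get_global_id(0)] = sum;
    }

On compilers without the builtin, the fallback added to
matrix_helpers.cl compiles the assumption away to nothing.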
 .../99_matrixexperiments/matrix_helpers.cl    |  7 ++++++
 .../matrix_kernel_tiled.cl                    |  6 +++++
 .../99_matrixexperiments/matrix_kernels.cl    | 24 +++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/samples/99_matrixexperiments/matrix_helpers.cl b/samples/99_matrixexperiments/matrix_helpers.cl
index e8a08ff..dd7dbba 100644
--- a/samples/99_matrixexperiments/matrix_helpers.cl
+++ b/samples/99_matrixexperiments/matrix_helpers.cl
@@ -51,6 +51,13 @@ float8 activation(float8 f)
     return res;
 }
 
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+#if __has_builtin(__builtin_assume) == 0
+#define __builtin_assume(x)
+#endif
+
 #if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short)
 
 typedef global ushort* global_aligned_ushort_ptr __attribute__((align_value(4)));
diff --git a/samples/99_matrixexperiments/matrix_kernel_tiled.cl b/samples/99_matrixexperiments/matrix_kernel_tiled.cl
index beea5ab..8027474 100644
--- a/samples/99_matrixexperiments/matrix_kernel_tiled.cl
+++ b/samples/99_matrixexperiments/matrix_kernel_tiled.cl
@@ -112,6 +112,7 @@ void HELPER_NAME(atile_load_rowmajor_sg8, MM, NN)(global ushort* A, int tM, int
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 8;
     const int N = get_global_size(0) * NN;
@@ -173,6 +174,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global fl
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 8;
     const int N = get_global_size(0) * NN;
@@ -282,6 +284,7 @@ void HELPER_NAME(atile_load_rowmajor, MM, NN)(global ushort* A, int tM, int K, i
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int N = get_global_size(0) * NN;
@@ -343,6 +346,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global f
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int N = get_global_size(0) * NN;
@@ -599,6 +603,7 @@ __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_si
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int M = get_global_size(1) * tM * MM;
@@ -658,6 +663,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
 kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int M = get_global_size(1) * tM * MM;
diff --git a/samples/99_matrixexperiments/matrix_kernels.cl b/samples/99_matrixexperiments/matrix_kernels.cl
index 42e3181..f225455 100644
--- a/samples/99_matrixexperiments/matrix_kernels.cl
+++ b/samples/99_matrixexperiments/matrix_kernels.cl
@@ -39,6 +39,7 @@ kernel void bfloat16_naive(global float* C, global ushort* A, global ushort* B,
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -59,6 +60,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, glob
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -79,6 +81,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, glob
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -99,6 +102,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, glob
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -121,6 +125,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, glob
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -141,6 +146,7 @@ kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global u
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -161,6 +167,7 @@ kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global u
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -181,6 +188,7 @@ kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global u
 __attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
 kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 8;
     const int N = get_global_size(0);
@@ -205,6 +213,7 @@ kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global u
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -225,6 +234,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, glo
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -245,6 +255,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, glo
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -265,6 +276,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, glo
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -287,6 +299,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, glo
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -307,6 +320,7 @@ kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -327,6 +341,7 @@ kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -347,6 +362,7 @@ kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int N = get_global_size(0);
@@ -369,6 +385,7 @@ kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 16;
     const int M = get_global_size(1);
@@ -390,6 +407,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global usho
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -411,6 +429,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global usho
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -432,6 +451,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global usho
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -453,6 +473,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global usho
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 1;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -474,6 +495,7 @@ kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort*
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 2;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -495,6 +517,7 @@ kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort*
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 4;
     const int tN = 16;
     const int M = get_global_size(1) * tM;
@@ -516,6 +539,7 @@ kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort*
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
 kernel void bfloat16_dpas_blockread_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
 {
+    __builtin_assume(K > 0);    // Always at least one K iteration.
     const int tM = 8;
     const int tN = 16;
     const int M = get_global_size(1) * tM;