Skip to content

Commit

Permalink
tell the compiler K is always greater than zero
Browse files Browse the repository at this point in the history
This probably won't affect performance, but it does enable more
concise code, because the compiler will know the K loop will always
execute at least once.
  • Loading branch information
bashbaug committed Mar 7, 2024
1 parent 8d23a8c commit 133fb58
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
7 changes: 7 additions & 0 deletions samples/99_matrixexperiments/matrix_helpers.cl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ float8 activation(float8 f)
return res;
}

// Portability shim: the kernels in this file call __builtin_assume(K > 0)
// to tell the compiler the K loop executes at least once.
// __has_builtin is a Clang extension (also in GCC 10+); define it to 0 on
// compilers that lack it so the probe below is well-formed.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
// If the compiler has no __builtin_assume (e.g. GCC, which only provides
// __builtin_expect), stub it out as a no-op so the assume calls still compile.
// NOTE: the probe must name __builtin_assume -- the builtin actually used --
// not __builtin_expect; and an empty expansion is only valid for assume
// (__builtin_expect takes two arguments and must yield its first).
#if __has_builtin(__builtin_assume) == 0
#define __builtin_assume(x)
#endif

#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short)

typedef global ushort* global_aligned_ushort_ptr __attribute__((align_value(4)));
Expand Down
6 changes: 6 additions & 0 deletions samples/99_matrixexperiments/matrix_kernel_tiled.cl
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ void HELPER_NAME(atile_load_rowmajor_sg8, MM, NN)(global ushort* A, int tM, int
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -173,6 +174,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 8, MM, NN)(global fl
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 8, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -282,6 +284,7 @@ void HELPER_NAME(atile_load_rowmajor, MM, NN)(global ushort* A, int tM, int K, i
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -343,6 +346,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_rowmajor_tiled, 8, 16, MM, NN)(global f
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_vnni_tiled, 8, 16, MM, NN)(global float* C, global_aligned_ushort_ptr A, global_aligned_ushort_ptr B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0) * NN;
Expand Down Expand Up @@ -599,6 +603,7 @@ __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_si
kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)

{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM * MM;
Expand Down Expand Up @@ -658,6 +663,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16 * SGS_PER_WG_X, SGS_PER_WG_Y, 1)))
kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM * MM;
Expand Down
24 changes: 24 additions & 0 deletions samples/99_matrixexperiments/matrix_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ kernel void bfloat16_naive(global float* C, global ushort* A, global ushort* B,
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -59,6 +60,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -79,6 +81,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -99,6 +102,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -121,6 +125,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n8(global float* C, global ushort* A, glob
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -141,6 +146,7 @@ kernel void bfloat16_dpas_vnni_m1_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -161,6 +167,7 @@ kernel void bfloat16_dpas_vnni_m2_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -181,6 +188,7 @@ kernel void bfloat16_dpas_vnni_m4_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(8))) __attribute__((reqd_work_group_size(8, 1, 1)))
kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 8;
const int N = get_global_size(0);
Expand All @@ -205,6 +213,7 @@ kernel void bfloat16_dpas_vnni_m8_n8(global float* C, global ushort* A, global u
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -225,6 +234,7 @@ kernel void bfloat16_dpas_rowmajor_m1_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -245,6 +255,7 @@ kernel void bfloat16_dpas_rowmajor_m2_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -265,6 +276,7 @@ kernel void bfloat16_dpas_rowmajor_m4_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -287,6 +299,7 @@ kernel void bfloat16_dpas_rowmajor_m8_n16(global float* C, global ushort* A, glo
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -307,6 +320,7 @@ kernel void bfloat16_dpas_vnni_m1_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -327,6 +341,7 @@ kernel void bfloat16_dpas_vnni_m2_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -347,6 +362,7 @@ kernel void bfloat16_dpas_vnni_m4_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int N = get_global_size(0);
Expand All @@ -369,6 +385,7 @@ kernel void bfloat16_dpas_vnni_m8_n16(global float* C, global ushort* A, global
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int M = get_global_size(1);
Expand All @@ -390,6 +407,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -411,6 +429,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -432,6 +451,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -453,6 +473,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global usho
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 1;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -474,6 +495,7 @@ kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 2;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -495,6 +517,7 @@ kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 4;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand All @@ -516,6 +539,7 @@ kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort*
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
kernel void bfloat16_dpas_blockread_vnni_m8_n16(global float* C, global ushort* A, global ushort* B, int K)
{
__builtin_assume(K > 0); // Always at least one K iteration.
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
Expand Down

0 comments on commit 133fb58

Please sign in to comment.