Skip to content

Commit

Permalink
fix a few more bugs and improve validation testing
Browse files (browse the repository at this point in the history)
  • Loading branch information
bashbaug committed Jan 17, 2024
1 parent d4eb405 commit b6be2d4
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 25 deletions.
18 changes: 9 additions & 9 deletions samples/99_matrixexperiments/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ static void go_naive(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -222,7 +222,7 @@ static void go_dpas_rowmajor(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -276,7 +276,7 @@ static void go_dpas_rowmajor_tiled(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -324,7 +324,7 @@ static void go_dpas_vnni(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -378,7 +378,7 @@ static void go_dpas_vnni_tiled(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -426,7 +426,7 @@ static void go_dpas_blockread_rowmajor(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -480,7 +480,7 @@ static void go_dpas_blockread_rowmajor_tiled(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -528,7 +528,7 @@ static void go_dpas_blockread_vnni(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down Expand Up @@ -582,7 +582,7 @@ static void go_dpas_blockread_vnni_tiled(
kernel.setArg(2, B);
kernel.setArg(3, static_cast<cl_int>(K));

queue.enqueueFillBuffer(C, 0, 0, C_ref.size());
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));

float best = 999.0f;
for (int test = 0; test < testIterations; test++) {
Expand Down
8 changes: 4 additions & 4 deletions samples/99_matrixexperiments/matrix_helpers.cl
Original file line number Diff line number Diff line change
Expand Up @@ -464,19 +464,19 @@ uint8 intel_subgroup_block_read_u32_m8k16(const __global void* base_address, int
return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}

void intel_subgroup_block_write_u32_m1k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
void intel_subgroup_block_write_u32_m1k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
{
__builtin_IB_subgroup_block_write_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
void intel_subgroup_block_write_u32_m2k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data)
void intel_subgroup_block_write_u32_m2k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data)
{
__builtin_IB_subgroup_block_write_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
void intel_subgroup_block_write_u32_m4k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data)
void intel_subgroup_block_write_u32_m4k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data)
{
__builtin_IB_subgroup_block_write_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
void intel_subgroup_block_write_u32_m8k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data)
void intel_subgroup_block_write_u32_m8k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data)
{
__builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
Expand Down
8 changes: 4 additions & 4 deletions samples/99_matrixexperiments/matrix_kernel_tiled.cl
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
{
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
const int M = get_global_size(1) * tM * MM;
const int N = get_global_size(0) * NN;
const int m = get_group_id(1) * tM * MM;
const int n = get_group_id(0) * tN * NN;
Expand Down Expand Up @@ -220,7 +220,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN

for (int mm = 0; mm < MM; mm++) {
for (int nn = 0; nn < NN; nn++) {
intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn]));
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn]));
}
}
}
Expand All @@ -230,7 +230,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl
{
const int tM = 8;
const int tN = 16;
const int M = get_global_size(1) * tM;
const int M = get_global_size(1) * tM * MM;
const int N = get_global_size(0) * NN;
const int m = get_group_id(1) * tM * MM;
const int n = get_group_id(0) * tN * NN;
Expand Down Expand Up @@ -262,7 +262,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl

for (int mm = 0; mm < MM; mm++) {
for (int nn = 0; nn < NN; nn++) {
intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn]));
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn]));
}
}
}
Expand Down
16 changes: 8 additions & 8 deletions samples/99_matrixexperiments/matrix_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global usho
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m1k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -386,7 +386,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global usho
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m2k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -406,7 +406,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global usho
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m4k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -426,7 +426,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global usho
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -446,7 +446,7 @@ kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort*
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m1k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -466,7 +466,7 @@ kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort*
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m2k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -486,7 +486,7 @@ kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort*
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m4k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
}

__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
Expand All @@ -506,7 +506,7 @@ kernel void bfloat16_dpas_blockread_vnni_m8_n16(global float* C, global ushort*
sum = mat_mul_sg16(aData, bData, sum);
}

intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
}

#endif // cl_intel_subgroup_extended_block_read
Expand Down

0 comments on commit b6be2d4

Please sign in to comment.