diff --git a/samples/99_matrixexperiments/main.cpp b/samples/99_matrixexperiments/main.cpp index 42b3cbe..044739e 100644 --- a/samples/99_matrixexperiments/main.cpp +++ b/samples/99_matrixexperiments/main.cpp @@ -174,7 +174,7 @@ static void go_naive( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -222,7 +222,7 @@ static void go_dpas_rowmajor( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -276,7 +276,7 @@ static void go_dpas_rowmajor_tiled( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -324,7 +324,7 @@ static void go_dpas_vnni( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -378,7 +378,7 @@ static void go_dpas_vnni_tiled( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -426,7 +426,7 @@ static void go_dpas_blockread_rowmajor( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -480,7 +480,7 @@ static void go_dpas_blockread_rowmajor_tiled( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -528,7 +528,7 @@ static void go_dpas_blockread_vnni( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { @@ -582,7 +582,7 @@ static void go_dpas_blockread_vnni_tiled( kernel.setArg(2, B); kernel.setArg(3, static_cast(K)); - queue.enqueueFillBuffer(C, 0, 0, C_ref.size()); + queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0])); float best = 999.0f; for (int test = 0; test < testIterations; test++) { diff --git a/samples/99_matrixexperiments/matrix_helpers.cl b/samples/99_matrixexperiments/matrix_helpers.cl index ef68fa4..04a63c5 100644 --- a/samples/99_matrixexperiments/matrix_helpers.cl +++ b/samples/99_matrixexperiments/matrix_helpers.cl @@ -464,19 +464,19 @@ uint8 intel_subgroup_block_read_u32_m8k16(const __global void* base_address, int return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord); } -void intel_subgroup_block_write_u32_m1k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint data) +void intel_subgroup_block_write_u32_m1k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint data) { __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data); } -void intel_subgroup_block_write_u32_m2k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data) +void intel_subgroup_block_write_u32_m2k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data) { __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data); } -void intel_subgroup_block_write_u32_m4k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data) +void intel_subgroup_block_write_u32_m4k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data) { __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data); } -void intel_subgroup_block_write_u32_m8k16v1(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data) +void intel_subgroup_block_write_u32_m8k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data) { __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data); } diff --git a/samples/99_matrixexperiments/matrix_kernel_tiled.cl b/samples/99_matrixexperiments/matrix_kernel_tiled.cl index b363b14..a01e0f2 100644 --- a/samples/99_matrixexperiments/matrix_kernel_tiled.cl +++ b/samples/99_matrixexperiments/matrix_kernel_tiled.cl @@ -188,7 +188,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN { const int tM = 8; const int tN = 16; - const int M = get_global_size(1) * tM; + const int M = get_global_size(1) * tM * MM; const int N = get_global_size(0) * NN; const int m = get_group_id(1) * tM * MM; const int n = get_group_id(0) * tN * NN; @@ -220,7 +220,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN for (int mm = 0; mm < MM; mm++) { for (int nn = 0; nn < NN; nn++) { - intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn])); + intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn])); } } } @@ -230,7 +230,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl { const int tM = 8; const int tN = 16; - const int M = get_global_size(1) * tM; + const int M = get_global_size(1) * tM * MM; const int N = get_global_size(0) * NN; const int m = get_group_id(1) * tM * MM; const int n = get_group_id(0) * tN * NN; @@ -262,7 +262,7 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl for (int mm = 0; mm < MM; mm++) { for (int nn = 0; nn < NN; nn++) { - intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn])); + intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[mm][nn])); } } } diff --git a/samples/99_matrixexperiments/matrix_kernels.cl b/samples/99_matrixexperiments/matrix_kernels.cl index a310db6..efc8f4f 100644 --- a/samples/99_matrixexperiments/matrix_kernels.cl +++ b/samples/99_matrixexperiments/matrix_kernels.cl @@ -366,7 +366,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m1_n16(global float* C, global usho sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m1k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum)); + intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -386,7 +386,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m2_n16(global float* C, global usho sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m2k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum)); + intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -406,7 +406,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m4_n16(global float* C, global usho sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m4k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum)); + intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -426,7 +426,7 @@ kernel void bfloat16_dpas_blockread_rowmajor_m8_n16(global float* C, global usho sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum)); + intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -446,7 +446,7 @@ kernel void bfloat16_dpas_blockread_vnni_m1_n16(global float* C, global ushort* sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m1k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum)); + intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -466,7 +466,7 @@ kernel void bfloat16_dpas_blockread_vnni_m2_n16(global float* C, global ushort* sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m2k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum)); + intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -486,7 +486,7 @@ kernel void bfloat16_dpas_blockread_vnni_m4_n16(global float* C, global ushort* sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m4k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum)); + intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum)); } __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -506,7 +506,7 @@ kernel void bfloat16_dpas_blockread_vnni_m8_n16(global float* C, global ushort* sum = mat_mul_sg16(aData, bData, sum); } - intel_subgroup_block_write_u32_m8k16v1(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum)); + intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum)); } #endif // cl_intel_subgroup_extended_block_read