diff --git a/samples/99_matrixexperiments/CMakeLists.txt b/samples/99_matrixexperiments/CMakeLists.txt
index 86599fb..9fe36d8 100644
--- a/samples/99_matrixexperiments/CMakeLists.txt
+++ b/samples/99_matrixexperiments/CMakeLists.txt
@@ -6,6 +6,6 @@ add_opencl_sample(
     TEST
     NUMBER 99
     TARGET matrixexperiments
-    VERSION 120
+    VERSION 200 # for clSetKernelExecInfo
     SOURCES main.cpp
     KERNELS matrix_helpers.cl matrix_kernels.cl matrix_kernel_tiled.cl)
diff --git a/samples/99_matrixexperiments/main.cpp b/samples/99_matrixexperiments/main.cpp
index d79ca8e..fae01b3 100644
--- a/samples/99_matrixexperiments/main.cpp
+++ b/samples/99_matrixexperiments/main.cpp
@@ -27,6 +27,7 @@ bool validate = false;
 bool emulate = false;
 bool wallclock = false;
 bool skipinit = false;
+bool roundRobin = false;
 int testIterations = 16;
 float threshold = 0.01f;
 
@@ -75,6 +76,30 @@ static size_t findMinSubGroupSize(cl::Device& device)
     return 0;
 }
 
+// Requests the round-robin thread arbitration policy for `kernel` by calling
+// clSetKernelExecInfo with exec info enums whose names carry the _INTEL
+// suffix and which are declared locally rather than taken from a header.
+// The status returned by the call is checked so that a rejected request is
+// reported as a warning instead of being dropped; it is never fatal.
+static void setRoundRobin(cl::Kernel& kernel)
+{
+    constexpr cl_kernel_exec_info CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_INTEL = 0x10025;
+    constexpr cl_uint CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL = 0x10023;
+    const cl_uint policy = CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL;
+    const cl_int errorCode = clSetKernelExecInfo(
+        kernel(),
+        CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_INTEL,
+        sizeof(policy),
+        &policy);
+    if (errorCode != CL_SUCCESS) {
+        // Without this check the roundrobin option could silently have no
+        // effect while the results are still reported as round-robin runs.
+        fprintf(stderr,
+            "Warning: could not set round robin thread arbitration (error %d).\n",
+            errorCode);
+    }
+}
+
 template
 static void fill_matrix(std::vector& M, size_t numRows, size_t numCols)
 {
@@ -440,6 +465,9 @@ static void bfloat16_dpas_blockread_rowmajor(
         kernel.setArg(1, A);
         kernel.setArg(2, B);
         kernel.setArg(3, static_cast(K));
+        if (roundRobin) {
+            setRoundRobin(kernel);
+        }
 
         if (!skipinit) {
             queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
@@ -496,6 +524,9 @@ static void bfloat16_dpas_blockread_rowmajor_tiled(
         kernel.setArg(1, A);
         kernel.setArg(2, B);
         kernel.setArg(3, static_cast(K));
+        if (roundRobin) {
+            setRoundRobin(kernel);
+        }
 
         if (!skipinit) {
             queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
@@ -546,6 +577,9 @@ static void bfloat16_dpas_blockread_vnni(
         kernel.setArg(1, A);
         kernel.setArg(2, B);
         kernel.setArg(3, static_cast(K));
+        if (roundRobin) {
+            setRoundRobin(kernel);
+        }
 
         if (!skipinit) {
             queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
@@ -602,6 +636,9 @@ static void bfloat16_dpas_blockread_vnni_tiled(
         kernel.setArg(1, A);
         kernel.setArg(2, B);
         kernel.setArg(3, static_cast(K));
+        if (roundRobin) {
+            setRoundRobin(kernel);
+        }
 
         if (!skipinit) {
             queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
@@ -658,6 +695,7 @@ int main(int argc, char** argv)
         op.add("", "emulate", "Unconditionally Emulate dpas", &emulate);
         op.add("", "wallclock", "Measure Wallclock Time", &wallclock);
         op.add("", "skipinit", "Do Not Initialize Buffers", &skipinit);
+        op.add("", "roundrobin", "Use Round Robin Scheduling", &roundRobin);
         op.add>("", "threshold", "Local Error Threshold", threshold, &threshold);
         op.add, popl::Attribute::advanced>("", "mask", "Test Mask", mask, &mask);
         bool printUsage = false;