From 2cd1cc3767665cc0aa39e9e0636bac8c293e67d6 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 8 Feb 2024 17:36:06 +0000 Subject: [PATCH] [EXP][Command-buffer] OpenCL kernel command update Implement the API for updating the kernel commands in a command-buffer defined by https://github.com/oneapi-src/unified-runtime/pull/1089 for the OpenCL adapter. This depends on support for the [cl_khr_command_buffer_mutable_dispatch](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#cl_khr_command_buffer_mutable_dispatch) extension. Tested on Intel GPU OpenCL implementations with the [command-buffer emulation layer](https://github.com/bashbaug/SimpleOpenCLSamples/tree/main/layers/10_cmdbufemu). ```bash $ OPENCL_LAYERS= ./bin/test-exp_command_buffer --platform="Intel(R) OpenCL Graphics" ``` --- source/adapters/opencl/command_buffer.cpp | 347 ++++++++++++++---- source/adapters/opencl/command_buffer.hpp | 83 ++++- source/adapters/opencl/common.cpp | 32 ++ source/adapters/opencl/common.hpp | 9 + source/adapters/opencl/device.cpp | 6 +- .../buffer_fill_kernel_update.cpp | 26 +- .../buffer_saxpy_kernel_update.cpp | 29 +- .../conformance/exp_command_buffer/fixtures.h | 4 + .../exp_command_buffer/invalid_update.cpp | 4 + .../exp_command_buffer/ndrange_update.cpp | 12 + .../usm_fill_kernel_update.cpp | 8 +- 11 files changed, 478 insertions(+), 82 deletions(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 88c661b4ae..9772bb4a23 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -11,9 +11,50 @@ #include "command_buffer.hpp" #include "common.hpp" +namespace { +ur_result_t +commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) { + if (CommandBuffer->decrementInternalReferenceCount() != 0) { + return UR_RESULT_SUCCESS; + } + + delete CommandBuffer; + return UR_RESULT_SUCCESS; +} + +ur_result_t 
+commandHandleReleaseInternal(ur_exp_command_buffer_command_handle_t Command) { + if (Command->decrementInternalReferenceCount() != 0) { + return UR_RESULT_SUCCESS; + } + + // Decrement parent command-buffer internal ref count + commandBufferReleaseInternal(Command->hCommandBuffer); + + delete Command; + return UR_RESULT_SUCCESS; +} +} // end anonymous namespace + +/// The ur_exp_command_buffer_handle_t_ destructor calls CL release +/// command-buffer to free the underlying object. +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { + urQueueRelease(hInternalQueue); + + cl_context CLContext = cl_adapter::cast(hContext); + cl_ext::clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = nullptr; + cl_int Res = + cl_ext::getExtFuncFromContext( + CLContext, cl_ext::ExtFuncPtrCache->clReleaseCommandBufferKHRCache, + cl_ext::ReleaseCommandBufferName, &clReleaseCommandBufferKHR); + assert(Res == CL_SUCCESS); + + clReleaseCommandBufferKHR(CLCommandBuffer); +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, - [[maybe_unused]] const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { ur_queue_handle_t Queue = nullptr; @@ -29,13 +70,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( if (!clCreateCommandBufferKHR || Res != CL_SUCCESS) return UR_RESULT_ERROR_INVALID_OPERATION; + const bool IsUpdatable = + pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; + + bool SupportsUpdate = false; + cl_device_id CLDevice = cl_adapter::cast(hDevice); + CL_RETURN_ON_FAILURE( + deviceSupportsURCommandBufferKernelUpdate(CLDevice, SupportsUpdate)); + + const bool Updatable = IsUpdatable && SupportsUpdate; + + cl_command_buffer_properties_khr Properties[3] = { + CL_COMMAND_BUFFER_FLAGS_KHR, + Updatable ? 
CL_COMMAND_BUFFER_MUTABLE_KHR : 0u, 0}; auto CLCommandBuffer = clCreateCommandBufferKHR( - 1, cl_adapter::cast(&Queue), nullptr, &Res); + 1, cl_adapter::cast(&Queue), Properties, &Res); CL_RETURN_ON_FAILURE_AND_SET_NULL(Res, phCommandBuffer); try { auto URCommandBuffer = std::make_unique( - Queue, hContext, CLCommandBuffer); + Queue, hContext, CLCommandBuffer, Updatable); *phCommandBuffer = URCommandBuffer.release(); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -47,38 +101,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - UR_RETURN_ON_FAILURE(urQueueRetain(hCommandBuffer->hInternalQueue)); - - cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); - cl_ext::clRetainCommandBufferKHR_fn clRetainCommandBuffer = nullptr; - cl_int Res = cl_ext::getExtFuncFromContext( - CLContext, cl_ext::ExtFuncPtrCache->clRetainCommandBufferKHRCache, - cl_ext::RetainCommandBufferName, &clRetainCommandBuffer); - - if (!clRetainCommandBuffer || Res != CL_SUCCESS) - return UR_RESULT_ERROR_INVALID_OPERATION; - - CL_RETURN_ON_FAILURE(clRetainCommandBuffer(hCommandBuffer->CLCommandBuffer)); + hCommandBuffer->incrementInternalReferenceCount(); + hCommandBuffer->incrementExternalReferenceCount(); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - UR_RETURN_ON_FAILURE(urQueueRelease(hCommandBuffer->hInternalQueue)); - - cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); - cl_ext::clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = nullptr; - cl_int Res = - cl_ext::getExtFuncFromContext( - CLContext, cl_ext::ExtFuncPtrCache->clReleaseCommandBufferKHRCache, - cl_ext::ReleaseCommandBufferName, &clReleaseCommandBufferKHR); - - if (!clReleaseCommandBufferKHR || Res != CL_SUCCESS) - return UR_RESULT_ERROR_INVALID_OPERATION; + if 
(hCommandBuffer->decrementExternalReferenceCount() == 0) { + // External ref count has reached zero, internal release of created + // commands. + for (auto Command : hCommandBuffer->CommandHandles) { + commandHandleReleaseInternal(Command); + } + } - CL_RETURN_ON_FAILURE( - clReleaseCommandBufferKHR(hCommandBuffer->CLCommandBuffer)); - return UR_RESULT_SUCCESS; + return commandBufferReleaseInternal(hCommandBuffer); } UR_APIEXPORT ur_result_t UR_APICALL @@ -95,6 +133,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { CL_RETURN_ON_FAILURE( clFinalizeCommandBufferKHR(hCommandBuffer->CLCommandBuffer)); + hCommandBuffer->IsFinalized = true; return UR_RESULT_SUCCESS; } @@ -105,7 +144,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, - ur_exp_command_buffer_command_handle_t *) { + ur_exp_command_buffer_command_handle_t *phCommandHandle) { cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); cl_ext::clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr; @@ -117,11 +156,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( if (!clCommandNDRangeKernelKHR || Res != CL_SUCCESS) return UR_RESULT_ERROR_INVALID_OPERATION; + cl_mutable_command_khr CommandHandle = nullptr; + cl_mutable_command_khr *OutCommandHandle = + hCommandBuffer->IsUpdatable ? &CommandHandle : nullptr; + + cl_ndrange_kernel_command_properties_khr UpdateProperties[] = { + CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR, + CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR | + CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | + CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR | + CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR, + 0}; + + cl_ndrange_kernel_command_properties_khr *Properties = + hCommandBuffer->IsUpdatable ? 
UpdateProperties : nullptr; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( - hCommandBuffer->CLCommandBuffer, nullptr, nullptr, + hCommandBuffer->CLCommandBuffer, nullptr, Properties, cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, nullptr)); + pSyncPointWaitList, pSyncPoint, OutCommandHandle)); + + try { + auto URCommandHandle = + std::make_unique( + hCommandBuffer, CommandHandle, workDim); + *phCommandHandle = URCommandHandle.release(); + hCommandBuffer->CommandHandles.push_back(*phCommandHandle); + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } return UR_RESULT_SUCCESS; } @@ -359,65 +422,213 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( - [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_exp_command_buffer_command_handle_t hCommand) { + hCommand->incrementExternalReferenceCount(); + hCommand->incrementInternalReferenceCount(); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( - [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_exp_command_buffer_command_handle_t hCommand) { + hCommand->decrementExternalReferenceCount(); + return commandHandleReleaseInternal(hCommand); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( - [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand, - [[maybe_unused]] const ur_exp_command_buffer_update_kernel_launch_desc_t + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t *pUpdateKernelLaunch) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( - ur_exp_command_buffer_handle_t hCommandBuffer, 
- ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { + ur_exp_command_buffer_handle_t hCommandBuffer = hCommand->hCommandBuffer; cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); - cl_ext::clGetCommandBufferInfoKHR_fn clGetCommandBufferInfoKHR = nullptr; + cl_ext::clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr; cl_int Res = - cl_ext::getExtFuncFromContext( - CLContext, cl_ext::ExtFuncPtrCache->clGetCommandBufferInfoKHRCache, - cl_ext::GetCommandBufferInfoName, &clGetCommandBufferInfoKHR); + cl_ext::getExtFuncFromContext( + CLContext, cl_ext::ExtFuncPtrCache->clUpdateMutableCommandsKHRCache, + cl_ext::UpdateMutableCommandsName, &clUpdateMutableCommandsKHR); - if (!clGetCommandBufferInfoKHR || Res != CL_SUCCESS) + if (!clUpdateMutableCommandsKHR || Res != CL_SUCCESS) return UR_RESULT_ERROR_INVALID_OPERATION; - if (propName != UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT) { - return UR_RESULT_ERROR_INVALID_ENUMERATION; + if (!hCommandBuffer->IsFinalized || !hCommandBuffer->IsUpdatable) + return UR_RESULT_ERROR_INVALID_OPERATION; + + // Find the CL execution info to update + const uint32_t NumExecInfos = pUpdateKernelLaunch->numNewExecInfos; + const ur_exp_command_buffer_update_exec_info_desc_t *ExecInfoList = + pUpdateKernelLaunch->pNewExecInfoList; + std::vector CLExecInfos; + for (uint32_t i = 0; i < NumExecInfos; i++) { + const ur_exp_command_buffer_update_exec_info_desc_t &URExecInfo = + ExecInfoList[i]; + + if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS) { + cl_bool TrueVal = CL_TRUE; + cl_mutable_dispatch_exec_info_khr CLExecInfo; + CLExecInfo.param_value_size = sizeof(cl_bool); + CLExecInfo.param_value = &TrueVal; + CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL; + CLExecInfos.push_back(CLExecInfo); + + CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL; + CLExecInfos.push_back(CLExecInfo); + + 
CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL; + CLExecInfos.push_back(CLExecInfo); + } else if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_PTRS) { + cl_mutable_dispatch_exec_info_khr CLExecInfo{}; + CLExecInfo.param_value_size = URExecInfo.propSize; + CLExecInfo.param_value = URExecInfo.pNewExecInfo; + CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL; + CLExecInfos.push_back(CLExecInfo); + } else if (URExecInfo.propName != UR_KERNEL_EXEC_INFO_CACHE_CONFIG) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } } - if (pPropSizeRet) { - *pPropSizeRet = sizeof(cl_uint); + // Find the CL USM pointer arguments to the kernel. + // WARNING - This relies on USM and SVM using the same implementation, + // which is not guaranteed. + // See https://github.com/KhronosGroup/OpenCL-Docs/issues/843 + const uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs; + const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList = + pUpdateKernelLaunch->pNewPointerArgList; + std::vector CLUSMArgs(NumPointerArgs); + for (uint32_t i = 0; i < NumPointerArgs; i++) { + const ur_exp_command_buffer_update_pointer_arg_desc_t &URPointerArg = + ArgPointerList[i]; + cl_mutable_dispatch_arg_khr &USMArg = CLUSMArgs[i]; + USMArg.arg_index = URPointerArg.argIndex; + USMArg.arg_value = *(void **)URPointerArg.pNewPointerArg; } - cl_uint ref_count; - CL_RETURN_ON_FAILURE(clGetCommandBufferInfoKHR( - hCommandBuffer->CLCommandBuffer, CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR, - sizeof(ref_count), &ref_count, nullptr)); + // Find the memory object and scalar arguments to the kernel. 
+ const uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs; + const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList = + pUpdateKernelLaunch->pNewMemObjArgList; + const uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs; + const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList = + pUpdateKernelLaunch->pNewValueArgList; + + std::vector CLArgs; + for (uint32_t i = 0; i < NumMemobjArgs; i++) { + const ur_exp_command_buffer_update_memobj_arg_desc_t &URMemObjArg = + ArgMemobjList[i]; + cl_mutable_dispatch_arg_khr CLArg{ + URMemObjArg.argIndex, // arg_index + sizeof(cl_mem), // arg_size + cl_adapter::cast( + &URMemObjArg.hNewMemObjArg) // arg_value + }; + + CLArgs.push_back(CLArg); + } + + for (uint32_t i = 0; i < NumValueArgs; i++) { + const ur_exp_command_buffer_update_value_arg_desc_t &URValueArg = + ArgValueList[i]; + cl_mutable_dispatch_arg_khr CLArg{ + URValueArg.argIndex, // arg_index + URValueArg.argSize, // arg_size + URValueArg.pNewValueArg // arg_value + }; + CLArgs.push_back(CLArg); + } + + const cl_uint NewWorkDim = pUpdateKernelLaunch->newWorkDim; + cl_uint &CLWorkDim = hCommand->WorkDim; + if (NewWorkDim != 0 && NewWorkDim != CLWorkDim) { + // The cl_khr_command_buffer_mutable_dispatch specification makes it an + // error to change the number of work dimensions of an ND-Range. + // https://github.com/KhronosGroup/OpenCL-Docs/issues/1057 + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + // Update the ND-Range configuration of the kernel. 
+ const size_t CopySize = sizeof(size_t) * CLWorkDim; + std::vector CLGlobalWorkOffset, CLGlobalWorkSize, CLLocalWorkSize; + + if (auto GlobalWorkOffsetPtr = pUpdateKernelLaunch->pNewGlobalWorkOffset) { + CLGlobalWorkOffset.resize(CLWorkDim); + std::memcpy(CLGlobalWorkOffset.data(), GlobalWorkOffsetPtr, CopySize); + if (CLWorkDim < 3) { + const size_t ZeroSize = sizeof(size_t) * (3 - CLWorkDim); + std::memset(CLGlobalWorkOffset.data() + CLWorkDim, 0, ZeroSize); + } + } + + if (auto GlobalWorkSizePtr = pUpdateKernelLaunch->pNewGlobalWorkSize) { + CLGlobalWorkSize.resize(CLWorkDim); + std::memcpy(CLGlobalWorkSize.data(), GlobalWorkSizePtr, CopySize); + if (CLWorkDim < 3) { + const size_t ZeroSize = sizeof(size_t) * (3 - CLWorkDim); + std::memset(CLGlobalWorkSize.data() + CLWorkDim, 0, ZeroSize); } - static_assert(sizeof(cl_uint) == sizeof(uint32_t)); - *static_cast(pPropValue) = static_cast(ref_count); } + if (auto LocalWorkSizePtr = pUpdateKernelLaunch->pNewLocalWorkSize) { + CLLocalWorkSize.resize(CLWorkDim); + std::memcpy(CLLocalWorkSize.data(), LocalWorkSizePtr, CopySize); + if (CLWorkDim < 3) { + const size_t ZeroSize = sizeof(size_t) * (3 - CLWorkDim); + std::memset(CLLocalWorkSize.data() + CLWorkDim, 0, ZeroSize); + } + } + + cl_mutable_command_khr command = + cl_adapter::cast(hCommand->CLMutableCommand); + cl_mutable_dispatch_config_khr dispatch_config = { + CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR, + nullptr, + command, + static_cast(CLArgs.size()), // num_args + static_cast(CLUSMArgs.size()), // num_svm_args + static_cast(CLExecInfos.size()), // num_exec_infos + CLWorkDim, // work_dim + CLArgs.data(), // arg_list + CLUSMArgs.data(), // arg_svm_list + CLExecInfos.data(), // exec_info_list + CLGlobalWorkOffset.data(), // global_work_offset + CLGlobalWorkSize.data(), // global_work_size + CLLocalWorkSize.data(), // local_work_size + }; + cl_mutable_base_config_khr config = { + CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1, &dispatch_config}; + 
CL_RETURN_ON_FAILURE( + clUpdateMutableCommandsKHR(hCommandBuffer->CLCommandBuffer, &config)); + return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( + ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: + return ReturnValue(hCommandBuffer->getExternalReferenceCount()); + default: + assert(!"Command-buffer info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( - [[maybe_unused]] ur_exp_command_buffer_command_handle_t hCommand, - [[maybe_unused]] ur_exp_command_buffer_command_info_t propName, - [[maybe_unused]] size_t propSize, [[maybe_unused]] void *pPropValue, - [[maybe_unused]] size_t *pPropSizeRet) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_exp_command_buffer_command_handle_t hCommand, + ur_exp_command_buffer_command_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT: + return ReturnValue(hCommand->getExternalReferenceCount()); + default: + assert(!"Command-buffer command info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index d80f29594b..8f3bdbb55b 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -11,14 +11,93 @@ #include #include +/// Handle to a kernel command. +struct ur_exp_command_buffer_command_handle_t_ { + /// Command-buffer this command belongs to. 
+ ur_exp_command_buffer_handle_t hCommandBuffer; + /// OpenCL command-handle. + cl_mutable_command_khr CLMutableCommand; + /// Work-dimension the command was originally created with. + cl_uint WorkDim; + /// Internal & External reference counts. + /// We need to maintain these because in OpenCL a command-handle isn't + /// reference counted, but is tied to the lifetime of the parent + /// command-buffer. This is not the case in UR where a command-handle is + /// reference counted. + std::atomic_uint32_t RefCountInternal; + std::atomic_uint32_t RefCountExternal; + + ur_exp_command_buffer_command_handle_t_( + ur_exp_command_buffer_handle_t hCommandBuffer, + cl_mutable_command_khr CLMutableCommand, cl_uint WorkDim) + : hCommandBuffer(hCommandBuffer), CLMutableCommand(CLMutableCommand), + WorkDim(WorkDim), RefCountInternal(0), RefCountExternal(0) {} + + uint32_t incrementInternalReferenceCount() noexcept { + return ++RefCountInternal; + } + uint32_t decrementInternalReferenceCount() noexcept { + return --RefCountInternal; + } + + uint32_t incrementExternalReferenceCount() noexcept { + return ++RefCountExternal; + } + uint32_t decrementExternalReferenceCount() noexcept { + return --RefCountExternal; + } + uint32_t getExternalReferenceCount() const noexcept { + return RefCountExternal; + } +}; + +/// Handle to a command-buffer object. struct ur_exp_command_buffer_handle_t_ { + /// UR queue belonging to the command-buffer, required for OpenCL creation. ur_queue_handle_t hInternalQueue; + /// Context the command-buffer is created for. ur_context_handle_t hContext; + /// OpenCL command-buffer object. cl_command_buffer_khr CLCommandBuffer; + /// Set to true if the kernel commands in the command-buffer can be updated, + /// false otherwise + bool IsUpdatable; + /// Set to true if the command-buffer has been finalized, false otherwise + bool IsFinalized; + /// List of commands in the command-buffer. 
+ std::vector CommandHandles; + /// Internal & External reference counts of the command-buffer. We do this + /// manually rather than forward to the OpenCL retain/release APIs because + /// we also need to track the lifetimes of command handle objects, which + /// extend the lifetime of a UR command-buffer even if its reference + /// count is zero. + std::atomic_uint32_t RefCountInternal; + std::atomic_uint32_t RefCountExternal; ur_exp_command_buffer_handle_t_(ur_queue_handle_t hQueue, ur_context_handle_t hContext, - cl_command_buffer_khr CLCommandBuffer) + cl_command_buffer_khr CLCommandBuffer, + bool IsUpdatable) : hInternalQueue(hQueue), hContext(hContext), - CLCommandBuffer(CLCommandBuffer) {} + CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable), + IsFinalized(false), RefCountInternal(0), RefCountExternal(0) {} + + ~ur_exp_command_buffer_handle_t_(); + + uint32_t incrementInternalReferenceCount() noexcept { + return ++RefCountInternal; + } + uint32_t decrementInternalReferenceCount() noexcept { + return --RefCountInternal; + } + + uint32_t incrementExternalReferenceCount() noexcept { + return ++RefCountExternal; + } + uint32_t decrementExternalReferenceCount() noexcept { + return --RefCountExternal; + } + uint32_t getExternalReferenceCount() const noexcept { + return RefCountExternal; + } }; diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index 4fe8bed408..267c4fc705 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -101,3 +101,35 @@ ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle) { *NativeHandle = reinterpret_cast(URObj); return UR_RESULT_SUCCESS; } + +cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, + bool &Result) { + size_t ExtSize = 0; + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &ExtSize)); + + std::string ExtStr(ExtSize, '\0'); + CL_RETURN_ON_FAILURE(clGetDeviceInfo(Dev, 
CL_DEVICE_EXTENSIONS, ExtSize, + ExtStr.data(), nullptr)); + + std::string SupportedExtensions(ExtStr.c_str()); + if (ExtStr.find("cl_khr_command_buffer_mutable_dispatch") == + std::string::npos) { + Result = false; + return CL_SUCCESS; + } + + // All the CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR capabilities must + // be supported by a device for UR update. + cl_mutable_dispatch_fields_khr mutable_capabilities; + CL_RETURN_ON_FAILURE(clGetDeviceInfo( + Dev, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR, + sizeof(mutable_capabilities), &mutable_capabilities, nullptr)); + const cl_mutable_dispatch_fields_khr required_caps = + CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | + CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR | + CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR | + CL_MUTABLE_DISPATCH_EXEC_INFO_KHR; + Result = (mutable_capabilities & required_caps) == required_caps; + return CL_SUCCESS; +} diff --git a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 0667cd3d17..bdb94c3feb 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -215,6 +215,7 @@ CONSTFIX char CommandCopyBufferRectName[] = "clCommandCopyBufferRectKHR"; CONSTFIX char CommandFillBufferName[] = "clCommandFillBufferKHR"; CONSTFIX char EnqueueCommandBufferName[] = "clEnqueueCommandBufferKHR"; CONSTFIX char GetCommandBufferInfoName[] = "clGetCommandBufferInfoKHR"; +CONSTFIX char UpdateMutableCommandsName[] = "clUpdateMutableCommandsKHR"; #undef CONSTFIX @@ -305,6 +306,10 @@ using clGetCommandBufferInfoKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( cl_command_buffer_khr command_buffer, cl_command_buffer_info_khr param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); +using clUpdateMutableCommandsKHR_fn = CL_API_ENTRY +cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr *mutable_config); + template struct FuncPtrCache { std::map Map; std::mutex Mutex; @@ -344,6 
+349,7 @@ struct ExtFuncPtrCacheT { FuncPtrCache clCommandFillBufferKHRCache; FuncPtrCache clEnqueueCommandBufferKHRCache; FuncPtrCache clGetCommandBufferInfoKHRCache; + FuncPtrCache clUpdateMutableCommandsKHRCache; }; // A raw pointer is used here since the lifetime of this map has to be tied to // piTeardown to avoid issues with static destruction order (a user application @@ -414,3 +420,6 @@ static ur_result_t getExtFuncFromContext(cl_context Context, ur_result_t mapCLErrorToUR(cl_int Result); ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle); + +cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, + bool &Result); diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 115b9b2e09..7792839d29 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -967,7 +967,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, std::string::npos); } case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - return ReturnValue(false); + cl_device_id Dev = cl_adapter::cast(hDevice); + bool Supported = false; + CL_RETURN_ON_FAILURE( + deviceSupportsURCommandBufferKernelUpdate(Dev, Supported)); + return ReturnValue(Supported); } default: { return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp index ea5295dc6b..6bb1b51568 100644 --- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp @@ -14,10 +14,30 @@ struct BufferFillCommandTest UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); - // First argument is buffer to fill (will also be hidden accessor arg) - AddBuffer1DArg(sizeof(val) * global_size, &buffer); + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + sizeof(val) * 
global_size, nullptr, + &buffer)); + + // TODO - Enable single code path after https://github.com/oneapi-src/unified-runtime/pull/1176 + // is merged + if (backend != UR_PLATFORM_BACKEND_OPENCL) { + // First argument is buffer to fill + ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer)); + } else { + // First argument is buffer to fill + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(buffer), + nullptr, &buffer)); + } + // second arg is hidden accessor + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(accessor), nullptr, + &accessor)); + // Second argument is scalar to fill with. - AddPodArg(val); + ASSERT_SUCCESS( + urKernelSetArgValue(kernel, 2, sizeof(val), nullptr, &val)); // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( diff --git a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp index 879b3a9bc6..9f0ad92f14 100644 --- a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp @@ -29,8 +29,17 @@ struct BufferSaxpyKernelTest 0, nullptr, nullptr)); } - // Index 0 is output buffer - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffers[0])); + // TODO: Enable single code path once https://github.com/oneapi-src/unified-runtime/pull/1176 + // is merged + if (backend != UR_PLATFORM_BACKEND_OPENCL) { + // Index 0 is output buffer + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, 0, nullptr, buffers[0])); + } else { + // Index 0 is output buffer + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, 0, sizeof(ur_mem_handle_t), nullptr, &buffers[0])); + } // Index 1 is output accessor struct { size_t offsets[1] = {0}; @@ -41,13 +50,25 @@ struct BufferSaxpyKernelTest // Index 2 is A ASSERT_SUCCESS(urKernelSetArgValue(kernel, 2, sizeof(A), nullptr, &A)); 
// Index 3 is X buffer - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 3, nullptr, buffers[1])); + if (backend != UR_PLATFORM_BACKEND_OPENCL) { + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, 3, nullptr, buffers[1])); + } else { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, 3, sizeof(ur_mem_handle_t), nullptr, &buffers[1])); + } // Index 4 is X buffer accessor ASSERT_SUCCESS(urKernelSetArgValue(kernel, 4, sizeof(accessor), nullptr, &accessor)); // Index 5 is Y buffer - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 5, nullptr, buffers[2])); + if (backend != UR_PLATFORM_BACKEND_OPENCL) { + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, 5, nullptr, buffers[2])); + } else { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, 5, sizeof(ur_mem_handle_t), nullptr, &buffers[2])); + } // Index 6 is Y buffer accessor ASSERT_SUCCESS(urKernelSetArgValue(kernel, 6, sizeof(accessor), nullptr, diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 4e9bff35f9..cbf441db2f 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -59,6 +59,9 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + size_t returned_size; ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, nullptr, &returned_size)); @@ -97,6 +100,7 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; ur_bool_t updatable_command_buffer_support = false; + ur_platform_backend_t backend{}; }; struct urUpdatableCommandBufferExpExecutionTest diff --git a/test/conformance/exp_command_buffer/invalid_update.cpp b/test/conformance/exp_command_buffer/invalid_update.cpp index 00cf04ea85..dbf0534437 100644 --- 
a/test/conformance/exp_command_buffer/invalid_update.cpp +++ b/test/conformance/exp_command_buffer/invalid_update.cpp @@ -41,6 +41,10 @@ struct InvalidUpdateTest } void TearDown() override { + // Work around an issue with the OpenCL adapter implementing urUSMFree + // using a blocking free, which hangs + EXPECT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + if (shared_ptr) { EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); } diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/ndrange_update.cpp index e5631f9176..bd3781c4a4 100644 --- a/test/conformance/exp_command_buffer/ndrange_update.cpp +++ b/test/conformance/exp_command_buffer/ndrange_update.cpp @@ -155,6 +155,12 @@ TEST_P(NDRangeUpdateTest, Update3D) { // Update the kernel work dimensions to 2, and update global size, local size, // and global offset to new values. TEST_P(NDRangeUpdateTest, Update2D) { + if (backend == UR_PLATFORM_BACKEND_OPENCL) { + // OpenCL cl_khr_command_buffer_mutable_dispatch does not support + // updating the work dimension. + GTEST_SKIP(); + } + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -205,6 +211,12 @@ TEST_P(NDRangeUpdateTest, Update2D) { // Update the kernel work dimensions to 1, and check that previously // set global size, local size, and global offset update accordingly. TEST_P(NDRangeUpdateTest, Update1D) { + if (backend == UR_PLATFORM_BACKEND_OPENCL) { + // OpenCL cl_khr_command_buffer_mutable_dispatch does not support + // updating the work dimension. 
+ GTEST_SKIP(); + } + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp index 7e6cab6ee3..20e2bf9c16 100644 --- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp @@ -154,7 +154,7 @@ TEST_P(USMFillCommandTest, UpdateExecInfo) { ur_exp_command_buffer_update_exec_info_desc_t new_exec_info_descs[3]; // Update direct access flag - bool indirect_access = false; + bool indirect_access = true; new_exec_info_descs[0] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_EXEC_INFO_DESC, // stype nullptr, // pNext @@ -179,14 +179,14 @@ TEST_P(USMFillCommandTest, UpdateExecInfo) { ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, allocation_size, &new_shared_ptr)); ASSERT_NE(new_shared_ptr, nullptr); - void *pointers = {new_shared_ptr}; + void *pointers[1] = {new_shared_ptr}; new_exec_info_descs[2] = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_EXEC_INFO_DESC, // stype nullptr, // pNext UR_KERNEL_EXEC_INFO_USM_PTRS, // propName sizeof(pointers), // propSize - nullptr, // pProperties - &pointers, // pPropValue + nullptr, // pProperties + pointers, // pPropValue }; ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {