Skip to content

Commit

Permalink
[EXP][Command-buffer] OpenCL kernel command update
Browse files Browse the repository at this point in the history
Implement, for the OpenCL adapter, the API for updating kernel commands in a
command-buffer, as defined by oneapi-src#1089.

This depends on support for the
[cl_khr_command_buffer_mutable_dispatch](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#cl_khr_command_buffer_mutable_dispatch)
extension.

Tested on Intel GPU OpenCL implementations with the
[command-buffer emulation
layer](https://github.com/bashbaug/SimpleOpenCLSamples/tree/main/layers/10_cmdbufemu).
  • Loading branch information
EwanC committed Feb 13, 2024
1 parent 92e154b commit ce980d4
Show file tree
Hide file tree
Showing 11 changed files with 362 additions and 28 deletions.
235 changes: 221 additions & 14 deletions source/adapters/opencl/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
[[maybe_unused]] const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
ur_exp_command_buffer_handle_t *phCommandBuffer) {

ur_queue_handle_t Queue = nullptr;
Expand All @@ -29,13 +29,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
if (!clCreateCommandBufferKHR || Res != CL_SUCCESS)
return UR_RESULT_ERROR_INVALID_OPERATION;

bool IsUpdatable =
pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false;

bool SupportsUpdate = false;
cl_device_id CLDevice = cl_adapter::cast<cl_device_id>(hDevice);
CL_RETURN_ON_FAILURE(
deviceSupportsURCommandBufferKernelUpdate(CLDevice, SupportsUpdate));

bool Updatable = IsUpdatable && SupportsUpdate;

cl_command_buffer_properties_khr Properties[3] = {
CL_COMMAND_BUFFER_FLAGS_KHR,
Updatable ? CL_COMMAND_BUFFER_MUTABLE_KHR : 0u, 0};
auto CLCommandBuffer = clCreateCommandBufferKHR(
1, cl_adapter::cast<cl_command_queue *>(&Queue), nullptr, &Res);
1, cl_adapter::cast<cl_command_queue *>(&Queue), Properties, &Res);
CL_RETURN_ON_FAILURE_AND_SET_NULL(Res, phCommandBuffer);

try {
auto URCommandBuffer = std::make_unique<ur_exp_command_buffer_handle_t_>(
Queue, hContext, CLCommandBuffer);
Queue, hContext, CLCommandBuffer, Updatable);
*phCommandBuffer = URCommandBuffer.release();
} catch (...) {
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
Expand Down Expand Up @@ -95,6 +108,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {

CL_RETURN_ON_FAILURE(
clFinalizeCommandBufferKHR(hCommandBuffer->CLCommandBuffer));
hCommandBuffer->Finalized = true;
return UR_RESULT_SUCCESS;
}

Expand All @@ -105,7 +119,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint,
ur_exp_command_buffer_command_handle_t *) {
ur_exp_command_buffer_command_handle_t *phCommandHandle) {

cl_context CLContext = cl_adapter::cast<cl_context>(hCommandBuffer->hContext);
cl_ext::clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr;
Expand All @@ -117,11 +131,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
if (!clCommandNDRangeKernelKHR || Res != CL_SUCCESS)
return UR_RESULT_ERROR_INVALID_OPERATION;

cl_mutable_command_khr CommandHandle = nullptr;
cl_mutable_command_khr *OutCommandHandle =
hCommandBuffer->Updatable ? &CommandHandle : nullptr;

cl_ndrange_kernel_command_properties_khr UpdateProperties[] = {
CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR |
CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR |
CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR |
CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR,
0};

cl_ndrange_kernel_command_properties_khr *Properties =
hCommandBuffer->Updatable ? UpdateProperties : nullptr;
CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR(
hCommandBuffer->CLCommandBuffer, nullptr, nullptr,
hCommandBuffer->CLCommandBuffer, nullptr, Properties,
cl_adapter::cast<cl_kernel>(hKernel), workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList,
pSyncPointWaitList, pSyncPoint, nullptr));
pSyncPointWaitList, pSyncPoint, OutCommandHandle));

try {
auto URCommandHandle =
std::make_unique<ur_exp_command_buffer_command_handle_t_>(
hCommandBuffer, CommandHandle, workDim);
*phCommandHandle = URCommandHandle.release();
hCommandBuffer->CommandHandles.push_back(*phCommandHandle);
} catch (...) {
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -360,19 +398,180 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(

/// Increments the reference count of a command-buffer command handle.
///
/// @param hCommand Command handle to retain. It is dereferenced without a
///                 null check.
/// @return UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp(
    ur_exp_command_buffer_command_handle_t hCommand) {
  hCommand->incrementReferenceCount();
  return UR_RESULT_SUCCESS;
}

/// Decrements the reference count of a command-buffer command handle.
///
/// @param hCommand Command handle to release. It is dereferenced without a
///                 null check.
/// @return UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
    ur_exp_command_buffer_command_handle_t hCommand) {
  if (hCommand->decrementReferenceCount() == 0) {
    // TODO: destroy the handle once its ownership is settled. The owning
    // command-buffer also stores this pointer in its CommandHandles list
    // (see urCommandBufferAppendKernelLaunchExp), so freeing it here without
    // removing that entry would leave a dangling pointer behind.
  }
  return UR_RESULT_SUCCESS;
}

/// Updates the configuration of a kernel command previously appended to an
/// updatable, finalized command-buffer.
///
/// The UR update descriptor is translated into a
/// cl_mutable_dispatch_config_khr and submitted with
/// clUpdateMutableCommandsKHR.
///
/// @param hCommand            Handle of the kernel command to update.
/// @param pUpdateKernelLaunch New exec-info, argument and ND-range values.
/// @return UR_RESULT_SUCCESS on success;
///         UR_RESULT_ERROR_INVALID_OPERATION if clUpdateMutableCommandsKHR is
///         unavailable, or the command-buffer is not finalized or not
///         updatable;
///         UR_RESULT_ERROR_INVALID_ENUMERATION for an unrecognised exec-info
///         property;
///         UR_RESULT_ERROR_UNSUPPORTED_FEATURE if a different number of
///         work dimensions is requested;
///         otherwise whatever CL_RETURN_ON_FAILURE yields for a failing
///         clUpdateMutableCommandsKHR call.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
    ur_exp_command_buffer_command_handle_t hCommand,
    const ur_exp_command_buffer_update_kernel_launch_desc_t
        *pUpdateKernelLaunch) {

  ur_exp_command_buffer_handle_t hCommandBuffer = hCommand->hCommandBuffer;
  cl_context CLContext = cl_adapter::cast<cl_context>(hCommandBuffer->hContext);
  cl_ext::clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
  cl_int Res =
      cl_ext::getExtFuncFromContext<decltype(clUpdateMutableCommandsKHR)>(
          CLContext, cl_ext::ExtFuncPtrCache->clUpdateMutableCommandsKHRCache,
          cl_ext::UpdateMutableCommandsName, &clUpdateMutableCommandsKHR);

  if (!clUpdateMutableCommandsKHR || Res != CL_SUCCESS)
    return UR_RESULT_ERROR_INVALID_OPERATION;

  if (!hCommandBuffer->Finalized || !hCommandBuffer->Updatable)
    return UR_RESULT_ERROR_INVALID_OPERATION;

  // Find the CL execution info to update.
  //
  // TrueVal is declared at function scope on purpose: its address is stored
  // in CLExecInfos and only read by clUpdateMutableCommandsKHR at the end of
  // this function, so it must not go out of scope before then.
  // NOTE(review): the indirect-access flags are always set to CL_TRUE here,
  // regardless of the value behind pNewExecInfo - confirm this is intended.
  const cl_bool TrueVal = CL_TRUE;
  const uint32_t NumExecInfos = pUpdateKernelLaunch->numNewExecInfos;
  const ur_exp_command_buffer_update_exec_info_desc_t *ExecInfoList =
      pUpdateKernelLaunch->pNewExecInfoList;
  std::vector<cl_mutable_dispatch_exec_info_khr> CLExecInfos;
  for (uint32_t i = 0; i < NumExecInfos; i++) {
    const ur_exp_command_buffer_update_exec_info_desc_t &URExecInfo =
        ExecInfoList[i];

    if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS) {
      // One UR property expands to the three Intel indirect-access flags.
      cl_mutable_dispatch_exec_info_khr CLExecInfo{};
      CLExecInfo.param_value_size = sizeof(cl_bool);
      CLExecInfo.param_value = &TrueVal;
      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);

      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);

      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL;
      CLExecInfos.push_back(CLExecInfo);
    } else if (URExecInfo.propName == UR_KERNEL_EXEC_INFO_USM_PTRS) {
      cl_mutable_dispatch_exec_info_khr CLExecInfo{};
      CLExecInfo.param_value_size = URExecInfo.propSize;
      CLExecInfo.param_value = URExecInfo.pNewExecInfo;
      CLExecInfo.param_name = CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL;
      CLExecInfos.push_back(CLExecInfo);
    } else if (URExecInfo.propName != UR_KERNEL_EXEC_INFO_CACHE_CONFIG) {
      // UR_KERNEL_EXEC_INFO_CACHE_CONFIG is accepted but not forwarded;
      // anything else is rejected.
      return UR_RESULT_ERROR_INVALID_ENUMERATION;
    }
  }

  // Find the CL USM pointer arguments to the kernel.
  // WARNING - This relies on USM and SVM using the same implementation,
  // which is not guaranteed.
  // See https://github.com/KhronosGroup/OpenCL-Docs/issues/843
  const uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs;
  const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList =
      pUpdateKernelLaunch->pNewPointerArgList;
  // Value-initialised, so arg_size stays 0 for every USM entry.
  std::vector<cl_mutable_dispatch_arg_khr> CLUSMArgs(NumPointerArgs);
  for (uint32_t i = 0; i < NumPointerArgs; i++) {
    const ur_exp_command_buffer_update_pointer_arg_desc_t &URPointerArg =
        ArgPointerList[i];
    cl_mutable_dispatch_arg_khr &USMArg = CLUSMArgs[i];
    USMArg.arg_index = URPointerArg.argIndex;
    // pNewPointerArg points at the USM pointer; pass the pointer itself.
    USMArg.arg_value =
        *static_cast<void *const *>(URPointerArg.pNewPointerArg);
  }

  // Memory-object and by-value arguments share one CL argument list.
  const uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs;
  const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList =
      pUpdateKernelLaunch->pNewMemObjArgList;
  const uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs;
  const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList =
      pUpdateKernelLaunch->pNewValueArgList;

  std::vector<cl_mutable_dispatch_arg_khr> CLArgs;
  CLArgs.reserve(static_cast<size_t>(NumMemobjArgs) + NumValueArgs);
  for (uint32_t i = 0; i < NumMemobjArgs; i++) {
    const ur_exp_command_buffer_update_memobj_arg_desc_t &URMemObjArg =
        ArgMemobjList[i];
    cl_mutable_dispatch_arg_khr CLArg{
        URMemObjArg.argIndex, // arg_index
        sizeof(cl_mem),       // arg_size
        cl_adapter::cast<const cl_mem *>(
            &URMemObjArg.hNewMemObjArg) // arg_value
    };
    CLArgs.push_back(CLArg);
  }

  for (uint32_t i = 0; i < NumValueArgs; i++) {
    const ur_exp_command_buffer_update_value_arg_desc_t &URValueArg =
        ArgValueList[i];
    cl_mutable_dispatch_arg_khr CLArg{
        URValueArg.argIndex,    // arg_index
        URValueArg.argSize,     // arg_size
        URValueArg.pNewValueArg // arg_value
    };
    CLArgs.push_back(CLArg);
  }

  const cl_uint NewWorkDim = pUpdateKernelLaunch->newWorkDim;
  const cl_uint CLWorkDim = hCommand->WorkDim;
  if (NewWorkDim != 0 && NewWorkDim != CLWorkDim) {
    // Limitation of the cl_khr_command_buffer_mutable_dispatch specification
    // that it is an error to change the number of ND-range dimensions.
    // https://github.com/KhronosGroup/OpenCL-Docs/issues/1057
    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  }

  // Each ND-range array that is supplied is copied into a zero-filled buffer
  // of at least three elements. The buffer is sized before the copy so the
  // zero padding never lands outside the allocation.
  const size_t CopySize = sizeof(size_t) * CLWorkDim;
  const size_t PaddedDims =
      CLWorkDim < 3 ? size_t{3} : static_cast<size_t>(CLWorkDim);
  const auto CopyPadded = [CopySize, PaddedDims](const size_t *Src,
                                                 std::vector<size_t> &Dst) {
    if (!Src)
      return; // Not being updated: leave Dst empty.
    Dst.assign(PaddedDims, 0);
    std::memcpy(Dst.data(), Src, CopySize);
  };

  std::vector<size_t> CLGlobalWorkOffset, CLGlobalWorkSize, CLLocalWorkSize;
  CopyPadded(pUpdateKernelLaunch->pNewGlobalWorkOffset, CLGlobalWorkOffset);
  CopyPadded(pUpdateKernelLaunch->pNewGlobalWorkSize, CLGlobalWorkSize);
  CopyPadded(pUpdateKernelLaunch->pNewLocalWorkSize, CLLocalWorkSize);

  // std::vector::data() on an empty vector is not guaranteed to be null, so
  // map "nothing to update" to an explicit nullptr.
  const auto DataOrNull = [](auto &Vec) {
    return Vec.empty() ? nullptr : Vec.data();
  };

  cl_mutable_command_khr command =
      cl_adapter::cast<cl_mutable_command_khr>(hCommand->CLMutableCommand);
  cl_mutable_dispatch_config_khr dispatch_config = {
      CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR,
      nullptr,
      command,
      static_cast<cl_uint>(CLArgs.size()),      // num_args
      static_cast<cl_uint>(CLUSMArgs.size()),   // num_svm_args
      static_cast<cl_uint>(CLExecInfos.size()), // num_exec_infos
      CLWorkDim,                                // work_dim
      DataOrNull(CLArgs),                       // arg_list
      DataOrNull(CLUSMArgs),                    // arg_svm_list
      DataOrNull(CLExecInfos),                  // exec_info_list
      DataOrNull(CLGlobalWorkOffset),           // global_work_offset
      DataOrNull(CLGlobalWorkSize),             // global_work_size
      DataOrNull(CLLocalWorkSize),              // local_work_size
  };
  cl_mutable_base_config_khr config = {
      CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1, &dispatch_config};
  CL_RETURN_ON_FAILURE(
      clUpdateMutableCommandsKHR(hCommandBuffer->CLCommandBuffer, &config));

  return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp(
Expand Down Expand Up @@ -415,9 +614,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp(
}

/// Queries a property of a command-buffer command handle.
///
/// Only UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT is implemented.
///
/// @param hCommand     Command handle to query.
/// @param propName     Property being requested.
/// @param propSize     Size in bytes of the buffer behind pPropValue.
/// @param pPropValue   Destination buffer, forwarded to UrReturnHelper.
/// @param pPropSizeRet Destination for the property size, forwarded to
///                     UrReturnHelper.
/// @return The result of UrReturnHelper for the reference count; for any
///         other property an assertion fires when assertions are enabled,
///         otherwise UR_RESULT_ERROR_INVALID_ENUMERATION is returned.
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp(
    ur_exp_command_buffer_command_handle_t hCommand,
    ur_exp_command_buffer_command_info_t propName, size_t propSize,
    void *pPropValue, size_t *pPropSizeRet) {
  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);

  switch (propName) {
  case UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT:
    return ReturnValue(hCommand->getReferenceCount());
  default:
    assert(!"Command-buffer command info request not implemented");
  }

  // Reached only when the assertion above is compiled out.
  return UR_RESULT_ERROR_INVALID_ENUMERATION;
}
27 changes: 25 additions & 2 deletions source/adapters/opencl/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,37 @@
#include <CL/cl_ext.h>
#include <ur/ur.hpp>

// Handle to a kernel command.
struct ur_exp_command_buffer_command_handle_t_ {
ur_exp_command_buffer_handle_t hCommandBuffer;
cl_mutable_command_khr CLMutableCommand;
cl_uint WorkDim;
std::atomic_uint32_t RefCount;

ur_exp_command_buffer_command_handle_t_(
ur_exp_command_buffer_handle_t hCommandBuffer,
cl_mutable_command_khr CLMutableCommand, cl_uint WorkDim)
: hCommandBuffer(hCommandBuffer), CLMutableCommand(CLMutableCommand),
WorkDim(WorkDim), RefCount{0} {}

uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
uint32_t getReferenceCount() const noexcept { return RefCount; }
};

// State tracked by the OpenCL adapter for one UR command-buffer.
struct ur_exp_command_buffer_handle_t_ {
  // Queue passed to the constructor as hQueue.
  ur_queue_handle_t hInternalQueue;
  // Context the command-buffer belongs to.
  ur_context_handle_t hContext;
  // Underlying OpenCL command-buffer object.
  cl_command_buffer_khr CLCommandBuffer;
  // True when the command-buffer was created so that its kernel commands
  // can be updated.
  bool Updatable;
  // Set once the command-buffer has been finalized; starts out false.
  bool Finalized;
  // Command handles created for kernel commands appended to this buffer.
  std::vector<ur_exp_command_buffer_command_handle_t> CommandHandles;

  ur_exp_command_buffer_handle_t_(ur_queue_handle_t hQueue,
                                  ur_context_handle_t hContext,
                                  cl_command_buffer_khr CLCommandBuffer,
                                  bool Updatable)
      : hInternalQueue(hQueue), hContext(hContext),
        CLCommandBuffer(CLCommandBuffer), Updatable(Updatable),
        Finalized(false) {}
};
30 changes: 30 additions & 0 deletions source/adapters/opencl/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,33 @@ ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle) {
*NativeHandle = reinterpret_cast<ur_native_handle_t>(URObj);
return UR_RESULT_SUCCESS;
}

/// Reports whether a device can update every kernel-command field used by
/// the UR command-buffer update entry point.
///
/// The device must list cl_khr_command_buffer_mutable_dispatch among its
/// extensions and report the arguments, global offset, global size, local
/// size and exec-info mutable-dispatch capabilities.
///
/// Failures are returned as the raw cl_int status of the failing
/// clGetDeviceInfo call, matching the declared return type; the caller in
/// urCommandBufferCreateExp passes this value to CL_RETURN_ON_FAILURE.
///
/// @param Dev    Device to query.
/// @param Result Set to true or false on success; left untouched on failure.
/// @return CL_SUCCESS, or the error returned by clGetDeviceInfo.
cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev,
                                                 bool &Result) {
  // First call obtains the size of the extension string.
  size_t ExtSize = 0;
  cl_int Err =
      clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &ExtSize);
  if (Err != CL_SUCCESS)
    return Err;

  // Second call fills a buffer of exactly that size.
  std::string ExtStr(ExtSize, '\0');
  Err = clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, ExtSize, ExtStr.data(),
                        nullptr);
  if (Err != CL_SUCCESS)
    return Err;

  if (ExtStr.find("cl_khr_command_buffer_mutable_dispatch") ==
      std::string::npos) {
    Result = false;
    return CL_SUCCESS;
  }

  cl_mutable_dispatch_fields_khr mutable_capabilities = 0;
  Err = clGetDeviceInfo(Dev, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR,
                        sizeof(mutable_capabilities), &mutable_capabilities,
                        nullptr);
  if (Err != CL_SUCCESS)
    return Err;

  // Every one of these bits must be present for updates to be supported.
  const cl_mutable_dispatch_fields_khr required_caps =
      CL_MUTABLE_DISPATCH_ARGUMENTS_KHR |
      CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR |
      CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR |
      CL_MUTABLE_DISPATCH_EXEC_INFO_KHR;
  Result = (mutable_capabilities & required_caps) == required_caps;
  return CL_SUCCESS;
}
Loading

0 comments on commit ce980d4

Please sign in to comment.