Skip to content

Commit

Permalink
Merge pull request #1135 from nrspruit/multi_device_kernel_compilatio…
Browse files Browse the repository at this point in the history
…n_main

[L0] Add support for multi-device kernel compilation
  • Loading branch information
kbenzie authored Jan 23, 2024
2 parents 4f80080 + 1b2cd5b commit 7c58060
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 149 deletions.
111 changes: 78 additions & 33 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
*OutEvent ///< [in,out][optional] return an event object that identifies
///< this particular kernel execution instance.
) {
auto ZeDevice = Queue->Device->ZeDevice;

ze_kernel_handle_t ZeKernel{};
if (Kernel->ZeKernelMap.empty()) {
ZeKernel = Kernel->ZeKernel;
} else {
auto It = Kernel->ZeKernelMap.find(ZeDevice);
ZeKernel = It->second;
}
// Lock automatically releases when this goes out of scope.
std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
Expand All @@ -51,7 +60,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}

ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
(Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
(ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
GlobalWorkOffset[2]));
}

Expand All @@ -65,18 +74,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
Queue->Device));
}
ZE2UR_CALL(zeKernelSetArgumentValue,
(Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
(ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
}
Kernel->PendingArguments.clear();

ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
uint32_t WG[3]{};

// global_work_size of unused dimensions must be set to 1
UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1,
UR_RESULT_ERROR_INVALID_VALUE);
UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1,
UR_RESULT_ERROR_INVALID_VALUE);
if (WorkDim >= 2) {
UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1,
UR_RESULT_ERROR_INVALID_VALUE);
if (WorkDim == 3) {
UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1,
UR_RESULT_ERROR_INVALID_VALUE);
}
}
if (LocalWorkSize) {
// L0
UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(),
Expand All @@ -99,7 +112,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
if (SuggestGroupSize) {
ZE2UR_CALL(zeKernelSuggestGroupSize,
(Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
(ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
} else {
for (int I : {0, 1, 2}) {
Expand Down Expand Up @@ -175,7 +188,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}

ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2]));

bool UseCopyEngine = false;
_ur_ze_event_list_t TmpWaitList;
Expand Down Expand Up @@ -227,18 +240,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
Queue->CaptureIndirectAccesses();
// Add the command to the command list, which implies submission.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
ZeEvent, (*Event)->WaitList.Length,
(*Event)->WaitList.ZeEventList));
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
} else {
// Add the command to the command list for later submission.
// No lock is needed here, unlike the immediate commandlist case above,
// because the kernels are not actually submitted yet. Kernels will be
// submitted only when the commandlist is closed. Then, a lock is held.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
(CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions,
ZeEvent, (*Event)->WaitList.Length,
(*Event)->WaitList.ZeEventList));
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
}

urPrint("calling zeCommandListAppendLaunchKernel() with"
Expand Down Expand Up @@ -363,23 +374,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(
return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE;
}

ZeStruct<ze_kernel_desc_t> ZeKernelDesc;
ZeKernelDesc.flags = 0;
ZeKernelDesc.pKernelName = KernelName;

ze_kernel_handle_t ZeKernel;
ZE2UR_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel));

try {
ur_kernel_handle_t_ *UrKernel =
new ur_kernel_handle_t_(ZeKernel, true, Program);
ur_kernel_handle_t_ *UrKernel = new ur_kernel_handle_t_(true, Program);
*RetKernel = reinterpret_cast<ur_kernel_handle_t>(UrKernel);
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (...) {
return UR_RESULT_ERROR_UNKNOWN;
}

for (auto It : Program->ZeModuleMap) {
auto ZeModule = It.second;
ZeStruct<ze_kernel_desc_t> ZeKernelDesc;
ZeKernelDesc.flags = 0;
ZeKernelDesc.pKernelName = KernelName;

ze_kernel_handle_t ZeKernel;
ZE2UR_CALL(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel));

auto ZeDevice = It.first;

// Store the kernel in the ZeKernelMap so the correct
// kernel can be retrieved later for a specific device
// where a queue is being submitted.
(*RetKernel)->ZeKernelMap[ZeDevice] = ZeKernel;
(*RetKernel)->ZeKernels.push_back(ZeKernel);

// If the device used to create the module's kernel is a root-device
// then store the kernel also using the sub-devices, since application
// could submit the root-device's kernel to a sub-device's queue.
uint32_t SubDevicesCount = 0;
zeDeviceGetSubDevices(ZeDevice, &SubDevicesCount, nullptr);
std::vector<ze_device_handle_t> ZeSubDevices(SubDevicesCount);
zeDeviceGetSubDevices(ZeDevice, &SubDevicesCount, ZeSubDevices.data());
for (auto ZeSubDevice : ZeSubDevices) {
(*RetKernel)->ZeKernelMap[ZeSubDevice] = ZeKernel;
}
}

(*RetKernel)->ZeKernel = (*RetKernel)->ZeKernelMap.begin()->second;

UR_CALL((*RetKernel)->initialize());

return UR_RESULT_SUCCESS;
Expand All @@ -396,6 +430,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
) {
std::ignore = Properties;

UR_ASSERT(Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);

// OpenCL: "the arg_value pointer can be NULL or point to a NULL value
// in which case a NULL value will be used as the value for the argument
// declared as a pointer to global or constant memory in the kernel"
Expand All @@ -409,8 +445,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
}

std::scoped_lock<ur_shared_mutex> Guard(Kernel->Mutex);
ZE2UR_CALL(zeKernelSetArgumentValue,
(Kernel->ZeKernel, ArgIndex, ArgSize, PArgValue));
for (auto It : Kernel->ZeKernelMap) {
auto ZeKernel = It.second;
ZE2UR_CALL(zeKernelSetArgumentValue,
(ZeKernel, ArgIndex, ArgSize, PArgValue));
}

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -596,16 +635,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease(

auto KernelProgram = Kernel->Program;
if (Kernel->OwnNativeHandle) {
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
for (auto &ZeKernel : Kernel->ZeKernels) {
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
}
Kernel->ZeKernelMap.clear();
if (IndirectAccessTrackingEnabled) {
UR_CALL(urContextRelease(KernelProgram->Context));
}
// do a release on the program this kernel was part of
UR_CALL(urProgramRelease(KernelProgram));
// do a release on the program this kernel was part of without deleting the
// program handle
KernelProgram->ur_release_program_resources(false);

delete Kernel;

return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -639,6 +683,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(
std::ignore = PropSize;
std::ignore = Properties;

auto ZeKernel = Kernel->ZeKernel;
std::scoped_lock<ur_shared_mutex> Guard(Kernel->Mutex);
if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS &&
*(static_cast<const ur_bool_t *>(PropValue)) == true) {
Expand All @@ -649,7 +694,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(
ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST |
ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE |
ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags));
ZE2UR_CALL(zeKernelSetIndirectAccess, (ZeKernel, IndirectFlags));
} else if (PropName == UR_KERNEL_EXEC_INFO_CACHE_CONFIG) {
ze_cache_config_flag_t ZeCacheConfig{};
auto CacheConfig =
Expand All @@ -663,7 +708,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(
else
// Unexpected cache configuration value.
return UR_RESULT_ERROR_INVALID_VALUE;
ZE2UR_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig););
ZE2UR_CALL(zeKernelSetCacheConfig, (ZeKernel, ZeCacheConfig););
} else {
urPrint("urKernelSetExecInfo: unsupported ParamName\n");
return UR_RESULT_ERROR_INVALID_VALUE;
Expand Down
15 changes: 11 additions & 4 deletions source/adapters/level_zero/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@
#include <unordered_set>

struct ur_kernel_handle_t_ : _ur_object {
ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle,
ur_program_handle_t Program)
: Context{nullptr}, Program{Program}, ZeKernel{Kernel},
SubmissionsCount{0}, MemAllocs{} {
ur_kernel_handle_t_(bool OwnZeHandle, ur_program_handle_t Program)
: Program{Program}, SubmissionsCount{0}, MemAllocs{} {
OwnNativeHandle = OwnZeHandle;
}

Expand All @@ -37,6 +35,15 @@ struct ur_kernel_handle_t_ : _ur_object {
// Level Zero function handle.
ze_kernel_handle_t ZeKernel;

// Map of L0 kernels created for all the devices for which a UR Program
// has been built. It may contain duplicated kernel entries for a root
// device and its sub-devices.
std::unordered_map<ze_device_handle_t, ze_kernel_handle_t> ZeKernelMap;

// Vector of L0 kernels. Each entry is unique, so this is used for
// destroying the kernels instead of ZeKernelMap
std::vector<ze_kernel_handle_t> ZeKernels;

// Counter to track the number of submissions of the kernel.
// When this value is zero, it means that kernel is not submitted for an
// execution - at this time we can release memory allocations referenced by
Expand Down
Loading

0 comments on commit 7c58060

Please sign in to comment.