From 88682306b0c8658f258a39c96325e4271daa778d Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Tue, 28 Nov 2023 15:37:23 -0800 Subject: [PATCH 1/3] [L0] Add support for multi-device kernel compilation Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/kernel.cpp | 93 ++++++--- source/adapters/level_zero/kernel.hpp | 15 +- source/adapters/level_zero/program.cpp | 190 ++++++++++-------- source/adapters/level_zero/program.hpp | 9 + .../kernel/kernel_adapter_level_zero.match | 1 + 5 files changed, 195 insertions(+), 113 deletions(-) diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index dfa8915197..cea537b3fe 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -41,6 +41,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { + auto ZeDevice = Queue->Device->ZeDevice; + + ze_kernel_handle_t ZeKernel{}; + if (Kernel->ZeKernelMap.empty()) { + ZeKernel = Kernel->ZeKernel; + } else { + auto It = Kernel->ZeKernelMap.find(ZeDevice); + ZeKernel = It->second; + } // Lock automatically releases when this goes out of scope. std::scoped_lock Lock( Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); @@ -51,7 +60,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } ZE2UR_CALL(zeKernelSetGlobalOffsetExp, - (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + (ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], GlobalWorkOffset[2])); } @@ -65,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( Queue->Device)); } ZE2UR_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); } Kernel->PendingArguments.clear(); @@ -99,7 +108,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } if (SuggestGroupSize) { ZE2UR_CALL(zeKernelSuggestGroupSize, - (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], + (ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); } else { for (int I : {0, 1, 2}) { @@ -175,7 +184,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } - ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); bool UseCopyEngine = false; _ur_ze_event_list_t TmpWaitList; @@ -227,18 +236,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( Queue->CaptureIndirectAccesses(); // Add the command to the command list, which implies submission. ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); + (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, + (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, // because the kernels are not actually submitted yet. Kernels will be // submitted only when the comamndlist is closed. Then, a lock is held. ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); + (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, + (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } urPrint("calling zeCommandListAppendLaunchKernel() with" @@ -363,16 +370,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } - ZeStruct ZeKernelDesc; - ZeKernelDesc.flags = 0; - ZeKernelDesc.pKernelName = KernelName; - - ze_kernel_handle_t ZeKernel; - ZE2UR_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel)); - try { - ur_kernel_handle_t_ *UrKernel = - new ur_kernel_handle_t_(ZeKernel, true, Program); + ur_kernel_handle_t_ *UrKernel = new ur_kernel_handle_t_(true, Program); *RetKernel = reinterpret_cast(UrKernel); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -380,6 +379,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( return UR_RESULT_ERROR_UNKNOWN; } + for (auto It : Program->ZeModuleMap) { + auto ZeModule = It.second; + ZeStruct ZeKernelDesc; + ZeKernelDesc.flags = 0; + ZeKernelDesc.pKernelName = KernelName; + + ze_kernel_handle_t ZeKernel; + ZE2UR_CALL(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + + auto ZeDevice = It.first; + + // Store the kernel in the ZeKernelMap so the correct + // kernel can be retrieved later for a specific device + // where a queue is being submitted. + (*RetKernel)->ZeKernelMap[ZeDevice] = ZeKernel; + (*RetKernel)->ZeKernels.push_back(ZeKernel); + + // If the device used to create the module's kernel is a root-device + // then store the kernel also using the sub-devices, since application + // could submit the root-device's kernel to a sub-device's queue. + uint32_t SubDevicesCount = 0; + zeDeviceGetSubDevices(ZeDevice, &SubDevicesCount, nullptr); + std::vector ZeSubDevices(SubDevicesCount); + zeDeviceGetSubDevices(ZeDevice, &SubDevicesCount, ZeSubDevices.data()); + for (auto ZeSubDevice : ZeSubDevices) { + (*RetKernel)->ZeKernelMap[ZeSubDevice] = ZeKernel; + } + } + + (*RetKernel)->ZeKernel = (*RetKernel)->ZeKernelMap.begin()->second; + UR_CALL((*RetKernel)->initialize()); return UR_RESULT_SUCCESS; @@ -396,6 +426,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( ) { std::ignore = Properties; + UR_ASSERT(Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value // in which case a NULL value will be used as the value for the argument // declared as a pointer to global or constant memory in the kernel" @@ -409,8 +441,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( } std::scoped_lock Guard(Kernel->Mutex); - ZE2UR_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, ArgIndex, ArgSize, PArgValue)); + for (auto It : Kernel->ZeKernelMap) { + auto ZeKernel = It.second; + ZE2UR_CALL(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); + } return UR_RESULT_SUCCESS; } @@ -596,11 +631,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( auto KernelProgram = Kernel->Program; if (Kernel->OwnNativeHandle) { - auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return ze2urResult(ZeResult); + for (auto &ZeKernel : Kernel->ZeKernels) { + auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } } + Kernel->ZeKernelMap.clear(); if (IndirectAccessTrackingEnabled) { UR_CALL(urContextRelease(KernelProgram->Context)); } @@ -639,6 +677,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( std::ignore = PropSize; std::ignore = Properties; + auto ZeKernel = Kernel->ZeKernel; std::scoped_lock Guard(Kernel->Mutex); if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && *(static_cast(PropValue)) == true) { @@ -649,7 +688,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; - ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags)); + ZE2UR_CALL(zeKernelSetIndirectAccess, (ZeKernel, IndirectFlags)); } else if (PropName == UR_KERNEL_EXEC_INFO_CACHE_CONFIG) { ze_cache_config_flag_t ZeCacheConfig{}; auto CacheConfig = @@ -663,7 +702,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( else // Unexpected cache configuration value. return UR_RESULT_ERROR_INVALID_VALUE; - ZE2UR_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig);); + ZE2UR_CALL(zeKernelSetCacheConfig, (ZeKernel, ZeCacheConfig);); } else { urPrint("urKernelSetExecInfo: unsupported ParamName\n"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp index 4ef21ce18b..a6d85d2baa 100644 --- a/source/adapters/level_zero/kernel.hpp +++ b/source/adapters/level_zero/kernel.hpp @@ -14,10 +14,8 @@ #include struct ur_kernel_handle_t_ : _ur_object { - ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle, - ur_program_handle_t Program) - : Context{nullptr}, Program{Program}, ZeKernel{Kernel}, - SubmissionsCount{0}, MemAllocs{} { + ur_kernel_handle_t_(bool OwnZeHandle, ur_program_handle_t Program) + : Program{Program}, SubmissionsCount{0}, MemAllocs{} { OwnNativeHandle = OwnZeHandle; } @@ -37,6 +35,15 @@ struct ur_kernel_handle_t_ : _ur_object { // Level Zero function handle. ze_kernel_handle_t ZeKernel; + // Map of L0 kernels created for all the devices for which a UR Program + // has been built. It may contain duplicated kernel entries for a root + // device and its sub-devices. + std::unordered_map ZeKernelMap; + + // Vector of L0 kernels. Each entry is unique, so this is used for + // destroying the kernels instead of ZeKernelMap + std::vector ZeKernels; + // Counter to track the number of submissions of the kernel. // When this value is zero, it means that kernel is not submitted for an // execution - at this time we can release memory allocations referenced by diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index bb2d964422..636289e195 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -167,48 +167,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str(); ZeModuleDesc.pConstants = Shim.ze(); + ur_result_t Result = UR_RESULT_SUCCESS; - ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice; - ze_context_handle_t ZeContext = hProgram->Context->ZeContext; - std::ignore = numDevices; - ze_module_handle_t ZeModule = nullptr; + for (uint32_t i = 0; i < numDevices; i++) { + ze_device_handle_t ZeDevice = phDevices[i]->ZeDevice; + ze_context_handle_t ZeContext = hProgram->Context->ZeContext; + ze_module_handle_t ZeModuleHandle = nullptr; + ze_module_build_log_handle_t ZeBuildLog{}; - ur_result_t Result = UR_RESULT_SUCCESS; - hProgram->State = ur_program_handle_t_::Exe; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &hProgram->ZeBuildLog)); - if (ZeResult != ZE_RESULT_SUCCESS) { - // We adjust ur_program below to avoid attempting to release zeModule when - // RT calls urProgramRelease(). - hProgram->State = ur_program_handle_t_::Invalid; - Result = ze2urResult(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; - } - } else { - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with - // urProgramBuild are supposed to be fully linked and ready to use. - // Therefore, do an extra check now for unresolved symbols. - ZeResult = checkUnresolvedSymbols(ZeModule, &hProgram->ZeBuildLog); + hProgram->State = ur_program_handle_t_::Exe; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModuleHandle, &ZeBuildLog)); if (ZeResult != ZE_RESULT_SUCCESS) { + // We adjust ur_program below to avoid attempting to release zeModule when + // RT calls urProgramRelease(). hProgram->State = ur_program_handle_t_::Invalid; - Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) - ? UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE - : ze2urResult(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; + Result = ze2urResult(ZeResult); + if (ZeModuleHandle) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModuleHandle)); + ZeModuleHandle = nullptr; + } + } else { + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via + // a call to zeModuleDynamicLink. However, modules created with + // urProgramBuild are supposed to be fully linked and ready to use. + // Therefore, do an extra check now for unresolved symbols. + ZeResult = checkUnresolvedSymbols(ZeModuleHandle, &ZeBuildLog); + if (ZeResult != ZE_RESULT_SUCCESS) { + hProgram->State = ur_program_handle_t_::Invalid; + Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) + ? UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE + : ze2urResult(ZeResult); + if (ZeModuleHandle) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModuleHandle)); + ZeModuleHandle = nullptr; + } } + hProgram->ZeModuleMap.insert(std::make_pair(ZeDevice, ZeModuleHandle)); + hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } } // We no longer need the IL / native code. hProgram->Code.reset(); - hProgram->ZeModule = ZeModule; + if (!hProgram->ZeModuleMap.empty()) + hProgram->ZeModule = hProgram->ZeModuleMap.begin()->second; + if (!hProgram->ZeBuildLogMap.empty()) + hProgram->ZeBuildLog = hProgram->ZeBuildLogMap.begin()->second; return Result; } @@ -292,9 +299,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( ur_program_handle_t *phProgram ///< [out] pointer to handle of program object created. ) { - std::ignore = numDevices; - UR_ASSERT(hContext->isValidDevice(phDevices[0]), - UR_RESULT_ERROR_INVALID_DEVICE); + for (uint32_t i = 0; i < numDevices; i++) { + UR_ASSERT(hContext->isValidDevice(phDevices[i]), + UR_RESULT_ERROR_INVALID_DEVICE); + } // We do not support any link flags at this time because the Level Zero API // does not have any way to pass flags that are specific to linking. @@ -402,49 +410,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( return UR_RESULT_ERROR_INVALID_VALUE; } } - - // Call the Level Zero API to compile, link, and create the module. - ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice; - ze_context_handle_t ZeContext = hContext->ZeContext; - ze_module_handle_t ZeModule = nullptr; - ze_module_build_log_handle_t ZeBuildLog = nullptr; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &ZeBuildLog)); - - // We still create a ur_program_handle_t_ object even if there is a - // BUILD_FAILURE because we need the object to hold the ZeBuildLog. There - // is no build log created for other errors, so we don't create an object. - UrResult = ze2urResult(ZeResult); - if (ZeResult != ZE_RESULT_SUCCESS && - ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { - return ze2urResult(ZeResult); - } - - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with piProgramLink - // are supposed to be fully linked and ready to use. Therefore, do an extra - // check now for unresolved symbols. Note that we still create a - // ur_program_handle_t_ if there are unresolved symbols because the - // ZeBuildLog tells which symbols are unresolved. - if (ZeResult == ZE_RESULT_SUCCESS) { - ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); - if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { - UrResult = - UR_RESULT_ERROR_UNKNOWN; // TODO: - // UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; - } else if (ZeResult != ZE_RESULT_SUCCESS) { + std::unordered_map ZeModuleMap; + std::unordered_map + ZeBuildLogMap; + + for (uint32_t i = 0; i < numDevices; i++) { + + // Call the Level Zero API to compile, link, and create the module. + ze_device_handle_t ZeDevice = phDevices[i]->ZeDevice; + ze_context_handle_t ZeContext = hContext->ZeContext; + ze_module_handle_t ZeModule = nullptr; + ze_module_build_log_handle_t ZeBuildLog = nullptr; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModule, &ZeBuildLog)); + + // We still create a ur_program_handle_t_ object even if there is a + // BUILD_FAILURE because we need the object to hold the ZeBuildLog. There + // is no build log created for other errors, so we don't create an object. + UrResult = ze2urResult(ZeResult); + if (ZeResult != ZE_RESULT_SUCCESS && + ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { return ze2urResult(ZeResult); } + + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via + // a call to zeModuleDynamicLink. However, modules created with + // piProgramLink are supposed to be fully linked and ready to use. + // Therefore, do an extra check now for unresolved symbols. Note that we + // still create a ur_program_handle_t_ if there are unresolved symbols + // because the ZeBuildLog tells which symbols are unresolved. + if (ZeResult == ZE_RESULT_SUCCESS) { + ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); + if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { + UrResult = + UR_RESULT_ERROR_UNKNOWN; // TODO: + // UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + ZeModuleMap.insert(std::make_pair(ZeDevice, ZeModule)); + ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } ur_program_handle_t_::state State = (UrResult == UR_RESULT_SUCCESS) ? ur_program_handle_t_::Exe : ur_program_handle_t_::Invalid; ur_program_handle_t_ *UrProgram = - new ur_program_handle_t_(State, hContext, ZeModule, ZeBuildLog); + new ur_program_handle_t_(State, hContext, ZeModuleMap.begin()->second, + ZeBuildLogMap.begin()->second); *phProgram = reinterpret_cast(UrProgram); + (*phProgram)->ZeModuleMap = ZeModuleMap; + (*phProgram)->ZeBuildLogMap = ZeBuildLogMap; } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -715,23 +734,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( } // Next check if there is a Level Zero build log. - if (Program->ZeBuildLog) { + if (Program->ZeBuildLogMap.find(Device->ZeDevice) != + Program->ZeBuildLogMap.end()) { + ze_module_build_log_handle_t ZeBuildLog = + Program->ZeBuildLogMap.begin()->second; size_t LogSize = PropSize; ZE2UR_CALL(zeModuleBuildLogGetString, - (Program->ZeBuildLog, &LogSize, ur_cast(PropValue))); + (ZeBuildLog, &LogSize, ur_cast(PropValue))); if (PropSizeRet) { *PropSizeRet = LogSize; } if (PropValue) { - // When the program build fails in urProgramBuild(), we delayed cleaning - // up the build log because RT later calls this routine to get the - // failed build log. - // To avoid memory leaks, we should clean up the failed build log here - // because RT does not create sycl::program when urProgramBuild() fails, - // thus it won't call urProgramRelease() to clean up the build log. + // When the program build fails in urProgramBuild(), we delayed + // cleaning up the build log because RT later calls this routine to + // get the failed build log. To avoid memory leaks, we should clean up + // the failed build log here because RT does not create sycl::program + // when urProgramBuild() fails, thus it won't call urProgramRelease() + // to clean up the build log. if (Program->State == ur_program_handle_t_::Invalid) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); - Program->ZeBuildLog = nullptr; + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); + Program->ZeBuildLogMap.erase(Device->ZeDevice); + ZeBuildLog = nullptr; } } return UR_RESULT_SUCCESS; @@ -817,12 +840,15 @@ ur_program_handle_t_::~ur_program_handle_t_() { // According to Level Zero Specification, all kernels and build logs // must be destroyed before the Module can be destroyed. So, be sure // to destroy build log before destroying the module. - if (ZeBuildLog) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); + for (auto &ZeBuildLogPair : this->ZeBuildLogMap) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLogPair.second)); } if (ZeModule && OwnZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + for (auto &ZeModulePair : this->ZeModuleMap) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModulePair.second)); + } + this->ZeModuleMap.clear(); } } diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 5aa6ff89a3..1cb233dd45 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -131,6 +131,15 @@ struct ur_program_handle_t_ : _ur_object { // The Level Zero module handle. Used primarily in Exe state. ze_module_handle_t ZeModule{}; + // Map of L0 Modules created for all the devices for which a UR Program + // has been built. + std::unordered_map ZeModuleMap; + // The Level Zero build log from the last call to zeModuleCreate(). ze_module_build_log_handle_t ZeBuildLog{}; + + // Map of L0 Module Build logs created for all the devices for which a UR + // Program has been built. + std::unordered_map + ZeBuildLogMap; }; diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 2668b6821a..8194c7ddad 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -11,6 +11,7 @@ urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runt urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgPointerNegativeTest.InvalidNullHandleKernel/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ From c2682e9418ec2bc07a053f34d44a736e1c0674cc Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Tue, 16 Jan 2024 17:54:55 -0800 Subject: [PATCH 2/3] [L0] don't delete program handle in kernel release and check work dim size - Decrement the program handle reference count & release the L0 resources if the refcnt == 0, but don't delete the program handle in the kernel teardown to avoid invalid memory access during program handle teardown. Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/kernel.cpp | 18 ++++++++----- source/adapters/level_zero/program.cpp | 26 ++++++++++++++----- source/adapters/level_zero/program.hpp | 5 ++++ .../kernel/kernel_adapter_level_zero.match | 1 - 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index cea537b3fe..bf131b73e2 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -82,10 +82,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( uint32_t WG[3]{}; // global_work_size of unused dimensions must be set to 1 - UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, - UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, - UR_RESULT_ERROR_INVALID_VALUE); + if (WorkDim >= 2) { + UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + if (WorkDim == 3) { + UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + } + } if (LocalWorkSize) { // L0 UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits::max)(), @@ -642,8 +646,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( if (IndirectAccessTrackingEnabled) { UR_CALL(urContextRelease(KernelProgram->Context)); } - // do a release on the program this kernel was part of - UR_CALL(urProgramRelease(KernelProgram)); + // do a release on the program this kernel was part of without delete of the + // program handle + KernelProgram->ur_release_program_resources(false); + delete Kernel; return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 636289e195..1cab028453 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -837,18 +837,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( } ur_program_handle_t_::~ur_program_handle_t_() { + if (!resourcesReleased) { + ur_release_program_resources(true); + } +} + +void ur_program_handle_t_::ur_release_program_resources(bool deletion) { // According to Level Zero Specification, all kernels and build logs // must be destroyed before the Module can be destroyed. So, be sure // to destroy build log before destroying the module. - for (auto &ZeBuildLogPair : this->ZeBuildLogMap) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLogPair.second)); + if (!deletion) { + if (!RefCount.decrementAndTest()) { + return; + } } + if (!resourcesReleased) { + for (auto &ZeBuildLogPair : this->ZeBuildLogMap) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLogPair.second)); + } - if (ZeModule && OwnZeModule) { - for (auto &ZeModulePair : this->ZeModuleMap) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModulePair.second)); + if (ZeModule && OwnZeModule) { + for (auto &ZeModulePair : this->ZeModuleMap) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModulePair.second)); + } + this->ZeModuleMap.clear(); } - this->ZeModuleMap.clear(); + resourcesReleased = true; } } diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 1cb233dd45..8d148c8fa2 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -100,6 +100,11 @@ struct ur_program_handle_t_ : _ur_object { State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} ~ur_program_handle_t_(); + void ur_release_program_resources(bool deletion); + + // Tracks the release state of the program handle to determine if the + // internal handle needs to be released. + bool resourcesReleased = false; const ur_context_handle_t Context; // Context of the program. diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 8194c7ddad..2668b6821a 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -11,7 +11,6 @@ urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runt urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidNullHandleKernel/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ From 1b2cd5b34dda3055c5274b2eb381adef12fd95c8 Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Thu, 18 Jan 2024 14:34:34 -0800 Subject: [PATCH 3/3] [L0] Remove Device Filtering with urProgramBuild Handling MultiDevice Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/queue.cpp | 31 ++-------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index f07e0df675..e25c58f848 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -281,35 +281,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( ur_queue_handle_t *Queue ///< [out] pointer to handle of queue object created ) { - - // Make the Device appear as the first device in the context since this - // is where the urProgramBuild will only build the module to. Also, if - // the Device is a sub-device then see if there is a also its root-device - // in the context and make that go first instead (because sub-device can - // run code built for its root-device). - // - // TODO: this is all hacky and should be removed when we add support - // for building to all the devices in the context. - // - { // Lock context for thread-safe update - std::scoped_lock Lock(Context->Mutex); - UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); - - auto MakeFirst = Context->Devices.begin(); - for (auto I = Context->Devices.begin(); I != Context->Devices.end(); ++I) { - if (*I == Device) { - MakeFirst = I; - if (!Device->RootDevice) - break; - // continue the search for possible root-device in the context - } else if (*I == Device->RootDevice) { - MakeFirst = I; - break; // stop the search - } - } - if (MakeFirst != Context->Devices.begin()) - std::iter_swap(MakeFirst, Context->Devices.begin()); - } ur_queue_flags_t Flags{}; if (Props) { Flags = Props->flags; @@ -328,6 +299,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( } } + UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); + // Create placeholder queues in the compute queue group. // Actual L0 queues will be created at first use. std::vector ZeComputeCommandQueues(