diff --git a/include/ur_api.h b/include/ur_api.h index 9d88eecbc6..80df5a6fc0 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -224,6 +224,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224, ///< Enumerator for ::urEnqueueKernelLaunchCustomExp + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -5230,6 +5231,43 @@ urKernelCreateWithNativeHandle( ur_kernel_handle_t *phKernel ///< [out] pointer to the handle of the kernel object created. ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. 
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t *pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +); + #if !defined(__GNUC__) #pragma endregion #endif @@ -9943,6 +9981,19 @@ typedef struct ur_kernel_create_with_native_handle_params_t { ur_kernel_handle_t **pphKernel; } ur_kernel_create_with_native_handle_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKernelGetSuggestedLocalWorkSize +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_kernel_get_suggested_local_work_size_params_t { + ur_kernel_handle_t *phKernel; + ur_queue_handle_t *phQueue; + uint32_t 
*pnumWorkDim; + const size_t **ppGlobalWorkOffset; + const size_t **ppGlobalWorkSize; + size_t **ppSuggestedLocalWorkSize; +} ur_kernel_get_suggested_local_work_size_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urKernelSetArgValue /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index fb1f1823b3..4aaa6d9fe3 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -535,6 +535,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelCreateWithNativeHandle_t)( const ur_kernel_native_properties_t *, ur_kernel_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSize +typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)( + ur_kernel_handle_t, + ur_queue_handle_t, + uint32_t, + const size_t *, + const size_t *, + size_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urKernelSetArgValue typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)( @@ -603,6 +613,7 @@ typedef struct ur_kernel_dditable_t { ur_pfnKernelRelease_t pfnRelease; ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle; ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle; + ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize; ur_pfnKernelSetArgValue_t pfnSetArgValue; ur_pfnKernelSetArgLocal_t pfnSetArgLocal; ur_pfnKernelSetArgPointer_t pfnSetArgPointer; diff --git a/include/ur_print.h b/include/ur_print.h index 753875ace9..c8fb41753e 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1442,6 +1442,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetNativeHandleParams(const str /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelCreateWithNativeHandleParams(const struct 
ur_kernel_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_get_suggested_local_work_size_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetSuggestedLocalWorkSizeParams(const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_kernel_set_arg_value_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index db230c91d7..0e5026c521 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -929,6 +929,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP"; break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: + os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE"; + break; default: os << "unknown enumerator"; break; @@ -11462,6 +11465,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_get_suggested_local_work_size_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_kernel_get_suggested_local_work_size_params_t *params) { + + os << ".hKernel = "; + + ur::details::printPtr(os, + *(params->phKernel)); + + os << ", "; + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".numWorkDim = "; + + os << *(params->pnumWorkDim); + + os << ", "; + os << 
".pGlobalWorkOffset = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkOffset)); + + os << ", "; + os << ".pGlobalWorkSize = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".pSuggestedLocalWorkSize = "; + + ur::details::printPtr(os, + *(params->ppSuggestedLocalWorkSize)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_kernel_set_arg_value_params_t type /// @returns @@ -17143,6 +17189,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE: { os << (const struct ur_kernel_create_with_native_handle_params_t *)params; } break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: { + os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)params; + } break; case UR_FUNCTION_KERNEL_SET_ARG_VALUE: { os << (const struct ur_kernel_set_arg_value_params_t *)params; } break; diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 4a0bf0bab1..5446f3bc1d 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -534,3 +534,44 @@ params: returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If the adapter has no underlying equivalent handle." +--- #-------------------------------------------------------------------------- +type: function +desc: "Get the suggested local work size for a kernel." +class: $xKernel +name: GetSuggestedLocalWorkSize +ordinal: "0" +details: + - "Query a suggested local work size for a kernel given a global size for each dimension." + - "The application may call this function from simultaneous threads for the same context." 
+params: + - type: $x_kernel_handle_t + name: hKernel + desc: | + [in] handle of the kernel + - type: $x_queue_handle_t + name: hQueue + desc: | + [in] handle of the queue object + - type: uint32_t + name: numWorkDim + desc: | + [in] number of dimensions, from 1 to 3, to specify the global + and work-group work-items + - type: const size_t* + name: pGlobalWorkOffset + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the offset used to calculate the global ID of a work-item + - type: const size_t* + name: pGlobalWorkSize + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the number of global work-items in workDim that will execute the + kernel function + - type: size_t* + name: pSuggestedLocalWorkSize + desc: | + [out] pointer to an array of numWorkDim unsigned values that specify + suggested local work size that will contain the result of the query +returns: + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index b0a61e7f88..52585ade3a 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -586,6 +586,9 @@ etors: - name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP desc: Enumerator for $xEnqueueKernelLaunchCustomExp value: '224' +- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE + desc: Enumerator for $xKernelGetSuggestedLocalWorkSize + value: '225' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp index c925a27295..be141f7b20 100644 --- a/source/adapters/cuda/enqueue.hpp +++ b/source/adapters/cuda/enqueue.hpp @@ -17,6 +17,10 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + ur_kernel_handle_t Kernel); + bool 
hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, ur_kernel_handle_t Kernel, size_t BlockSize); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 675fdbe0a3..5e01845a56 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -9,7 +9,9 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "enqueue.hpp" #include "memory.hpp" +#include "queue.hpp" #include "sampler.hpp" UR_APIEXPORT ur_result_t UR_APICALL @@ -380,3 +382,30 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, } return Result; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + // Preconditions + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_device_handle_t Device = hQueue->Device; + ur_result_t Result = UR_RESULT_SUCCESS; + size_t ThreadsPerBlock[3] = {}; + + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(Device); + + guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim, + hKernel); + + std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim, + pSuggestedLocalWorkSize); + return Result; +} diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index b70198b227..fc8cad9d43 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL 
urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp index a1f86b3678..eacac72a82 100644 --- a/source/adapters/hip/enqueue.hpp +++ b/source/adapters/hip/enqueue.hpp @@ -30,3 +30,7 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, const hipMemoryType DstType, ur_rect_offset_t DstOffset, size_t DstRowPitch, size_t DstSlicePitch, hipMemcpy3DParms &Params); + +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3]); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index f35d3957bc..b433c06852 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "enqueue.hpp" #include "memory.hpp" #include "sampler.hpp" @@ -349,3 +350,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( [[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + [[maybe_unused]] ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, + uint32_t workDim, [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != 
nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + size_t MaxThreadsPerBlock[3]; + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + + MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX(); + MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY(); + MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ(); + + ur_device_handle_t Device = hQueue->getDevice(); + ScopedContext Active(Device); + + guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim, + MaxThreadsPerBlock); + std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim, + pSuggestedLocalWorkSize); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 71979b75b1..7a28623e0b 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 40d6260ac9..3f8e8c6986 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -13,6 +13,93 @@ #include "ur_api.h" #include "ur_level_zero.hpp" +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + uint32_t 
LocalWorkSize[3]; + size_t GlobalWorkSize3D[3]{1, 1, 1}; + std::copy(pGlobalWorkSize, pGlobalWorkSize + workDim, GlobalWorkSize3D); + + ze_kernel_handle_t ZeKernel{}; + UR_CALL(getZeKernel(hQueue, hKernel, &ZeKernel)); + + UR_CALL(getSuggestedLocalWorkSize(hQueue, ZeKernel, GlobalWorkSize3D, + LocalWorkSize)); + + std::copy(LocalWorkSize, LocalWorkSize + workDim, pSuggestedLocalWorkSize); + return UR_RESULT_SUCCESS; +} + +ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + ze_kernel_handle_t *phZeKernel) { + auto ZeDevice = hQueue->Device->ZeDevice; + + if (hKernel->ZeKernelMap.empty()) { + *phZeKernel = hKernel->ZeKernel; + } else { + auto It = hKernel->ZeKernelMap.find(ZeDevice); + if (It == hKernel->ZeKernelMap.end()) { + /* kernel and queue don't match */ + return UR_RESULT_ERROR_INVALID_QUEUE; + } + *phZeKernel = It->second; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]) { + uint32_t *WG = SuggestedLocalWorkSize3D; + + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize3D[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], + GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. 
+ uint32_t GroupSize[] = { + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); + while (GlobalWorkSize3D[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { + logger::error("getSuggestedLocalWorkSize: can't find a WG size " + "suitable for global work size > UINT32_MAX"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + logger::debug( + "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}", + WG[0], WG[1], WG[2]); + } + + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object @@ -43,19 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { - auto ZeDevice = Queue->Device->ZeDevice; - ze_kernel_handle_t ZeKernel{}; - if (Kernel->ZeKernelMap.empty()) { - ZeKernel = Kernel->ZeKernel; - } else { - auto It = Kernel->ZeKernelMap.find(ZeDevice); - if (It == Kernel->ZeKernelMap.end()) { - /* kernel and queue don't match */ - return UR_RESULT_ERROR_INVALID_QUEUE; - } - ZeKernel = It->second; - } + UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel)); + // Lock automatically releases when this goes out of scope. 
std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock( Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); @@ -92,54 +169,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); if (LocalWorkSize) { - // L0 - UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits<uint32_t>::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(LocalWorkSize[2] < (std::numeric_limits<uint32_t>::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - WG[0] = static_cast<uint32_t>(LocalWorkSize[0]); - WG[1] = static_cast<uint32_t>(LocalWorkSize[1]); - WG[2] = static_cast<uint32_t>(LocalWorkSize[2]); - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. - bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize3D[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE2UR_CALL(zeKernelSuggestGroupSize, - (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], - GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. 
- uint32_t GroupSize[] = { - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); - while (GlobalWorkSize3D[I] % GroupSize[I]) { - --GroupSize[I]; - } - - if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { - logger::error("urEnqueueKernelLaunch: can't find a WG size " - "suitable for global work size > UINT32_MAX"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - logger::debug( - "urEnqueueKernelLaunch: using computed WG size = {{{}, {}, {}}}", - WG[0], WG[1], WG[2]); + for (uint32_t I = 0; I < WorkDim; ++I) { + UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits<uint32_t>::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[I] = static_cast<uint32_t>(LocalWorkSize[I]); } + } else { + UR_CALL(getSuggestedLocalWorkSize(Queue, ZeKernel, GlobalWorkSize3D, WG)); } // TODO: assert if sizes do not fit into 32-bit? 
diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp index 1cc146d262..2db3af0514 100644 --- a/source/adapters/level_zero/kernel.hpp +++ b/source/adapters/level_zero/kernel.hpp @@ -107,3 +107,10 @@ struct ur_kernel_handle_t_ : _ur_object { ZeCache<ZeStruct<ze_kernel_properties_t>> ZeKernelProperties; ZeCache<std::string> ZeKernelName; }; + +ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]); +ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + ze_kernel_handle_t *phZeKernel); diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index d6d6060ea6..45568a7885 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -151,6 +151,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return retVal; } diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 7ef17b0c28..29b54503eb 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -297,3 +297,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( DIE_NO_IMPLEMENTATION } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_queue_handle_t hQueue, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + [[maybe_unused]] const size_t *pGlobalWorkSize, + [[maybe_unused]] size_t *pSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git 
a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 065012613e..053fc32d9f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -123,6 +123,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 2278d5907e..a713a385a7 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -2616,6 +2616,43 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result 
of the query + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnGetSuggestedLocalWorkSize = + d_context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr != pfnGetSuggestedLocalWorkSize) { + result = pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + } else { + // generic implementation + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -6248,6 +6285,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = driver::urKernelCreateWithNativeHandle; + pDdiTable->pfnGetSuggestedLocalWorkSize = + driver::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal; diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 4fcbdeefa5..3accd84778 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -419,3 +419,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( CL_RETURN_ON_FAILURE(RetErr); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + size_t *pSuggestedLocalWorkSize) { + cl_device_id Device; + cl_platform_id Platform; + + CL_RETURN_ON_FAILURE(clGetCommandQueueInfo( + cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE, + sizeof(cl_device_id), &Device, nullptr)); + + CL_RETURN_ON_FAILURE(clGetDeviceInfo( + Device, CL_DEVICE_PLATFORM, 
sizeof(cl_platform_id), &Platform, nullptr)); + + auto GetKernelSuggestedLocalWorkSizeFuncPtr = + (clGetKernelSuggestedLocalWorkSizeKHR_fn) + clGetExtensionFunctionAddressForPlatform( + Platform, "clGetKernelSuggestedLocalWorkSizeKHR"); + if (!GetKernelSuggestedLocalWorkSizeFuncPtr) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + CL_RETURN_ON_FAILURE(GetKernelSuggestedLocalWorkSizeFuncPtr( + cl_adapter::cast<cl_command_queue>(hQueue), + cl_adapter::cast<cl_kernel>(hKernel), workDim, pGlobalWorkOffset, + pGlobalWorkSize, pSuggestedLocalWorkSize)); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index effb2128c3..687b541911 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 56f270a9d9..da61c34992 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -3420,6 +3420,57 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + 
pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + auto pfnGetSuggestedLocalWorkSize = + context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_kernel_get_suggested_local_work_size_params_t params = { + &hKernel, &hQueue, &numWorkDim, + &pGlobalWorkOffset, &pGlobalWorkSize, &pSuggestedLocalWorkSize}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, + "urKernelGetSuggestedLocalWorkSize", &params); + + context.logger.info("---> urKernelGetSuggestedLocalWorkSize"); + + ur_result_t result = pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + context.notify_end(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, + "urKernelGetSuggestedLocalWorkSize", &params, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, &params); + context.logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -8348,6 +8399,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = ur_tracing_layer::urKernelCreateWithNativeHandle; + 
dditable.pfnGetSuggestedLocalWorkSize = + pDdiTable->pfnGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_tracing_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 4bdd801c1a..6435cc24e1 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -3857,6 +3857,71 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + auto pfnGetSuggestedLocalWorkSize = + context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hKernel) { + return 
UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == pGlobalWorkOffset) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == pGlobalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == pSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hKernel)) { + refCountContext.logInvalidReference(hKernel); + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -10006,6 +10071,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = ur_validation_layer::urKernelCreateWithNativeHandle; + dditable.pfnGetSuggestedLocalWorkSize = + pDdiTable->pfnGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_validation_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index d7a9447b06..fb392dd607 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -3449,6 +3449,49 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL 
urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast<ur_kernel_object_t *>(hKernel)->dditable; + auto pfnGetSuggestedLocalWorkSize = + dditable->ur.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle; + + // convert loader handle to platform handle + hQueue = reinterpret_cast<ur_queue_object_t *>(hQueue)->handle; + + // forward to device-platform + result = pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -8599,6 +8642,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetNativeHandle = ur_loader::urKernelGetNativeHandle; pDdiTable->pfnCreateWithNativeHandle = ur_loader::urKernelCreateWithNativeHandle; + 
pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_loader::urKernelGetSuggestedLocalWorkSize; pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal; pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 35e5d68e36..2fa318e71c 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -4096,6 +4096,58 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< 
the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query + ) try { + auto pfnGetSuggestedLocalWorkSize = + ur_lib::context->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 5af2165ea4..79107c733d 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1821,6 +1821,14 @@ ur_result_t urPrintKernelCreateWithNativeHandleParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams( + const struct ur_kernel_get_suggested_local_work_size_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintKernelSetArgValueParams( const struct ur_kernel_set_arg_value_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index bf90700e7d..b8496a83c7 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -3472,6 +3472,49 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. 
+/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// diff --git a/test/conformance/kernel/CMakeLists.txt b/test/conformance/kernel/CMakeLists.txt index df19ba2550..73ab3f1101 100644 --- a/test/conformance/kernel/CMakeLists.txt +++ b/test/conformance/kernel/CMakeLists.txt @@ -18,4 +18,5 @@ 
add_conformance_test_with_kernels_environment(kernel urKernelSetArgSampler.cpp urKernelSetArgValue.cpp urKernelSetExecInfo.cpp - urKernelSetSpecializationConstants.cpp) + urKernelSetSpecializationConstants.cpp + urKernelGetSuggestedLocalWorkSize.cpp) diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 2668b6821a..82c92e3f28 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -8,9 +8,6 @@ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 93e3ddd67d..818c625e92 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -162,3 +162,12 @@ urKernelSetSpecializationConstantsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU 
urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidWorkDimension/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidSuggestedLocalWorkSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp new file mode 100644 index 0000000000..4eeabf5573 --- /dev/null +++ b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <uur/fixtures.h> + +struct urKernelGetSuggestedLocalWorkSizeTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "bar"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + size_t global_size = 32; + size_t global_offset = 0; + size_t n_dimensions = 1; + + size_t suggested_local_work_size; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetSuggestedLocalWorkSizeTest); + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success) { + suggested_local_work_size = SIZE_MAX; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, &global_offset, &global_size, + &suggested_local_work_size); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + ASSERT_LE(suggested_local_work_size, global_size); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success2D) { + size_t global_size_2d[2] = {32, 32}; + size_t global_offset_2d[2] = {0, 0}; + size_t suggested_local_work_size_2d[2] = {SIZE_MAX, SIZE_MAX}; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, 2, global_offset_2d, global_size_2d, + suggested_local_work_size_2d); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + for (int I = 0; I < 2; ++I) { + ASSERT_LE(suggested_local_work_size_2d[I], global_size_2d[I]); + } +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success3D) { + size_t global_size_3d[3] = {32, 32, 32}; + size_t global_offset_3d[3] = {0, 0, 0}; + size_t suggested_local_work_size_3d[3] = {SIZE_MAX, SIZE_MAX, SIZE_MAX}; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, 3, global_offset_3d, global_size_3d, + suggested_local_work_size_3d); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + for (int I = 0; I < 3; ++I) { + ASSERT_LE(suggested_local_work_size_3d[I], global_size_3d[I]); + } 
+} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleKernel) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + nullptr, queue, n_dimensions, &global_offset, + &global_size, &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleQueue) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + kernel, nullptr, n_dimensions, &global_offset, + &global_size, &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidWorkDimension) { + uint32_t max_work_item_dimensions = 0; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS, + sizeof(max_work_item_dimensions), &max_work_item_dimensions, nullptr)); + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, max_work_item_dimensions + 1, &global_offset, + &global_size, &suggested_local_work_size); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalOffset) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, nullptr, &global_size, + &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalSize) { + ASSERT_EQ_RESULT( + urKernelGetSuggestedLocalWorkSize(kernel, queue, n_dimensions, + &global_offset, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidSuggestedLocalWorkSize) { + ASSERT_EQ_RESULT( + urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, &global_offset, &global_size, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +}