From bcca15a54555e364e60f9b464a4eabac4adb3d2d Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 20 Aug 2024 17:43:33 +0100 Subject: [PATCH] Add two new properties to ur_kernel_group_info_t These two properties allow the program to specify a maximum work-group size in various ways. They are intended to be targeted from languages such as SYCL (see https://github.com/intel/llvm/pull/14518). This PR implements them for CUDA and Native CPU. It should also be possible to support them for HIP, in the same fashion. Other adapters using SPIR-V and/or Level Zero would require further changes to both of those specifications. --- include/ur_api.h | 6 +++- include/ur_print.hpp | 32 +++++++++++++++++ scripts/core/kernel.yml | 8 +++++ source/adapters/cuda/enqueue.cpp | 11 ++++++ source/adapters/cuda/kernel.cpp | 24 +++++++++++++ source/adapters/cuda/kernel.hpp | 14 ++++++++ source/adapters/cuda/program.cpp | 22 +++++++----- source/adapters/cuda/program.hpp | 6 +++- source/adapters/hip/kernel.cpp | 4 +++ source/adapters/level_zero/kernel.cpp | 4 +++ source/adapters/level_zero/v2/kernel.cpp | 4 +++ source/adapters/native_cpu/enqueue.cpp | 19 +++++++--- source/adapters/native_cpu/kernel.cpp | 25 ++++++++++--- source/adapters/native_cpu/kernel.hpp | 25 ++++++++----- source/adapters/native_cpu/program.cpp | 35 ++++++++++++------- source/adapters/native_cpu/program.hpp | 7 ++-- source/adapters/opencl/kernel.cpp | 4 +++ source/loader/layers/validation/ur_valddi.cpp | 3 +- source/loader/ur_libapi.cpp | 2 +- source/ur/ur.hpp | 4 +++ source/ur_api.cpp | 2 +- .../kernel/kernel_adapter_native_cpu.match | 9 +++++ .../kernel/urKernelGetGroupInfo.cpp | 20 +++++++++-- 23 files changed, 243 insertions(+), 47 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index a707d40a3f..98468fb9d8 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -4854,6 +4854,10 @@ typedef enum ur_kernel_group_info_t { UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return 
preferred multiple of Work Group size for launch UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each ///< work item in the Kernel + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6, ///< [size_t[3]] Return the maximum Work Group size guaranteed by the + ///< source code, or (0, 0, 0) if unspecified + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z) + ///< guaranteed by the source code, or 0 if unspecified /// @cond UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -4957,7 +4961,7 @@ urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 681e8e814d..6e84ce97a5 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -7994,6 +7994,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_kernel_group_info_t va case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: os << "UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE"; break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE"; + break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE"; + break; default: os << "unknown enumerator"; break; @@ -8086,6 +8092,32 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_kernel_grou os << ")"; } break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: { + + const size_t *tptr = (const size_t *)ptr; + os << "{"; + size_t nelems = size / sizeof(size_t); + for (size_t i = 0; i < 
nelems; ++i) { + if (i != 0) { + os << ", "; + } + + os << tptr[i]; + } + os << "}"; + } break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: { + const size_t *tptr = (const size_t *)ptr; + if (sizeof(size_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 8585cffe6f..1ddcd83934 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -144,6 +144,14 @@ etors: desc: "[size_t] Return preferred multiple of Work Group size for launch" - name: PRIVATE_MEM_SIZE desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel" + - name: COMPILE_MAX_WORK_GROUP_SIZE + desc: | + [size_t[3]] Return the maximum Work Group size guaranteed by the + source code, or (0, 0, 0) if unspecified + - name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE + desc: | + [size_t] Return the maximum linearized Work Group size (X * Y * Z) + guaranteed by the source code, or 0 if unspecified --- #-------------------------------------------------------------------------- type: enum desc: "Get Kernel SubGroup information" diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 1c074025a9..d1d7b8afee 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -203,6 +203,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, // Set the active context here as guessLocalWorkSize needs an active context ScopedContext Active(Device); { + size_t *MaxThreadsPerBlock = Kernel->MaxThreadsPerBlock; size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; MaxWorkGroupSize = Device->getMaxWorkGroupSize(); @@ -212,6 +213,10 @@ setKernelParams([[maybe_unused]] const 
ur_context_handle_t Context, LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (MaxThreadsPerBlock[Dim] != 0 && + LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim)) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; // Checks that local work sizes are a divisor of the global work sizes @@ -235,6 +240,12 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, KernelLocalWorkGroupSize *= LocalWorkSize[Dim]; } + if (size_t MaxLinearThreadsPerBlock = Kernel->MaxLinearThreadsPerBlock; + MaxLinearThreadsPerBlock && + MaxLinearThreadsPerBlock < KernelLocalWorkGroupSize) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (hasExceededMaxRegistersPerBlock(Device, Kernel, KernelLocalWorkGroupSize)) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 2061893744..04904528c4 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -124,6 +124,30 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); return ReturnValue(uint64_t(Bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: { + size_t MaxGroupSize[3] = {0, 0, 0}; + const auto &MaxWGSizeMDMap = + hKernel->getProgram()->KernelMaxWorkGroupSizeMD; + const auto MaxWGSizeMD = MaxWGSizeMDMap.find(hKernel->getName()); + if (MaxWGSizeMD != MaxWGSizeMDMap.end()) { + const auto MaxWGSize = MaxWGSizeMD->second; + MaxGroupSize[0] = std::get<0>(MaxWGSize); + MaxGroupSize[1] = std::get<1>(MaxWGSize); + MaxGroupSize[2] = std::get<2>(MaxWGSize); + } + return ReturnValue(MaxGroupSize, 3); + } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: { + size_t MaxLinearGroupSize = 0; + const auto &MaxLinearWGSizeMDMap = + 
hKernel->getProgram()->KernelMaxLinearWorkGroupSizeMD; + const auto MaxLinearWGSizeMD = + MaxLinearWGSizeMDMap.find(hKernel->getName()); + if (MaxLinearWGSizeMD != MaxLinearWGSizeMDMap.end()) { + MaxLinearGroupSize = MaxLinearWGSizeMD->second; + } + return ReturnValue(MaxLinearGroupSize); + } default: break; } diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index c6761d8525..7ad20a4f0e 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -46,6 +46,8 @@ struct ur_kernel_handle_t_ { static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; + size_t MaxThreadsPerBlock[ReqdThreadsPerBlockDimensions]; + size_t MaxLinearThreadsPerBlock{0}; int RegsPerThread{0}; /// Structure that holds the arguments to the kernel. @@ -169,6 +171,18 @@ struct ur_kernel_handle_t_ { sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); (void)RetError; assert(RetError == UR_RESULT_SUCCESS); + /// Note: this code assumes that there is only one device per context + RetError = urKernelGetGroupInfo( + this, Program->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + assert(RetError == UR_RESULT_SUCCESS); + /// Note: this code assumes that there is only one device per context + RetError = urKernelGetGroupInfo( + this, Program->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE, + sizeof(MaxLinearThreadsPerBlock), &MaxLinearThreadsPerBlock, nullptr); + assert(RetError == UR_RESULT_SUCCESS); UR_CHECK_ERROR( cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func)); } diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 98757d710e..9b14a0a4eb 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -54,9 +54,10 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, auto [Prefix, 
Tag] = splitMetadataName(MetadataElementName); - if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE || + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size/max_work_group_size, record it for + // the corresponding kernel name. size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. @@ -69,11 +70,13 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); // Read values and pad with 1's for values not present. - std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); - KernelReqdWorkGroupSizeMD[Prefix] = - std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], - ReqdWorkGroupElements[2]); + std::array WorkGroupElements = {1, 1, 1}; + std::memcpy(WorkGroupElements.data(), ValuePtr, MDElemsSize); + (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE + ? 
KernelReqdWorkGroupSizeMD + : KernelMaxWorkGroupSizeMD)[Prefix] = + std::make_tuple(WorkGroupElements[0], WorkGroupElements[1], + WorkGroupElements[2]); } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { const char *MetadataValPtr = reinterpret_cast(MetadataElement.value.pData) + @@ -81,6 +84,9 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, const char *MetadataValPtrEnd = MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; + } else if (Tag == + __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) { + KernelMaxLinearWorkGroupSizeMD[Prefix] = MetadataElement.value.data64; } } return UR_RESULT_SUCCESS; diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp index 5d41374d34..10998cae2c 100644 --- a/source/adapters/cuda/program.hpp +++ b/source/adapters/cuda/program.hpp @@ -36,6 +36,9 @@ struct ur_program_handle_t_ { std::unordered_map> KernelReqdWorkGroupSizeMD; std::unordered_map GlobalIDMD; + std::unordered_map> + KernelMaxWorkGroupSizeMD; + std::unordered_map KernelMaxLinearWorkGroupSizeMD; constexpr static size_t MaxLogSize = 8192u; @@ -45,7 +48,8 @@ struct ur_program_handle_t_ { ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, - Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} { + Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{}, + KernelMaxWorkGroupSizeMD{}, KernelMaxLinearWorkGroupSizeMD{} { urContextRetain(Context); urDeviceRetain(Device); } diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index aa46843963..cdf0364ae2 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -127,6 +127,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, &Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); return 
ReturnValue(uint64_t(Bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // FIXME: could be added + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 8e627f3ade..4cf3c9c267 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -840,6 +840,10 @@ ur_result_t urKernelGetGroupInfo( case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize}); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // No corresponding enumeration in Level Zero + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: { logger::error( "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})", diff --git a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp index 8bfad2d2ad..fc3cb4a120 100644 --- a/source/adapters/level_zero/v2/kernel.cpp +++ b/source/adapters/level_zero/v2/kernel.cpp @@ -417,6 +417,10 @@ ur_result_t urKernelGetGroupInfo( auto props = hKernel->getProperties(hDevice); return returnValue(uint32_t{props.privateMemSize}); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // No corresponding enumeration in Level Zero + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: { logger::error( "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})", diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index b5d4713e2f..33d8c35c36 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -81,11 +81,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( DIE_NO_IMPLEMENTATION; } - // Check reqd_work_group_size - 
if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) { - const auto &Reqd = hKernel->getReqdWGSize(); + // Check reqd_work_group_size and other kernel constraints + if (pLocalWorkSize != nullptr) { + uint64_t TotalNumWIs = 1; for (uint32_t Dim = 0; Dim < workDim; Dim++) { - if (pLocalWorkSize[Dim] != Reqd[Dim]) { + TotalNumWIs *= pLocalWorkSize[Dim]; + if (auto Reqd = hKernel->getReqdWGSize(); + Reqd && pLocalWorkSize[Dim] != Reqd.value()[Dim]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (auto MaxWG = hKernel->getMaxWGSize(); + MaxWG && pLocalWorkSize[Dim] > MaxWG.value()[Dim]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } + if (auto MaxLinearWG = hKernel->getMaxLinearWGSize()) { + if (TotalNumWIs > MaxLinearWG) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } } diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 23a65eb03b..9363f92b7b 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -31,14 +31,25 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ur_kernel_handle_t_ *kernel; // Set reqd_work_group_size for kernel if needed + std::optional ReqdWG; const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD; - auto ReqdIt = ReqdMap.find(pKernelName); - if (ReqdIt != ReqdMap.end()) { - kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second); - } else { - kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f); + if (auto ReqdIt = ReqdMap.find(pKernelName); ReqdIt != ReqdMap.end()) { + ReqdWG = ReqdIt->second; } + std::optional MaxWG; + const auto &MaxMap = hProgram->KernelMaxWorkGroupSizeMD; + if (auto MaxIt = MaxMap.find(pKernelName); MaxIt != MaxMap.end()) { + MaxWG = MaxIt->second; + } + std::optional MaxLinearWG; + const auto &MaxLinMap = hProgram->KernelMaxLinearWorkGroupSizeMD; + if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) { + MaxLinearWG = MaxLIt->second; 
+ } + kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG, + MaxLinearWG); + *phKernel = kernel; return UR_RESULT_SUCCESS; @@ -148,6 +159,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, int bytes = 0; return returnValue(static_cast(bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // FIXME: could be added + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index b5728fa8b2..084a0ee695 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -41,15 +41,14 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) - : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, - HasReqdWGSize(false) {} + : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), _localMemPoolSize(other._localMemPoolSize), - HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) { + ReqdWGSize(other.ReqdWGSize) { incrementReferenceCount(); } @@ -60,9 +59,12 @@ struct ur_kernel_handle_t_ : RefCounted { } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, - const native_cpu::ReqdWGSize_t &ReqdWGSize) + std::optional ReqdWGSize, + std::optional MaxWGSize, + std::optional MaxLinearWGSize) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, - HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {} + ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize), + MaxLinearWGSize(MaxLinearWGSize) {} ur_program_handle_t hProgram; 
std::string _name; @@ -70,9 +72,13 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector _args; std::vector _localArgInfo; - bool hasReqdWGSize() const { return HasReqdWGSize; } + std::optional getReqdWGSize() const { + return ReqdWGSize; + } + + std::optional getMaxWGSize() const { return MaxWGSize; } - const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; } + std::optional getMaxLinearWGSize() const { return MaxLinearWGSize; } void updateMemPool(size_t numParallelThreads) { // compute requested size. @@ -103,6 +109,7 @@ struct ur_kernel_handle_t_ : RefCounted { private: char *_localMemPool = nullptr; size_t _localMemPoolSize = 0; - bool HasReqdWGSize; - native_cpu::ReqdWGSize_t ReqdWGSize; + std::optional ReqdWGSize = std::nullopt; + std::optional MaxWGSize = std::nullopt; + std::optional MaxLinearWGSize = std::nullopt; }; diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp index 77edd83bce..460660e2cd 100644 --- a/source/adapters/native_cpu/program.cpp +++ b/source/adapters/native_cpu/program.cpp @@ -29,8 +29,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, DIE_NO_IMPLEMENTATION } -static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement, - native_cpu::ReqdWGSize_t &res) { +static ur_result_t +deserializeWGMetadata(const ur_program_metadata_t &MetadataElement, + native_cpu::WGSize_t &res, std::uint32_t DefaultVal) { size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. @@ -43,12 +44,12 @@ static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement, const char *ValuePtr = reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. 
- std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); - std::get<0>(res) = ReqdWorkGroupElements[0]; - std::get<1>(res) = ReqdWorkGroupElements[1]; - std::get<2>(res) = ReqdWorkGroupElements[2]; + // Read values and pad with a default value for missing elements. + std::uint32_t WorkGroupElements[] = {DefaultVal, DefaultVal, DefaultVal}; + std::memcpy(WorkGroupElements, ValuePtr, MDElemsSize); + std::get<0>(res) = WorkGroupElements[0]; + std::get<1>(res) = WorkGroupElements[1]; + std::get<2>(res) = WorkGroupElements[2]; return UR_RESULT_SUCCESS; } @@ -71,13 +72,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( const auto &mdNode = pProperties->pMetadatas[i]; std::string mdName(mdNode.pName); auto [Prefix, Tag] = splitMetadataName(mdName); - if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - native_cpu::ReqdWGSize_t reqdWGSize; - auto res = getReqdWGSize(mdNode, reqdWGSize); + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE || + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) { + bool isReqd = + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE; + native_cpu::WGSize_t wgSizeProp; + auto res = deserializeWGMetadata( + mdNode, wgSizeProp, + isReqd ? 1 : std::numeric_limits::max()); if (res != UR_RESULT_SUCCESS) { return res; } - hProgram->KernelReqdWorkGroupSizeMD[Prefix] = std::move(reqdWGSize); + (isReqd ? 
hProgram->KernelReqdWorkGroupSizeMD + : hProgram->KernelMaxWorkGroupSizeMD)[Prefix] = + std::move(wgSizeProp); + } else if (Tag == + __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) { + hProgram->KernelMaxLinearWorkGroupSizeMD[Prefix] = mdNode.value.data64; } } } diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp index e85749a7b2..d58412751e 100644 --- a/source/adapters/native_cpu/program.hpp +++ b/source/adapters/native_cpu/program.hpp @@ -18,7 +18,7 @@ #include namespace native_cpu { -using ReqdWGSize_t = std::array; +using WGSize_t = std::array; } struct ur_program_handle_t_ : RefCounted { @@ -36,8 +36,11 @@ struct ur_program_handle_t_ : RefCounted { }; std::map _kernels; - std::unordered_map + std::unordered_map KernelReqdWorkGroupSizeMD; + std::unordered_map + KernelMaxWorkGroupSizeMD; + std::unordered_map KernelMaxLinearWorkGroupSizeMD; }; // The nativecpu_entry struct is also defined as LLVM-IR in the diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 9735abefbf..074348c622 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -130,6 +130,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } + if (propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE || + propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo( cl_adapter::cast(hKernel), cl_adapter::cast(hDevice), diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index e41623b15c..fb705dfc20 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -3504,7 +3504,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if 
(UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName) { + if (UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < + propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 05b0c71995..3ccc51133b 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -3764,7 +3764,7 @@ ur_result_t UR_APICALL urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object ur_device_handle_t hDevice, ///< [in] handle of the Device object diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index e8a1ed56b9..0639a9d9be 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -53,6 +53,10 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE \ + "@max_work_group_size" +#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE \ + "@max_linear_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" // Terminates the process with a catastrophic error message. 
diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 11b9dea7e9..3e024ede0f 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -3211,7 +3211,7 @@ ur_result_t UR_APICALL urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object ur_device_handle_t hDevice, ///< [in] handle of the Device object diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 6e5db6f70f..4cf052500e 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -12,25 +12,34 @@ urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_K urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE 
urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE 
urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoSingleTest.CompileWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelGetGroupInfoSingleTest.CompileMaxWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_FUNCTION_NAME urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_NUM_ARGS diff --git a/test/conformance/kernel/urKernelGetGroupInfo.cpp b/test/conformance/kernel/urKernelGetGroupInfo.cpp index b91001a07f..fcb3c39410 100644 --- a/test/conformance/kernel/urKernelGetGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetGroupInfo.cpp @@ -15,7 +15,9 @@ UUR_TEST_SUITE_P( UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE, UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, - 
UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE), + UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE, + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE), uur::deviceTestWithParamPrinter); struct urKernelGetGroupInfoSingleTest : uur::urKernelTest { @@ -85,7 +87,7 @@ TEST_P(urKernelGetGroupInfoWgSizeTest, CompileWorkGroupSize) { } TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) { - // Returns 0 by default when there is no sepecific information + // Returns 0 by default when there is no specific information std::array read_dims{1, 1, 1}; std::array zero{0, 0, 0}; ASSERT_SUCCESS(urKernelGetGroupInfo( @@ -93,3 +95,17 @@ TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) { sizeof(read_dims), read_dims.data(), nullptr)); ASSERT_EQ(read_dims, zero); } + +TEST_P(urKernelGetGroupInfoSingleTest, CompileMaxWorkGroupSizeEmpty) { + // Returns 0 by default when there is no specific information + std::array read_dims{1, 1, 1}; + std::array zero{0, 0, 0}; + auto result = urKernelGetGroupInfo( + kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + sizeof(read_dims), read_dims.data(), nullptr); + if (result == UR_RESULT_SUCCESS) { + ASSERT_EQ(read_dims, zero); + } else { + ASSERT_EQ(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + } +}