Skip to content

Commit

Permalink
Add two new properties to ur_kernel_group_info_t
Browse files Browse the repository at this point in the history
These two properties allow the program to specify a maximum work-group size in various ways.
  • Loading branch information
frasercrmck committed Aug 22, 2024
1 parent 40a790f commit b595cbf
Show file tree
Hide file tree
Showing 23 changed files with 242 additions and 47 deletions.
6 changes: 5 additions & 1 deletion include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -4854,6 +4854,10 @@ typedef enum ur_kernel_group_info_t {
UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch
UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each
///< work item in the Kernel
UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6, ///< [size_t[3]] Return the maximum Work Group size guaranteed by the
///< source code, or (0, 0, 0) if unspecified
UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z)
///< guaranteed by the source code, or 0 if unspecified
/// @cond
UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff
/// @endcond
Expand Down Expand Up @@ -4957,7 +4961,7 @@ urKernelGetInfo(
/// + `NULL == hKernel`
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION
/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName`
/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName`
UR_APIEXPORT ur_result_t UR_APICALL
urKernelGetGroupInfo(
ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object
Expand Down
32 changes: 32 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7994,6 +7994,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_kernel_group_info_t va
case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE:
os << "UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE";
break;
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE";
break;
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -8086,6 +8092,32 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_kernel_grou

os << ")";
} break;
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
    // The payload is printed as a brace-enclosed, comma-separated list of
    // size_t values; the element count is derived from the byte size.
    const size_t *values = static_cast<const size_t *>(ptr);
    const size_t count = size / sizeof(size_t);

    os << "{";
    for (size_t idx = 0; idx != count; ++idx) {
        if (idx > 0) {
            os << ", ";
        }
        os << values[idx];
    }
    os << "}";
} break;
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
    // A single size_t is expected; reject buffers too small to hold one
    // before the pointer is dereferenced.
    if (size < sizeof(size_t)) {
        os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")";
        return UR_RESULT_ERROR_INVALID_SIZE;
    }

    // Print the address first, then the pointed-to value in parentheses.
    const size_t *value = static_cast<const size_t *>(ptr);
    os << static_cast<const void *>(value) << " (";
    os << *value;
    os << ")";
} break;
default:
os << "unknown enumerator";
return UR_RESULT_ERROR_INVALID_ENUMERATION;
Expand Down
8 changes: 8 additions & 0 deletions scripts/core/kernel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ etors:
desc: "[size_t] Return preferred multiple of Work Group size for launch"
- name: PRIVATE_MEM_SIZE
desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel"
- name: COMPILE_MAX_WORK_GROUP_SIZE
desc: |
[size_t[3]] Return the maximum Work Group size guaranteed by the
source code, or (0, 0, 0) if unspecified
- name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
desc: |
[size_t] Return the maximum linearized Work Group size (X * Y * Z)
guaranteed by the source code, or 0 if unspecified
--- #--------------------------------------------------------------------------
type: enum
desc: "Get Kernel SubGroup information"
Expand Down
11 changes: 11 additions & 0 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
// Set the active context here as guessLocalWorkSize needs an active context
ScopedContext Active(Device);
{
size_t *MaxThreadsPerBlock = Kernel->MaxThreadsPerBlock;
size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
MaxWorkGroupSize = Device->getMaxWorkGroupSize();

Expand All @@ -212,6 +213,10 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;

if (MaxThreadsPerBlock[Dim] != 0 &&
LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;

if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
// Checks that local work sizes are a divisor of the global work sizes
Expand All @@ -235,6 +240,12 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
KernelLocalWorkGroupSize *= LocalWorkSize[Dim];
}

if (size_t MaxLinearThreadsPerBlock = Kernel->MaxLinearThreadsPerBlock;
MaxLinearThreadsPerBlock &&
MaxLinearThreadsPerBlock < KernelLocalWorkGroupSize) {
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}

if (hasExceededMaxRegistersPerBlock(Device, Kernel,
KernelLocalWorkGroupSize)) {
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
Expand Down
24 changes: 24 additions & 0 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,30 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
return ReturnValue(uint64_t(Bytes));
}
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
  // Report the per-dimension limit recorded in the program metadata under
  // this kernel's name; stays (0, 0, 0) when no entry exists.
  size_t Limits[3] = {0, 0, 0};
  const auto &LimitsByKernel = hKernel->getProgram()->KernelMaxWorkGroupSizeMD;
  if (const auto Entry = LimitsByKernel.find(hKernel->getName());
      Entry != LimitsByKernel.end()) {
    const auto [DimX, DimY, DimZ] = Entry->second;
    Limits[0] = DimX;
    Limits[1] = DimY;
    Limits[2] = DimZ;
  }
  return ReturnValue(Limits, 3);
}
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
  // Report the linearized limit recorded in the program metadata under this
  // kernel's name, or 0 when no entry exists.
  const auto &LinearLimits =
      hKernel->getProgram()->KernelMaxLinearWorkGroupSizeMD;
  const auto Found = LinearLimits.find(hKernel->getName());
  size_t Limit = (Found == LinearLimits.end()) ? 0 : Found->second;
  return ReturnValue(Limit);
}
default:
break;
}
Expand Down
14 changes: 14 additions & 0 deletions source/adapters/cuda/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct ur_kernel_handle_t_ {

static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
size_t MaxThreadsPerBlock[ReqdThreadsPerBlockDimensions];
size_t MaxLinearThreadsPerBlock{0};
int RegsPerThread{0};

/// Structure that holds the arguments to the kernel.
Expand Down Expand Up @@ -169,6 +171,18 @@ struct ur_kernel_handle_t_ {
sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
(void)RetError;
assert(RetError == UR_RESULT_SUCCESS);
/// Note: this code assumes that there is only one device per context
RetError = urKernelGetGroupInfo(
this, Program->getDevice(),
UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr);
assert(RetError == UR_RESULT_SUCCESS);
/// Note: this code assumes that there is only one device per context
RetError = urKernelGetGroupInfo(
this, Program->getDevice(),
UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE,
sizeof(MaxLinearThreadsPerBlock), &MaxLinearThreadsPerBlock, nullptr);
assert(RetError == UR_RESULT_SUCCESS);
UR_CHECK_ERROR(
cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func));
}
Expand Down
22 changes: 14 additions & 8 deletions source/adapters/cuda/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,

auto [Prefix, Tag] = splitMetadataName(MetadataElementName);

if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
// If metadata is reqd_work_group_size, record it for the corresponding
// kernel name.
if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE ||
Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) {
// If metadata is reqd_work_group_size/max_work_group_size, record it for
// the corresponding kernel name.
size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);

// Expect between 1 and 3 32-bit integer values.
Expand All @@ -69,18 +70,23 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
reinterpret_cast<const char *>(MetadataElement.value.pData) +
sizeof(std::uint64_t);
// Read values and pad with 1's for values not present.
std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
KernelReqdWorkGroupSizeMD[Prefix] =
std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1],
ReqdWorkGroupElements[2]);
std::array<uint32_t, 3> WorkGroupElements = {1, 1, 1};
std::memcpy(WorkGroupElements.data(), ValuePtr, MDElemsSize);
(Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE
? KernelReqdWorkGroupSizeMD
: KernelMaxWorkGroupSizeMD)[Prefix] =
std::make_tuple(WorkGroupElements[0], WorkGroupElements[1],
WorkGroupElements[2]);
} else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) {
const char *MetadataValPtr =
reinterpret_cast<const char *>(MetadataElement.value.pData) +
sizeof(std::uint64_t);
const char *MetadataValPtrEnd =
MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t);
GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd};
} else if (Tag ==
__SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) {
KernelMaxLinearWorkGroupSizeMD[Prefix] = MetadataElement.value.data64;
}
}
return UR_RESULT_SUCCESS;
Expand Down
6 changes: 5 additions & 1 deletion source/adapters/cuda/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ struct ur_program_handle_t_ {
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
std::unordered_map<std::string, std::string> GlobalIDMD;
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelMaxWorkGroupSizeMD;
std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;

constexpr static size_t MaxLogSize = 8192u;

Expand All @@ -45,7 +48,8 @@ struct ur_program_handle_t_ {

ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device)
: Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{},
KernelMaxWorkGroupSizeMD{}, KernelMaxLinearWorkGroupSizeMD{} {
urContextRetain(Context);
urDeviceRetain(Device);
}
Expand Down
4 changes: 4 additions & 0 deletions source/adapters/hip/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
&Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
return ReturnValue(uint64_t(Bytes));
}
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
// FIXME: neither compile-time maximum work-group size query is implemented
// in this adapter yet; both are reported as unsupported until support is
// added.
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
default:
break;
}
Expand Down
4 changes: 4 additions & 0 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize});
}
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
// No corresponding enumeration in Level Zero, so both compile-time maximum
// work-group size properties are reported as unsupported.
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
default: {
logger::error(
"Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})",
Expand Down
19 changes: 15 additions & 4 deletions source/adapters/native_cpu/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
DIE_NO_IMPLEMENTATION;
}

// Check reqd_work_group_size
if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) {
const auto &Reqd = hKernel->getReqdWGSize();
// Check reqd_work_group_size and other kernel constraints
if (pLocalWorkSize != nullptr) {
uint64_t TotalNumWIs = 1;
for (uint32_t Dim = 0; Dim < workDim; Dim++) {
if (pLocalWorkSize[Dim] != Reqd[Dim]) {
TotalNumWIs *= pLocalWorkSize[Dim];
if (auto Reqd = hKernel->getReqdWGSize();
Reqd && pLocalWorkSize[Dim] != Reqd.value()[Dim]) {
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
if (auto MaxWG = hKernel->getMaxWGSize();
MaxWG && pLocalWorkSize[Dim] > MaxWG.value()[Dim]) {
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
}
if (auto MaxLinearWG = hKernel->getMaxLinearWGSize()) {
if (TotalNumWIs > MaxLinearWG) {
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
}
Expand Down
25 changes: 20 additions & 5 deletions source/adapters/native_cpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,25 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
ur_kernel_handle_t_ *kernel;

// Set reqd_work_group_size for kernel if needed
std::optional<native_cpu::WGSize_t> ReqdWG;
const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD;
auto ReqdIt = ReqdMap.find(pKernelName);
if (ReqdIt != ReqdMap.end()) {
kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second);
} else {
kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f);
if (auto ReqdIt = ReqdMap.find(pKernelName); ReqdIt != ReqdMap.end()) {
ReqdWG = ReqdIt->second;
}

// Pick up the kernel's max_work_group_size from the program metadata, if any.
// The value must be stored in MaxWG: storing it in ReqdWG would turn the
// upper bound into an exact-size requirement and leave MaxWG always empty.
std::optional<native_cpu::WGSize_t> MaxWG;
const auto &MaxMap = hProgram->KernelMaxWorkGroupSizeMD;
if (auto MaxIt = MaxMap.find(pKernelName); MaxIt != MaxMap.end()) {
  MaxWG = MaxIt->second;
}
std::optional<uint64_t> MaxLinearWG;
const auto &MaxLinMap = hProgram->KernelMaxLinearWorkGroupSizeMD;
if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
MaxLinearWG = MaxLIt->second;
}
kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
MaxLinearWG);

*phKernel = kernel;

return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -148,6 +159,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
int bytes = 0;
return returnValue(static_cast<uint64_t>(bytes));
}
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
// FIXME: not reported by this adapter yet. The kernel handle exposes
// getMaxWGSize() and getMaxLinearWGSize(), which could back these queries;
// until that is wired up both properties are reported as unsupported.
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

default:
break;
Expand Down
25 changes: 16 additions & 9 deletions source/adapters/native_cpu/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,14 @@ struct ur_kernel_handle_t_ : RefCounted {

ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
HasReqdWGSize(false) {}
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}

ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: hProgram(other.hProgram), _name(other._name),
_subhandler(other._subhandler), _args(other._args),
_localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
_localMemPoolSize(other._localMemPoolSize),
HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) {
ReqdWGSize(other.ReqdWGSize) {
incrementReferenceCount();
}

Expand All @@ -60,19 +59,26 @@ struct ur_kernel_handle_t_ : RefCounted {
}
ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler,
const native_cpu::ReqdWGSize_t &ReqdWGSize)
std::optional<native_cpu::WGSize_t> ReqdWGSize,
std::optional<native_cpu::WGSize_t> MaxWGSize,
std::optional<uint64_t> MaxLinearWGSize)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {}
ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
MaxLinearWGSize(MaxLinearWGSize) {}

ur_program_handle_t hProgram;
std::string _name;
nativecpu_task_t _subhandler;
std::vector<native_cpu::NativeCPUArgDesc> _args;
std::vector<local_arg_info_t> _localArgInfo;

bool hasReqdWGSize() const { return HasReqdWGSize; }
std::optional<native_cpu::WGSize_t> getReqdWGSize() const {
return ReqdWGSize;
}

std::optional<native_cpu::WGSize_t> getMaxWGSize() const { return MaxWGSize; }

const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; }
std::optional<uint64_t> getMaxLinearWGSize() const { return MaxLinearWGSize; }

void updateMemPool(size_t numParallelThreads) {
// compute requested size.
Expand Down Expand Up @@ -103,6 +109,7 @@ struct ur_kernel_handle_t_ : RefCounted {
private:
char *_localMemPool = nullptr;
size_t _localMemPoolSize = 0;
bool HasReqdWGSize;
native_cpu::ReqdWGSize_t ReqdWGSize;
std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
};
Loading

0 comments on commit b595cbf

Please sign in to comment.