Commit ff276ab
Merge pull request #1363 from hdelan/refactor-device-initialization
[CUDA] Refactor device initialization
aarongreig authored Mar 27, 2024
2 parents ed949ec + 2968cc1 commit ff276ab
Showing 4 changed files with 21 additions and 58 deletions.
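Summary of the change, as reflected in the diff below: ur_device_handle_t_ now fills its cached MaxWorkItemSizes and MaxWorkGroupSize in its own constructor via urDeviceGetInfo; a per-dimension getMaxWorkItemSizes(int) accessor replaces the memcpy-based save/get helpers and the getMaxBlockDimY/Z getters; and the call sites in enqueue.cpp, kernel.cpp, and platform.cpp read device limits from the handle instead of querying and passing them around separately.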
32 changes: 11 additions & 21 deletions source/adapters/cuda/device.hpp
@@ -27,8 +27,6 @@ struct ur_device_handle_t_ {
   size_t MaxWorkItemSizes[MaxWorkItemDimensions];
   size_t MaxWorkGroupSize{0};
   size_t MaxAllocSize{0};
-  int MaxBlockDimY{0};
-  int MaxBlockDimZ{0};
   int MaxRegsPerBlock{0};
   int MaxCapacityLocalMem{0};
   int MaxChosenLocalMem{0};
@@ -40,17 +38,21 @@ struct ur_device_handle_t_ {
       : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1},
         Platform(platform) {
 
-    UR_CHECK_ERROR(cuDeviceGetAttribute(
-        &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cuDevice));
-    UR_CHECK_ERROR(cuDeviceGetAttribute(
-        &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cuDevice));
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
         cuDevice));
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &MaxCapacityLocalMem,
         CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, cuDevice));
 
+    UR_CHECK_ERROR(urDeviceGetInfo(this, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
+                                   sizeof(MaxWorkItemSizes), MaxWorkItemSizes,
+                                   nullptr));
+
+    UR_CHECK_ERROR(urDeviceGetInfo(this, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
+                                   sizeof(MaxWorkGroupSize), &MaxWorkGroupSize,
+                                   nullptr));
+
     // Set local mem max size if env var is present
     static const char *LocalMemSizePtrUR =
         std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
@@ -91,24 +93,12 @@ struct ur_device_handle_t_ {
 
   uint64_t getElapsedTime(CUevent) const;
 
-  void saveMaxWorkItemSizes(size_t Size,
-                            size_t *SaveMaxWorkItemSizes) noexcept {
-    memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size);
-  };
-
-  void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; };
-
-  void getMaxWorkItemSizes(size_t RetSize,
-                           size_t *RetMaxWorkItemSizes) const noexcept {
-    memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize);
-  };
+  size_t getMaxWorkItemSizes(int index) const noexcept {
+    return MaxWorkItemSizes[index];
+  }
 
   size_t getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; };
 
-  size_t getMaxBlockDimY() const noexcept { return MaxBlockDimY; };
-
-  size_t getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; };
-
   size_t getMaxRegsPerBlock() const noexcept { return MaxRegsPerBlock; };
 
   size_t getMaxAllocSize() const noexcept { return MaxAllocSize; };
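The net effect in device.hpp: the handle caches its limits once at construction time, and a per-dimension getter replaces the memcpy-based save/get helpers. Below is a minimal standalone sketch of that pattern; CachedDevice and queryLimit are invented stand-ins, not the real UR or CUDA APIs.

#include <cstddef>
#include <cstdio>

// Stand-in for a one-time driver query such as cuDeviceGetAttribute.
static size_t queryLimit(int Dim) { return Dim < 2 ? 1024 : 64; }

struct CachedDevice {
  size_t MaxWorkItemSizes[3];
  // The constructor fills the cache; no separate save*() step is needed.
  CachedDevice() {
    for (int Dim = 0; Dim < 3; ++Dim)
      MaxWorkItemSizes[Dim] = queryLimit(Dim);
  }
  // Per-dimension read replaces the old memcpy-out interface.
  size_t getMaxWorkItemSizes(int index) const noexcept {
    return MaxWorkItemSizes[index];
  }
};

int main() {
  CachedDevice Dev;
  for (int Dim = 0; Dim < 3; ++Dim)
    std::printf("max size in dim %d: %zu\n", Dim, Dev.getMaxWorkItemSizes(Dim));
}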
16 changes: 6 additions & 10 deletions source/adapters/cuda/enqueue.cpp
@@ -140,7 +140,6 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 // dimension.
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
-                        const size_t MaxThreadsPerBlock[3],
                         ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
@@ -154,14 +153,14 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
   }
 
   size_t MaxBlockDim[3];
-  MaxBlockDim[0] = MaxThreadsPerBlock[0];
-  MaxBlockDim[1] = Device->getMaxBlockDimY();
-  MaxBlockDim[2] = Device->getMaxBlockDimZ();
+  MaxBlockDim[0] = Device->getMaxWorkItemSizes(0);
+  MaxBlockDim[1] = Device->getMaxWorkItemSizes(1);
+  MaxBlockDim[2] = Device->getMaxWorkItemSizes(2);
 
   int MinGrid, MaxBlockSize;
   UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
       &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
-      MaxThreadsPerBlock[0]));
+      MaxBlockDim[0]));
 
   roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
                                        MaxBlockDim, MaxBlockSize);
@@ -197,7 +196,6 @@ setKernelParams(const ur_context_handle_t Context,
                 size_t (&BlocksPerGrid)[3]) {
   ur_result_t Result = UR_RESULT_SUCCESS;
   size_t MaxWorkGroupSize = 0u;
-  size_t MaxThreadsPerBlock[3] = {};
   bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr;
   uint32_t LocalSize = Kernel->getLocalSize();
 
@@ -207,16 +205,14 @@
   {
     size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
     MaxWorkGroupSize = Device->getMaxWorkGroupSize();
-    Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
-                                MaxThreadsPerBlock);
 
     if (ProvidedLocalWorkGroupSize) {
       auto IsValid = [&](int Dim) {
         if (ReqdThreadsPerBlock[Dim] != 0 &&
             LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
           return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
 
-        if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
+        if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
           return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
         // Checks that local work sizes are a divisor of the global work sizes
         // which includes that the local work sizes are neither larger than
@@ -245,7 +241,7 @@ setKernelParams(const ur_context_handle_t Context,
       }
     } else {
       guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                         MaxThreadsPerBlock, Kernel);
+                         Kernel);
     }
   }
 
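After this change, guessLocalWorkSize and setKernelParams read per-dimension limits straight from the device handle instead of threading a MaxThreadsPerBlock array through every call. A small sketch of the validation this centralizes; Device and isLocalSizeValid are assumed stand-ins for ur_device_handle_t_ and the in-place check:

#include <cstddef>

struct Device {
  size_t getMaxWorkItemSizes(int index) const noexcept {
    static const size_t Limits[3] = {1024, 1024, 64}; // stand-in values
    return Limits[index];
  }
};

// Mirrors the check in setKernelParams: each requested local size must fit
// within the device's limit for its dimension.
bool isLocalSizeValid(const Device &Dev, const size_t *LocalWorkSize,
                      int WorkDim) {
  for (int Dim = 0; Dim < WorkDim; ++Dim)
    if (LocalWorkSize[Dim] > Dev.getMaxWorkItemSizes(Dim))
      return false;
  return true;
}

int main() {
  Device Dev;
  const size_t Ok[3] = {256, 1, 1};
  const size_t TooBig[3] = {1, 1, 128}; // exceeds the z limit of 64
  return isLocalSizeValid(Dev, Ok, 3) && !isLocalSizeValid(Dev, TooBig, 3)
             ? 0
             : 1;
}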
15 changes: 4 additions & 11 deletions source/adapters/cuda/kernel.cpp
@@ -68,14 +68,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
   case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
     size_t GlobalWorkSize[3] = {0, 0, 0};
 
-    int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0};
-    UR_CHECK_ERROR(cuDeviceGetAttribute(
-        &MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get()));
-    UR_CHECK_ERROR(cuDeviceGetAttribute(
-        &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get()));
-    UR_CHECK_ERROR(cuDeviceGetAttribute(
-        &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get()));
-
     int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0};
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get()));
@@ -84,9 +76,10 @@
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get()));
 
-    GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX;
-    GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY;
-    GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ;
+    GlobalWorkSize[0] = hDevice->getMaxWorkItemSizes(0) * MaxGridDimX;
+    GlobalWorkSize[1] = hDevice->getMaxWorkItemSizes(1) * MaxGridDimY;
+    GlobalWorkSize[2] = hDevice->getMaxWorkItemSizes(2) * MaxGridDimZ;
+
     return ReturnValue(GlobalWorkSize, 3);
   }
   case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
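The GLOBAL_WORK_SIZE query now multiplies the cached per-dimension block limit by the grid limit queried from the driver. A worked sketch of that arithmetic with hard-coded stand-ins for the queried values (assumes 64-bit size_t, so the products do not overflow):

#include <cstddef>
#include <cstdio>

int main() {
  // Stand-ins for getMaxWorkItemSizes(0..2) and the MAX_GRID_DIM_* queries.
  const size_t MaxBlockDim[3] = {1024, 1024, 64};
  const size_t MaxGridDim[3] = {2147483647, 65535, 65535};

  size_t GlobalWorkSize[3];
  for (int Dim = 0; Dim < 3; ++Dim)
    GlobalWorkSize[Dim] = MaxBlockDim[Dim] * MaxGridDim[Dim];

  for (int Dim = 0; Dim < 3; ++Dim)
    std::printf("GlobalWorkSize[%d] = %zu\n", Dim, GlobalWorkSize[Dim]);
}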
16 changes: 0 additions & 16 deletions source/adapters/cuda/platform.cpp
@@ -95,22 +95,6 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
 
         Platforms[i].Devices.emplace_back(new ur_device_handle_t_{
             Device, Context, EvBase, &Platforms[i]});
-        {
-          const auto &Dev = Platforms[i].Devices.back().get();
-          size_t MaxWorkGroupSize = 0u;
-          size_t MaxThreadsPerBlock[3] = {};
-          UR_CHECK_ERROR(urDeviceGetInfo(
-              Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
-              sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr));
-
-          UR_CHECK_ERROR(urDeviceGetInfo(
-              Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
-              sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr));
-
-          Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
-                                    MaxThreadsPerBlock);
-          Dev->saveMaxWorkGroupSize(MaxWorkGroupSize);
-        }
       }
     } catch (const std::bad_alloc &) {
       // Signal out-of-memory situation
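Because the constructor now performs the initialization, urPlatformGet only has to construct the handle; the deleted block of follow-up queries and save*() calls is no longer needed. A simplified sketch of the call site's new shape, using assumed stand-in types rather than the real UR handles:

#include <cstddef>
#include <memory>
#include <vector>

struct Device {
  size_t MaxWorkItemSizes[3];
  Device() : MaxWorkItemSizes{1024, 1024, 64} {} // limits cached here now
};

struct Platform {
  std::vector<std::unique_ptr<Device>> Devices;
};

int main() {
  Platform P;
  // One line per device; no post-construction save step remains.
  P.Devices.emplace_back(new Device{});
}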
