Skip to content

Commit

Permalink
Merge pull request #1385 from yingcong-wu/yc/new-api-suggestgroupsize
Browse files Browse the repository at this point in the history
Implement urKernelGetSuggestedLocalWorkSize
  • Loading branch information
kbenzie authored Jun 4, 2024
2 parents 6469b89 + 5593d84 commit 755a1e7
Show file tree
Hide file tree
Showing 30 changed files with 811 additions and 63 deletions.
51 changes: 51 additions & 0 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ typedef enum ur_function_t {
UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp
UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp
UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224, ///< Enumerator for ::urEnqueueKernelLaunchCustomExp
UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize
/// @cond
UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
/// @endcond
Expand Down Expand Up @@ -5230,6 +5231,43 @@ urKernelCreateWithNativeHandle(
ur_kernel_handle_t *phKernel ///< [out] pointer to the handle of the kernel object created.
);

///////////////////////////////////////////////////////////////////////////////
/// @brief Get the suggested local work size for a kernel.
///
/// @details
/// - Query a suggested local work size for a kernel given a global size for
/// each dimension.
/// - The application may call this function from simultaneous threads for
/// the same context.
/// - The three array arguments are indexed by dimension and each refers to
/// numWorkDim entries; the result of the query is returned through
/// pSuggestedLocalWorkSize.
///
/// @returns
/// - ::UR_RESULT_SUCCESS
/// - ::UR_RESULT_ERROR_UNINITIALIZED
/// - ::UR_RESULT_ERROR_DEVICE_LOST
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hKernel`
/// + `NULL == hQueue`
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pGlobalWorkOffset`
/// + `NULL == pGlobalWorkSize`
/// + `NULL == pSuggestedLocalWorkSize`
/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
UR_APIEXPORT ur_result_t UR_APICALL
urKernelGetSuggestedLocalWorkSize(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
uint32_t numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
///< and work-group work-items
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
///< the offset used to calculate the global ID of a work-item
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
///< the number of global work-items in each of the numWorkDim dimensions
///< that will execute the kernel function
size_t *pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify
///< suggested local work size that will contain the result of the query
);

#if !defined(__GNUC__)
#pragma endregion
#endif
Expand Down Expand Up @@ -9943,6 +9981,19 @@ typedef struct ur_kernel_create_with_native_handle_params_t {
ur_kernel_handle_t **pphKernel;
} ur_kernel_create_with_native_handle_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urKernelGetSuggestedLocalWorkSize
/// @details Each entry is a pointer to the parameter passed to the function;
/// allowing the callback the ability to modify the parameter's value
/// @details Fields are declared in the same order as the arguments of
/// urKernelGetSuggestedLocalWorkSize.
typedef struct ur_kernel_get_suggested_local_work_size_params_t {
ur_kernel_handle_t *phKernel; ///< points at the hKernel argument
ur_queue_handle_t *phQueue; ///< points at the hQueue argument
uint32_t *pnumWorkDim; ///< points at the numWorkDim argument
const size_t **ppGlobalWorkOffset; ///< points at the pGlobalWorkOffset argument
const size_t **ppGlobalWorkSize; ///< points at the pGlobalWorkSize argument
size_t **ppSuggestedLocalWorkSize; ///< points at the pSuggestedLocalWorkSize argument
} ur_kernel_get_suggested_local_work_size_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urKernelSetArgValue
/// @details Each entry is a pointer to the parameter passed to the function;
Expand Down
11 changes: 11 additions & 0 deletions include/ur_ddi.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelCreateWithNativeHandle_t)(
const ur_kernel_native_properties_t *,
ur_kernel_handle_t *);

///////////////////////////////////////////////////////////////////////////////
/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSize
/// @details The unnamed parameters follow the argument order of
/// urKernelGetSuggestedLocalWorkSize; the trailing comments name each slot.
typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
ur_kernel_handle_t, // hKernel
ur_queue_handle_t, // hQueue
uint32_t, // numWorkDim
const size_t *, // pGlobalWorkOffset
const size_t *, // pGlobalWorkSize
size_t *); // pSuggestedLocalWorkSize

///////////////////////////////////////////////////////////////////////////////
/// @brief Function-pointer for urKernelSetArgValue
typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
Expand Down Expand Up @@ -603,6 +613,7 @@ typedef struct ur_kernel_dditable_t {
ur_pfnKernelRelease_t pfnRelease;
ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
ur_pfnKernelSetArgValue_t pfnSetArgValue;
ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
ur_pfnKernelSetArgPointer_t pfnSetArgPointer;
Expand Down
8 changes: 8 additions & 0 deletions include/ur_print.h
Original file line number Diff line number Diff line change
Expand Up @@ -1442,6 +1442,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetNativeHandleParams(const str
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelCreateWithNativeHandleParams(const struct ur_kernel_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);

///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_kernel_get_suggested_local_work_size_params_t struct
/// @details NOTE(review): going by the parameter names, the text is presumably
/// written to `buffer` (capacity `buff_size`) and the required length reported
/// through `out_size` - confirm against the implementation.
/// @returns
/// - ::UR_RESULT_SUCCESS
/// - ::UR_RESULT_ERROR_INVALID_SIZE
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetSuggestedLocalWorkSizeParams(const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);

///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_kernel_set_arg_value_params_t struct
/// @returns
Expand Down
49 changes: 49 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP:
os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP";
break;
case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE:
os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -11462,6 +11465,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Stream the arguments captured in a
///        ur_kernel_get_suggested_local_work_size_params_t
/// @details Emits every argument as `.name = <value>` in declaration order,
///          with ", " between consecutive entries. The two handles and the
///          three array pointers are handed to ur::details::printPtr;
///          numWorkDim is inserted into the stream directly.
/// @returns
///     std::ostream &
inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_kernel_get_suggested_local_work_size_params_t *params) {
    // Handles
    os << ".hKernel = ";
    ur::details::printPtr(os, *(params->phKernel));

    os << ", " << ".hQueue = ";
    ur::details::printPtr(os, *(params->phQueue));

    // Dimension count is a plain integer, no pointer formatting needed
    os << ", " << ".numWorkDim = " << *(params->pnumWorkDim);

    // Per-dimension arrays
    os << ", " << ".pGlobalWorkOffset = ";
    ur::details::printPtr(os, *(params->ppGlobalWorkOffset));

    os << ", " << ".pGlobalWorkSize = ";
    ur::details::printPtr(os, *(params->ppGlobalWorkSize));

    os << ", " << ".pSuggestedLocalWorkSize = ";
    ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));

    return os;
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Print operator for the ur_kernel_set_arg_value_params_t type
/// @returns
Expand Down Expand Up @@ -17143,6 +17189,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
case UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE: {
os << (const struct ur_kernel_create_with_native_handle_params_t *)params;
} break;
case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: {
os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)params;
} break;
case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
os << (const struct ur_kernel_set_arg_value_params_t *)params;
} break;
Expand Down
41 changes: 41 additions & 0 deletions scripts/core/kernel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -534,3 +534,44 @@ params:
returns:
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE:
- "If the adapter has no underlying equivalent handle."
--- #--------------------------------------------------------------------------
# Spec entry for $xKernelGetSuggestedLocalWorkSize.
type: function
desc: "Get the suggested local work size for a kernel."
class: $xKernel
name: GetSuggestedLocalWorkSize
ordinal: "0"
details:
- "Query a suggested local work size for a kernel given a global size for each dimension."
- "The application may call this function from simultaneous threads for the same context."
# Parameters are listed in the same order as the arguments of the function
# declared in include/ur_api.h.
params:
- type: $x_kernel_handle_t
name: hKernel
desc: |
[in] handle of the kernel
- type: $x_queue_handle_t
name: hQueue
desc: |
[in] handle of the queue object
- type: uint32_t
name: numWorkDim
desc: |
[in] number of dimensions, from 1 to 3, to specify the global
and work-group work-items
- type: const size_t*
name: pGlobalWorkOffset
desc: |
[in] pointer to an array of numWorkDim unsigned values that specify
the offset used to calculate the global ID of a work-item
- type: const size_t*
name: pGlobalWorkSize
desc: |
[in] pointer to an array of numWorkDim unsigned values that specify
the number of global work-items in workDim that will execute the
kernel function
- type: size_t*
name: pSuggestedLocalWorkSize
desc: |
[out] pointer to an array of numWorkDim unsigned values that specify
suggested local work size that will contain the result of the query
# Only this result code is listed explicitly. NOTE(review): the declaration in
# include/ur_api.h also documents null-handle and null-pointer errors for the
# parameters above; presumably the generator adds those - confirm.
returns:
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE
3 changes: 3 additions & 0 deletions scripts/core/registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,9 @@ etors:
- name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP
desc: Enumerator for $xEnqueueKernelLaunchCustomExp
value: '224'
- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE
desc: Enumerator for $xKernelGetSuggestedLocalWorkSize
value: '225'
---
type: enum
desc: Defines structure types
Expand Down
4 changes: 4 additions & 0 deletions source/adapters/cuda/enqueue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
uint32_t NumEventsInWaitList,
const ur_event_handle_t *EventWaitList);

// Computes a block size for launching Kernel on Device.
// NOTE(review): described from the call in urKernelGetSuggestedLocalWorkSize
// (kernel.cpp); confirm against the definition.
//  - ThreadsPerBlock: the caller passes a 3-element array and afterwards reads
//    its first WorkDim entries as the suggested local work size.
//  - GlobalWorkSize: the caller forwards the global work size array it was
//    given.
//  - The caller activates a context for Device before calling, noting that
//    this function needs an active context.
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize, const uint32_t WorkDim,
ur_kernel_handle_t Kernel);

bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
ur_kernel_handle_t Kernel,
size_t BlockSize);
Expand Down
29 changes: 29 additions & 0 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
//===----------------------------------------------------------------------===//

#include "kernel.hpp"
#include "enqueue.hpp"
#include "memory.hpp"
#include "queue.hpp"
#include "sampler.hpp"

UR_APIEXPORT ur_result_t UR_APICALL
Expand Down Expand Up @@ -380,3 +382,30 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex,
}
return Result;
}

/// Suggests a local work size for launching \p hKernel on \p hQueue with the
/// given global work size.
///
/// \param hKernel kernel to query; must share its context with \p hQueue.
/// \param hQueue queue whose device the suggestion is computed for.
/// \param workDim number of dimensions; must be 1, 2 or 3.
/// \param pGlobalWorkOffset not used by this adapter.
/// \param pGlobalWorkSize global work size array forwarded to
///        guessLocalWorkSize; must not be null.
/// \param[out] pSuggestedLocalWorkSize receives \p workDim entries; must not
///        be null.
/// \return UR_RESULT_SUCCESS on success,
///         UR_RESULT_ERROR_INVALID_KERNEL if the queue and kernel contexts
///         differ,
///         UR_RESULT_ERROR_INVALID_WORK_DIMENSION if \p workDim is not in
///         [1, 3],
///         UR_RESULT_ERROR_INVALID_NULL_POINTER if \p pGlobalWorkSize or
///         \p pSuggestedLocalWorkSize is null.
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
    [[maybe_unused]] const size_t *pGlobalWorkOffset,
    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
  // Preconditions
  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
            UR_RESULT_ERROR_INVALID_KERNEL);
  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
  // The global size array is handed to guessLocalWorkSize below, so a null
  // pointer has to be rejected here just like the output pointer.
  UR_ASSERT(pGlobalWorkSize != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
  UR_ASSERT(pSuggestedLocalWorkSize != nullptr,
            UR_RESULT_ERROR_INVALID_NULL_POINTER);

  ur_device_handle_t Device = hQueue->Device;
  // Always three entries; only the first workDim are copied out below.
  size_t ThreadsPerBlock[3] = {};

  // Set the active context here as guessLocalWorkSize needs an active context
  ScopedContext Active(Device);

  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
                     hKernel);

  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
            pSuggestedLocalWorkSize);
  return UR_RESULT_SUCCESS;
}
1 change: 1 addition & 0 deletions source/adapters/cuda/ur_interface_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
pDdiTable->pfnSetArgValue = urKernelSetArgValue;
pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
pDdiTable->pfnSetSpecializationConstants = nullptr;
pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
return UR_RESULT_SUCCESS;
}

Expand Down
4 changes: 4 additions & 0 deletions source/adapters/hip/enqueue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,7 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
const hipMemoryType DstType, ur_rect_offset_t DstOffset,
size_t DstRowPitch, size_t DstSlicePitch,
hipMemcpy3DParms &Params);

// Computes a block size for a launch on Device.
// NOTE(review): described from the call in urKernelGetSuggestedLocalWorkSize
// (kernel.cpp); confirm against the definition.
//  - ThreadsPerBlock: the caller passes a 3-element array pre-set to
//    {32, 1, 1} and afterwards reads its first WorkDim entries as the
//    suggested local work size.
//  - GlobalWorkSize: the caller forwards the global work size array it was
//    given.
//  - MaxThreadsPerBlock: the caller fills this from the device's
//    getMaxBlockDimX/Y/Z values.
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize, const uint32_t WorkDim,
const size_t MaxThreadsPerBlock[3]);
29 changes: 29 additions & 0 deletions source/adapters/hip/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//

#include "kernel.hpp"
#include "enqueue.hpp"
#include "memory.hpp"
#include "sampler.hpp"

Expand Down Expand Up @@ -349,3 +350,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
[[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Suggests a local work size for launching a kernel on \p hQueue with the
/// given global work size.
///
/// \param hKernel kernel to query; only used to check that it shares its
///        context with \p hQueue.
/// \param hQueue queue whose device the suggestion is computed for.
/// \param workDim number of dimensions; must be 1, 2 or 3.
/// \param pGlobalWorkOffset not used by this adapter.
/// \param pGlobalWorkSize global work size array forwarded to
///        guessLocalWorkSize; must not be null.
/// \param[out] pSuggestedLocalWorkSize receives \p workDim entries; must not
///        be null.
/// \return UR_RESULT_SUCCESS on success,
///         UR_RESULT_ERROR_INVALID_QUEUE if the queue and kernel contexts
///         differ,
///         UR_RESULT_ERROR_INVALID_WORK_DIMENSION if \p workDim is not in
///         [1, 3],
///         UR_RESULT_ERROR_INVALID_NULL_POINTER if \p pGlobalWorkSize or
///         \p pSuggestedLocalWorkSize is null.
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
    [[maybe_unused]] ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue,
    uint32_t workDim, [[maybe_unused]] const size_t *pGlobalWorkOffset,
    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
  // Preconditions
  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
            UR_RESULT_ERROR_INVALID_QUEUE);
  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
  // The global size array is handed to guessLocalWorkSize below, so a null
  // pointer has to be rejected here just like the output pointer.
  UR_ASSERT(pGlobalWorkSize != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
  UR_ASSERT(pSuggestedLocalWorkSize != nullptr,
            UR_RESULT_ERROR_INVALID_NULL_POINTER);

  // Look the device up once and use it for the limits, the active context and
  // the guess itself.
  ur_device_handle_t Device = hQueue->getDevice();

  size_t MaxThreadsPerBlock[3];
  MaxThreadsPerBlock[0] = Device->getMaxBlockDimX();
  MaxThreadsPerBlock[1] = Device->getMaxBlockDimY();
  MaxThreadsPerBlock[2] = Device->getMaxBlockDimZ();

  // Starting values handed to guessLocalWorkSize; only the first workDim
  // entries are copied out below.
  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};

  ScopedContext Active(Device);

  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
                     MaxThreadsPerBlock);
  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
            pSuggestedLocalWorkSize);
  return UR_RESULT_SUCCESS;
}
1 change: 1 addition & 0 deletions source/adapters/hip/ur_interface_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
pDdiTable->pfnSetArgValue = urKernelSetArgValue;
pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
return UR_RESULT_SUCCESS;
}

Expand Down
Loading

0 comments on commit 755a1e7

Please sign in to comment.