From 48a9ef11fc4c14a1119b8410d6f4618e7c696cf3 Mon Sep 17 00:00:00 2001 From: Michael Aziz Date: Thu, 11 Jan 2024 07:29:50 -0800 Subject: [PATCH] [UR] Add default implementation for cooperative kernel functions Cooperative kernels can synchronize using device-scope barriers. These kernels use `urKernelSuggestMaxCooperativeGroupCountExp` to ensure that all work groups can run concurrently. When the maximum number of work groups is 1, these kernels behave the same as regular kernels. This PR adds a default implementation for `urKernelSuggestMaxCooperativeGroupCountExp` that returns 1. Also, it adds a default implementation for `urEnqueueCooperativeKernelLaunchExp` that calls `urEnqueueKernelLaunch`. Signed-off-by: Michael Aziz --- source/adapters/cuda/enqueue.cpp | 10 ++++++++++ source/adapters/cuda/kernel.cpp | 7 +++++++ source/adapters/hip/enqueue.cpp | 10 ++++++++++ source/adapters/hip/kernel.cpp | 7 +++++++ source/adapters/level_zero/kernel.cpp | 17 +++++++++++++++++ source/adapters/opencl/enqueue.cpp | 10 ++++++++++ source/adapters/opencl/kernel.cpp | 7 +++++++ 7 files changed, 68 insertions(+) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index c752c3fd14..f2805eae14 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -490,6 +490,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return Result; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index eaaa3ef368..1b23fb2a01 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -169,6 +169,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, uint32_t *pGroupCountRet) { + (void)hKernel; + *pGroupCountRet = 1; + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const ur_kernel_arg_value_properties_t *pProperties, diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 68e3e665d2..6fff90fbe8 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -475,6 +475,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return Result; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Enqueues a wait on the given queue for all events. /// See \ref enqueueEventWait /// diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index e3eb37dc88..1aff571b0f 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -158,6 +158,13 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, uint32_t *pGroupCountRet) { + (void)hKernel; + *pGroupCountRet = 1; + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const ur_kernel_arg_value_properties_t *, const void *pArgValue) { diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index dfa8915197..25305e6553 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -253,6 +253,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. ur_program_handle_t Program, ///< [in] handle of the program containing the @@ -736,6 +746,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, uint32_t *pGroupCountRet) { + (void)hKernel; + *pGroupCountRet = 1; + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. ur_context_handle_t Context, ///< [in] handle of the context object diff --git a/source/adapters/opencl/enqueue.cpp b/source/adapters/opencl/enqueue.cpp index 6830a28eec..506796a07b 100644 --- a/source/adapters/opencl/enqueue.cpp +++ b/source/adapters/opencl/enqueue.cpp @@ -41,6 +41,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 44157b826b..721dae8c2d 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -376,6 +376,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, uint32_t *pGroupCountRet) { + (void)hKernel; + *pGroupCountRet = 1; + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t hNativeKernel, ur_context_handle_t, ur_program_handle_t, const ur_kernel_native_properties_t *pProperties,