From 557a3a17669712d5826fc77705b1c339f5b66a76 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 21 Aug 2023 17:56:48 +0100 Subject: [PATCH 01/10] [CMDBUF] Add fill commands to cmd buffer exp feature - Adds USM and Buffer fill append commands - Update feature spec for new commands - Align naming conventions for Append* commands with core equivalents - Also includes stubs for CUDA and HIP adapters --- scripts/core/EXP-COMMAND-BUFFER.rst | 1 + source/adapters/cuda/command_buffer.cpp | 43 +++++++++++ source/adapters/cuda/ur_interface_loader.cpp | 2 + source/adapters/hip/command_buffer.cpp | 18 +++++ source/adapters/hip/ur_interface_loader.cpp | 2 + source/adapters/level_zero/command_buffer.cpp | 77 +++++++++++++++++++ .../level_zero/ur_interface_loader.cpp | 2 + 7 files changed, 145 insertions(+) diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index a6a32a66a1..7b1f1d54b9 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -103,6 +103,7 @@ Currently only the following commands are supported: * ${x}CommandBufferAppendMemBufferFillExp * ${x}CommandBufferAppendUSMPrefetchExp * ${x}CommandBufferAppendUSMAdviseExp +>>>>>>> 118f696b ([CMDBUF] Add fill commands to cmd buffer exp feature) It is planned to eventually support any command type from the Core API which can actually be appended to the equiavalent adapter native constructs. diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index dd97f48d6a..1b1faa870a 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -596,6 +596,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return Result; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)pPattern; + (void)patternSize; + (void)offset; + (void)size; + + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)pPtr; + (void)pPattern; + (void)patternSize; + (void)size; + + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index af18d96017..f31ffe6d87 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -279,6 +279,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp; + pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp; pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp; pDdiTable->pfnAppendMemBufferCopyRectExp = urCommandBufferAppendMemBufferCopyRectExp; @@ -291,6 +292,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( urCommandBufferAppendMemBufferWriteRectExp; pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp; pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp; + pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp; pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; return retVal; diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index c7609b6110..54a6fa2f4e 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -137,6 +137,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t, ur_mem_handle_t, const void *, size_t, + size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t, void *, const void *, size_t, size_t, + uint32_t, const ur_exp_command_buffer_sync_point_t *, + ur_exp_command_buffer_sync_point_t *) { + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for HIP adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t, ur_queue_handle_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index f23d395d1a..7707e78425 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -276,6 +276,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp; + pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp; pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp; pDdiTable->pfnAppendMemBufferCopyRectExp = urCommandBufferAppendMemBufferCopyRectExp; @@ -289,6 +290,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp; pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp; pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; + pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp; return retVal; } diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index bb081f9b2d..4b811ab033 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -379,6 +379,48 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( return UR_RESULT_SUCCESS; } +// Helper function for enqueuing memory fills +static ur_result_t enqueueCommandBufferFillHelper( + ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, + void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + // Pattern size must be a power of two. + UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), + UR_RESULT_ERROR_INVALID_VALUE); + + // Pattern size must fit the compute queue capabilities. + UR_ASSERT( + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; + + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendMemoryFill() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, @@ -783,6 +825,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, + const void *Pattern, size_t PatternSize, size_t Offset, size_t Size, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + + std::scoped_lock Lock(Buffer->Mutex); + + char *ZeHandleDst = nullptr; + _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); + UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t CommandBuffer, void *Ptr, + const void *Pattern, size_t PatternSize, size_t Size, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + + return enqueueCommandBufferFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 5371fac082..74d0706b31 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -326,6 +326,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp; + pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp; pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp; pDdiTable->pfnAppendMemBufferCopyRectExp = urCommandBufferAppendMemBufferCopyRectExp; @@ -338,6 +339,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( urCommandBufferAppendMemBufferWriteRectExp; pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp; pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp; + pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp; pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; return retVal; From 3c4d445fee5e536d85bc7c66d08c57980d5946a4 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 31 Oct 2023 11:02:45 +0000 Subject: [PATCH 02/10] Add OpenCL fill stubs and fix naming --- source/adapters/opencl/command_buffer.cpp | 11 +++++++++++ source/adapters/opencl/ur_interface_loader.cpp | 2 ++ 2 files changed, 13 insertions(+) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 25d3311b79..0d9356644a 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -331,6 +331,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( + [[maybe_unused]] ur_exp_command_buffer_handle_t hCommandBuffer, + [[maybe_unused]] void *pPtr, [[maybe_unused]] const void *pPattern, + [[maybe_unused]] size_t PatternSize, [[maybe_unused]] size_t Size, + [[maybe_unused]] uint32_t NumSyncPointsInWaitList, + [[maybe_unused]] const ur_exp_command_buffer_sync_point_t + *pSyncPointWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index b9887b1b1a..ac2c33475b 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -286,6 +286,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp; + pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp; pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp; pDdiTable->pfnAppendMemBufferCopyRectExp = urCommandBufferAppendMemBufferCopyRectExp; @@ -298,6 +299,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( urCommandBufferAppendMemBufferWriteRectExp; pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp; pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp; + pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp; pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; return retVal; From 4813bd080aad4e4b87b16d57726a81e932e2c5bf Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 1 Dec 2023 16:47:51 +0000 Subject: [PATCH 03/10] fixes rebase issue --- source/adapters/opencl/command_buffer.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 0d9356644a..c0c2bfd915 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -136,17 +136,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( - [[maybe_unused]] ur_exp_command_buffer_handle_t hCommandBuffer, - [[maybe_unused]] void *pMemory, [[maybe_unused]] const void *pPattern, - [[maybe_unused]] size_t patternSize, [[maybe_unused]] size_t size, - [[maybe_unused]] uint32_t numSyncPointsInWaitList, - [[maybe_unused]] const ur_exp_command_buffer_sync_point_t - *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, From e4950aa9bcfee8d346a665d5bad7489683a37e3b Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 1 Dec 2023 16:55:38 +0000 Subject: [PATCH 04/10] mend --- source/adapters/opencl/command_buffer.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index c0c2bfd915..0d9356644a 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -136,6 +136,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( + [[maybe_unused]] ur_exp_command_buffer_handle_t hCommandBuffer, + [[maybe_unused]] void *pMemory, [[maybe_unused]] const void *pPattern, + [[maybe_unused]] size_t patternSize, [[maybe_unused]] size_t size, + [[maybe_unused]] uint32_t numSyncPointsInWaitList, + [[maybe_unused]] const ur_exp_command_buffer_sync_point_t + *pSyncPointWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, From 2882e1f1dbbde66f336cf5cd71c48c41c092ff0d Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 1 Dec 2023 17:33:19 +0000 Subject: [PATCH 05/10] fixes naming issue --- source/adapters/opencl/command_buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 0d9356644a..b880f94053 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -273,7 +273,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferFillExp( +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numSyncPointsInWaitList, From a9a325dec54d46e42f5de84049d89597c0a17c4d Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 4 Dec 2023 10:13:32 +0000 Subject: [PATCH 06/10] Adds CUDA support --- source/adapters/cuda/command_buffer.cpp | 137 +++++++++++++++++++----- 1 file changed, 110 insertions(+), 27 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 1b1faa870a..379afeb687 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -99,6 +99,90 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType, Params.Depth = 1; } +// Helper function for enqueuing memory fills +static ur_result_t enqueueCommandBufferFillHelper( + ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice, + const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize, + size_t Size, uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_result_t Result; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); + + try { + size_t N = Size / PatternSize; + auto Value = *static_cast(Pattern); + auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE + ? *static_cast(DstDevice) + : (CUdeviceptr)DstDevice; + + if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { + // Create a new node + CUgraphNode GraphNode; + CUDA_MEMSET_NODE_PARAMS NodeParams = {}; + NodeParams.dst = DstPtr; + NodeParams.elementSize = PatternSize; + NodeParams.height = N; + NodeParams.pitch = PatternSize; + NodeParams.value = Value; + NodeParams.width = 1; + + Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), + DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *SyncPoint = + CommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + + } else { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. UR API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one cuGraphAddMemsetNode call is made for every 4 bytes + // in the pattern. + + size_t NumberOfSteps = PatternSize / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call cuMemset for each + // 4-byte chunk of the pattern. + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + // take 4 bytes of the pattern + auto Value = *(static_cast(Pattern) + Step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t)); + + // Create a new node + CUgraphNode GraphNode; + // Update NodeParam + CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; + NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; + NodeParamsStep.elementSize = 4; + NodeParamsStep.height = N; + NodeParamsStep.pitch = PatternSize; + NodeParamsStep.value = Value; + NodeParamsStep.width = 1; + + Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), + DepsList.size(), &NodeParamsStep, + CommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *SyncPoint = CommandBuffer->AddSyncPoint( + std::make_shared(GraphNode)); + } + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, @@ -602,20 +686,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)pPattern; - (void)patternSize; - (void)offset; - (void)size; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto ArgsAreMultiplesOfPatternSize = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto PatternIsValid = (pPattern != nullptr); + + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && + (patternSize > 0); // is a positive power of two + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, + UR_RESULT_ERROR_INVALID_SIZE); + + auto DstDevice = std::get(hBuffer->Mem).get() + offset; + + return enqueueCommandBufferFillHelper( + hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize, + size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -624,19 +710,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)pPtr; - (void)pPattern; - (void)patternSize; - (void)size; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + auto PatternIsValid = (pPattern != nullptr); + + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && + (patternSize > 0); // is a positive power of two + + UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); + return enqueueCommandBufferFillHelper( + hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size, + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( From 67b9061f42e8acc072b8ddbcb3c2d0835818563c Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 4 Dec 2023 11:05:55 +0000 Subject: [PATCH 07/10] fixes cuda support merge issues --- source/adapters/cuda/command_buffer.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 379afeb687..6b62b32b70 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -106,10 +106,11 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - ur_result_t Result; + ur_result_t Result = UR_RESULT_SUCCESS; std::vector DepsList; UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList)); + SyncPointWaitList, DepsList), + Result); try { size_t N = Size / PatternSize; @@ -129,7 +130,7 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParams.value = Value; NodeParams.width = 1; - Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); @@ -167,7 +168,7 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStep.value = Value; NodeParamsStep.width = 1; - Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getContext())); From a8ea015c908f0de7f24268e63161cf82ebb98d14 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Wed, 6 Dec 2023 15:31:51 +0000 Subject: [PATCH 08/10] Removes code artefact --- scripts/core/EXP-COMMAND-BUFFER.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index 7b1f1d54b9..a6a32a66a1 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -103,7 +103,6 @@ Currently only the following commands are supported: * ${x}CommandBufferAppendMemBufferFillExp * ${x}CommandBufferAppendUSMPrefetchExp * ${x}CommandBufferAppendUSMAdviseExp ->>>>>>> 118f696b ([CMDBUF] Add fill commands to cmd buffer exp feature) It is planned to eventually support any command type from the Core API which can actually be appended to the equiavalent adapter native constructs. From 03c270d14c24c96302f5d7e021b1b2c613d496f2 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 8 Dec 2023 14:14:33 +0000 Subject: [PATCH 09/10] Add const variable --- source/adapters/cuda/command_buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 6b62b32b70..a65530a1f1 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -113,7 +113,7 @@ static ur_result_t enqueueCommandBufferFillHelper( Result); try { - size_t N = Size / PatternSize; + const size_t N = Size / PatternSize; auto Value = *static_cast(Pattern); auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstDevice) From 3ee71a71da6863c51bf731fb76c08a7c8afaa026 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 3 Jan 2024 17:10:56 +0000 Subject: [PATCH 10/10] fixup rebase --- source/adapters/opencl/command_buffer.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index b880f94053..74cdd8a03d 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -331,17 +331,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( - [[maybe_unused]] ur_exp_command_buffer_handle_t hCommandBuffer, - [[maybe_unused]] void *pPtr, [[maybe_unused]] const void *pPattern, - [[maybe_unused]] size_t PatternSize, [[maybe_unused]] size_t Size, - [[maybe_unused]] uint32_t NumSyncPointsInWaitList, - [[maybe_unused]] const ur_exp_command_buffer_sync_point_t - *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,