diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 9cf7561ba6..ea60fc8f06 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -99,6 +99,90 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType, Params.Depth = 1; } +// Helper function for enqueuing memory fills +static ur_result_t enqueueCommandBufferFillHelper( + ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice, + const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize, + size_t Size, uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_result_t Result; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); + + try { + size_t N = Size / PatternSize; + auto Value = *static_cast(Pattern); + auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE + ? *static_cast(DstDevice) + : (CUdeviceptr)DstDevice; + + if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { + // Create a new node + CUgraphNode GraphNode; + CUDA_MEMSET_NODE_PARAMS NodeParams = {}; + NodeParams.dst = DstPtr; + NodeParams.elementSize = PatternSize; + NodeParams.height = N; + NodeParams.pitch = PatternSize; + NodeParams.value = Value; + NodeParams.width = 1; + + Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), + DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *SyncPoint = + CommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + + } else { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. UR API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one cuGraphAddMemsetNode call is made for every 4 bytes + // in the pattern. + + size_t NumberOfSteps = PatternSize / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call cuMemset for each + // 4-byte chunk of the pattern. + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + // take 4 bytes of the pattern + auto Value = *(static_cast(Pattern) + Step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t)); + + // Create a new node + CUgraphNode GraphNode; + // Update NodeParam + CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; + NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; + NodeParamsStep.elementSize = 4; + NodeParamsStep.height = N; + NodeParamsStep.pitch = PatternSize; + NodeParamsStep.value = Value; + NodeParamsStep.width = 1; + + Result = UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), + DepsList.size(), &NodeParamsStep, + CommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *SyncPoint = CommandBuffer->AddSyncPoint( + std::make_shared(GraphNode)); + } + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, @@ -531,20 +615,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)pPattern; - (void)patternSize; - (void)offset; - (void)size; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto ArgsAreMultiplesOfPatternSize = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto PatternIsValid = (pPattern != nullptr); + + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && + (patternSize > 0); // is a positive power of two + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, + UR_RESULT_ERROR_INVALID_SIZE); + + auto DstDevice = std::get(hBuffer->Mem).get() + offset; + + return enqueueCommandBufferFillHelper( + hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize, + size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -553,19 +639,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)pPtr; - (void)pPattern; - (void)patternSize; - (void)size; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + auto PatternIsValid = (pPattern != nullptr); + + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && + (patternSize > 0); // is a positive power of two + + UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); + return enqueueCommandBufferFillHelper( + hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size, + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(