diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp
index 0d239bc432..9fd3a06927 100644
--- a/source/adapters/hip/command_buffer.cpp
+++ b/source/adapters/hip/command_buffer.cpp
@@ -9,185 +9,1093 @@
 //===----------------------------------------------------------------------===//
 
 #include "command_buffer.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include "common.hpp"
+#include "enqueue.hpp"
+#include "event.hpp"
+#include "kernel.hpp"
+#include "memory.hpp"
+#include "queue.hpp"
+
+#include <cstring>
+
+namespace {
+ur_result_t
+commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) {
+  if (CommandBuffer->decrementInternalReferenceCount() != 0) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  delete CommandBuffer;
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t
+commandHandleReleaseInternal(ur_exp_command_buffer_command_handle_t Command) {
+  if (Command->decrementInternalReferenceCount() != 0) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Decrement parent command-buffer internal ref count
+  commandBufferReleaseInternal(Command->CommandBuffer);
+
+  delete Command;
+  return UR_RESULT_SUCCESS;
+}
+} // end anonymous namespace
+
+ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
+    ur_context_handle_t hContext, ur_device_handle_t hDevice, bool IsUpdatable)
+    : Context(hContext), Device(hDevice),
+      IsUpdatable(IsUpdatable), HIPGraph{nullptr}, HIPGraphExec{nullptr},
+      RefCountInternal{1}, RefCountExternal{1} {
+  urContextRetain(hContext);
+  urDeviceRetain(hDevice);
+}
+
+/// The ur_exp_command_buffer_handle_t_ destructor releases
+/// all the memory objects allocated for command-buffer management.
+ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
+  // Release the memory allocated to the Context stored in the command_buffer
+  UR_TRACE(urContextRelease(Context));
+
+  // Release the device
+  UR_TRACE(urDeviceRelease(Device));
+
+  // Release the memory allocated to the HIPGraph
+  UR_CHECK_ERROR(hipGraphDestroy(HIPGraph));
+
+  // Release the memory allocated to the HIPGraphExec
+  if (HIPGraphExec) {
+    UR_CHECK_ERROR(hipGraphExecDestroy(HIPGraphExec));
+  }
+}
+
+ur_exp_command_buffer_command_handle_t_::
+    ur_exp_command_buffer_command_handle_t_(
+        ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
+        std::shared_ptr<hipGraphNode_t> &&Node, hipKernelNodeParams Params,
+        uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr,
+        const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr)
+    : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(std::move(Node)),
+      Params(Params), WorkDim(WorkDim), RefCountInternal(1),
+      RefCountExternal(1) {
+  CommandBuffer->incrementInternalReferenceCount();
+
+  const size_t CopySize = sizeof(size_t) * WorkDim;
+  std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
+  std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+  // Local work size may be nullptr
+  if (LocalWorkSizePtr) {
+    std::memcpy(LocalWorkSize, LocalWorkSizePtr, CopySize);
+  } else {
+    std::memset(LocalWorkSize, 0, sizeof(size_t) * 3);
+  }
+
+  if (WorkDim < 3) {
+    const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+    std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize);
+    std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
+  }
+}
+
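A note on the ownership scheme used by the release helpers above: every command-buffer and command handle carries two counters. The external count mirrors the user-facing retain/release entry points; the internal count additionally keeps an object alive while other adapter objects still point at it, which is why `commandHandleReleaseInternal()` also drops one internal reference on its parent command-buffer. A minimal sketch of the pattern (simplified, not the adapter's actual types):

```cpp
#include <atomic>

// Sketch: the object is destroyed only when the internal count hits zero.
// Each user-visible retain bumps both counters, so a user release can never
// free an object that other adapter objects (e.g. command handles pointing
// at their parent command-buffer) still reference.
struct DualRefCounted {
  std::atomic_uint32_t RefCountExternal{1};
  std::atomic_uint32_t RefCountInternal{1};
};

void retain(DualRefCounted *Obj) {
  ++Obj->RefCountExternal;
  ++Obj->RefCountInternal;
}

void release(DualRefCounted *Obj) {
  --Obj->RefCountExternal;          // user-visible release
  if (--Obj->RefCountInternal == 0) // last internal owner gone
    delete Obj;
}
```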
+/// Helper function for finding the HIP nodes associated with the commands in a
+/// command-buffer; each node is pointed to by a sync-point in the wait list.
+///
+/// @param[in] CommandBuffer to lookup the nodes from.
+/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList.
+/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer
+/// to find the nodes for.
+/// @param[out] HIPNodesList Return parameter for the HIP nodes associated with
+/// each sync-point in \p SyncPointWaitList.
+///
+/// @return UR_RESULT_SUCCESS or an error code on failure
+static ur_result_t getNodesFromSyncPoints(
+    const ur_exp_command_buffer_handle_t &CommandBuffer,
+    size_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    std::vector<hipGraphNode_t> &HIPNodesList) {
+  // Map of ur_exp_command_buffer_sync_point_t to the HIP graph node
+  // associated with each sync-point
+  auto SyncPoints = CommandBuffer->SyncPoints;
+
+  // For each sync-point add associated HIP graph node to the return list.
+  for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
+    if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]);
+        NodeHandle != SyncPoints.end()) {
+      HIPNodesList.push_back(*NodeHandle->second.get());
+    } else {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+// Helper function for enqueuing memory fills
+static ur_result_t enqueueCommandBufferFillHelper(
+    ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
+    const hipMemoryType DstType, const void *Pattern, size_t PatternSize,
+    size_t Size, uint32_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+  std::vector<hipGraphNode_t> DepsList;
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
+                                   SyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    const size_t N = Size / PatternSize;
+    auto DstPtr = DstType == hipMemoryTypeDevice
+                      ? *static_cast<hipDeviceptr_t *>(DstDevice)
+                      : DstDevice;
+
+    if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
+      // Create a new node
+      hipGraphNode_t GraphNode;
+      hipMemsetParams NodeParams = {};
+      NodeParams.dst = DstPtr;
+      NodeParams.elementSize = PatternSize;
+      NodeParams.height = N;
+      NodeParams.pitch = PatternSize;
+      NodeParams.width = 1;
+
+      // Pattern size in bytes
+      switch (PatternSize) {
+      case 1: {
+        auto Value = *static_cast<const uint8_t *>(Pattern);
+        NodeParams.value = Value;
+        break;
+      }
+      case 2: {
+        auto Value = *static_cast<const uint16_t *>(Pattern);
+        NodeParams.value = Value;
+        break;
+      }
+      case 4: {
+        auto Value = *static_cast<const uint32_t *>(Pattern);
+        NodeParams.value = Value;
+        break;
+      }
+      }
+
+      UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph,
+                                           DepsList.data(), DepsList.size(),
+                                           &NodeParams));
+
+      // Get sync point and register the node with it.
+      *SyncPoint = CommandBuffer->addSyncPoint(
+          std::make_shared<hipGraphNode_t>(GraphNode));
+
+    } else {
+      // HIP has no memset functions that allow setting values more than 4
+      // bytes. The UR API lets you pass an arbitrary "pattern" to a buffer
+      // fill, which can be more than 4 bytes. We must break up the pattern
+      // into 1-byte values, and set the buffer using multiple strided calls.
+      // This means that one hipGraphAddMemsetNode call is made for every byte
+      // in the pattern.
+
+      size_t NumberOfSteps = PatternSize / sizeof(uint8_t);
+
+      // Shared pointer that will point to the last node created
+      std::shared_ptr<hipGraphNode_t> GraphNodePtr;
+
+      // Create a new node
+      hipGraphNode_t GraphNodeFirst;
+      // Update NodeParam
+      hipMemsetParams NodeParamsStepFirst = {};
+      NodeParamsStepFirst.dst = DstPtr;
+      NodeParamsStepFirst.elementSize = 4;
+      NodeParamsStepFirst.height = Size / sizeof(uint32_t);
+      NodeParamsStepFirst.pitch = 4;
+      NodeParamsStepFirst.value = *(static_cast<const uint32_t *>(Pattern));
+      NodeParamsStepFirst.width = 1;
+
+      UR_CHECK_ERROR(hipGraphAddMemsetNode(
+          &GraphNodeFirst, CommandBuffer->HIPGraph, DepsList.data(),
+          DepsList.size(), &NodeParamsStepFirst));
+
+      // Get sync point and register the node with it.
+      *SyncPoint = CommandBuffer->addSyncPoint(
+          std::make_shared<hipGraphNode_t>(GraphNodeFirst));
+
+      DepsList.clear();
+      DepsList.push_back(GraphNodeFirst);
+
+      // We walk up the pattern in 1-byte steps, and add a memset node for
+      // each 1-byte chunk of the pattern.
+      for (auto Step = 4u; Step < NumberOfSteps; ++Step) {
+        // Take 1 byte of the pattern
+        auto Value = *(static_cast<const uint8_t *>(Pattern) + Step);
-/// Stub implementations of UR experimental feature command-buffers
+
+        // Offset the pointer to the part of the buffer we want to write to
+        auto OffsetPtr = reinterpret_cast<void *>(
+            reinterpret_cast<uint8_t *>(DstPtr) + (Step * sizeof(uint8_t)));
+
+        // Create a new node
+        hipGraphNode_t GraphNode;
+        // Update NodeParam
+        hipMemsetParams NodeParamsStep = {};
+        NodeParamsStep.dst = reinterpret_cast<void *>(OffsetPtr);
+        NodeParamsStep.elementSize = sizeof(uint8_t);
+        NodeParamsStep.height = Size / NumberOfSteps;
+        NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t);
+        NodeParamsStep.value = Value;
+        NodeParamsStep.width = 1;
+
+        UR_CHECK_ERROR(hipGraphAddMemsetNode(
+            &GraphNode, CommandBuffer->HIPGraph, DepsList.data(),
+            DepsList.size(), &NodeParamsStep));
+
+        GraphNodePtr = std::make_shared<hipGraphNode_t>(GraphNode);
+        // Get sync point and register the node with it.
+        *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr);
+
+        DepsList.clear();
+        DepsList.push_back(*GraphNodePtr.get());
+      }
+    }
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
+}
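As a sanity check on the stride arithmetic above: the first node repeats the leading 4 pattern bytes contiguously across the whole buffer (valid because `PatternSize` is a power of two, so it is a multiple of 4), and each subsequent node overwrites one pattern byte per pattern period at stride `PatternSize`. A host-side model of what the chain of memset nodes computes (illustrative only; the adapter does this with `hipGraphAddMemsetNode`):

```cpp
#include <cstdint>
#include <cstring>

// Model of the >4-byte fill decomposition, assuming PatternSize is a power
// of two >= 8 and Size is a multiple of PatternSize (both validated by the
// fill entry points).
void stridedFillModel(uint8_t *Buffer, size_t Size, const uint8_t *Pattern,
                      size_t PatternSize) {
  // Pass 1: repeat the first 4 pattern bytes across the whole buffer.
  for (size_t i = 0; i < Size / 4; ++i)
    std::memcpy(Buffer + i * 4, Pattern, 4);

  // Passes 2..N: overwrite byte 'Step' of every pattern period, i.e. one
  // 1-byte memset at stride PatternSize per remaining pattern byte.
  for (size_t Step = 4; Step < PatternSize; ++Step)
    for (size_t i = 0; i < Size / PatternSize; ++i)
      Buffer[i * PatternSize + Step] = Pattern[Step];
}
```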
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
-    ur_context_handle_t, ur_device_handle_t,
-    const ur_exp_command_buffer_desc_t *, ur_exp_command_buffer_handle_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_context_handle_t hContext, ur_device_handle_t hDevice,
+    const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
+    ur_exp_command_buffer_handle_t *phCommandBuffer) {
+  const bool IsUpdatable =
+      pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false;
+
+  try {
+    *phCommandBuffer =
+        new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  try {
+    UR_CHECK_ERROR(hipGraphCreate(&(*phCommandBuffer)->HIPGraph, 0));
+  } catch (...) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
-urCommandBufferRetainExp(ur_exp_command_buffer_handle_t) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
+  hCommandBuffer->incrementInternalReferenceCount();
+  hCommandBuffer->incrementExternalReferenceCount();
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
-urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
+  if (hCommandBuffer->decrementExternalReferenceCount() == 0) {
+    // External ref count has reached zero, internal release of created
+    // commands.
+    for (auto Command : hCommandBuffer->CommandHandles) {
+      commandHandleReleaseInternal(Command);
+    }
+  }
+
+  return commandBufferReleaseInternal(hCommandBuffer);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
-urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
+  try {
+    const unsigned long long flags = 0;
+    UR_CHECK_ERROR(hipGraphInstantiateWithFlags(
+        &hCommandBuffer->HIPGraphExec, hCommandBuffer->HIPGraph, flags));
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return UR_RESULT_SUCCESS;
 }
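For context, the intended client flow through these entry points is record once, finalize once, replay many times. A rough sketch (error handling elided; the `ur_exp_command_buffer_desc_t` field set and the `UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC` name are taken from the UR extension headers as this patch reads them, so treat them as assumptions):

```cpp
#include <ur_api.h>

// Sketch: record a kernel into a command-buffer, finalize it (which
// instantiates the HIP graph exec above), then replay it cheaply on a queue.
void recordAndReplay(ur_context_handle_t hContext, ur_device_handle_t hDevice,
                     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel) {
  ur_exp_command_buffer_desc_t Desc{};
  Desc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
  Desc.isUpdatable = false;

  ur_exp_command_buffer_handle_t CmdBuf;
  urCommandBufferCreateExp(hContext, hDevice, &Desc, &CmdBuf);

  size_t Offset[3] = {0, 0, 0}, Global[3] = {1024, 1, 1};
  ur_exp_command_buffer_sync_point_t SP;
  urCommandBufferAppendKernelLaunchExp(CmdBuf, hKernel, 1, Offset, Global,
                                       nullptr, 0, nullptr, &SP, nullptr);

  urCommandBufferFinalizeExp(CmdBuf);

  for (int i = 0; i < 100; ++i) // replay without re-recording
    urCommandBufferEnqueueExp(CmdBuf, hQueue, 0, nullptr, nullptr);

  urCommandBufferReleaseExp(CmdBuf);
}
```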
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
-    ur_exp_command_buffer_handle_t, ur_kernel_handle_t, uint32_t,
-    const size_t *, const size_t *, const size_t *, uint32_t,
-    const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_command_handle_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel,
+    uint32_t workDim, const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint,
+    ur_exp_command_buffer_command_handle_t *phCommand) {
+  // Preconditions
+  UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
+
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  if (*pGlobalWorkSize == 0) {
+    try {
+      // Create an empty node if the kernel workload size is zero
+      UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                          DepsList.data(), DepsList.size()));
+
+      // Get sync point and register the node with it.
+      *pSyncPoint = hCommandBuffer->addSyncPoint(
+          std::make_shared<hipGraphNode_t>(GraphNode));
+    } catch (ur_result_t Err) {
+      return Err;
+    }
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless the user has provided a better number
+  size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  hipFunction_t HIPFunc = hKernel->get();
+  UR_CALL(setKernelParams(hCommandBuffer->Device, workDim, pGlobalWorkOffset,
+                          pGlobalWorkSize, pLocalWorkSize, hKernel, HIPFunc,
+                          ThreadsPerBlock, BlocksPerGrid),
+          Result);
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    // Set node param structure with the kernel related data
+    auto &ArgIndices = hKernel->getArgIndices();
+    hipKernelNodeParams NodeParams;
+    NodeParams.func = HIPFunc;
+    NodeParams.gridDim.x = BlocksPerGrid[0];
+    NodeParams.gridDim.y = BlocksPerGrid[1];
+    NodeParams.gridDim.z = BlocksPerGrid[2];
+    NodeParams.blockDim.x = ThreadsPerBlock[0];
+    NodeParams.blockDim.y = ThreadsPerBlock[1];
+    NodeParams.blockDim.z = ThreadsPerBlock[2];
+    NodeParams.sharedMemBytes = LocalSize;
+    NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
+    NodeParams.extra = nullptr;
+
+    // Create and add a new kernel node to the HIP graph
+    UR_CHECK_ERROR(hipGraphAddKernelNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                         DepsList.data(), DepsList.size(),
+                                         &NodeParams));
+
+    if (LocalSize != 0)
+      hKernel->clearLocalSize();
+
+    // Get sync point and register the node with it.
+    auto NodeSP = std::make_shared<hipGraphNode_t>(GraphNode);
+    if (pSyncPoint) {
+      *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP);
+    }
+
+    auto NewCommand = new ur_exp_command_buffer_command_handle_t_{
+        hCommandBuffer, hKernel,           std::move(NodeSP),
+        NodeParams,     workDim,           pGlobalWorkOffset,
+        pGlobalWorkSize, pLocalWorkSize};
+
+    NewCommand->incrementInternalReferenceCount();
+    hCommandBuffer->CommandHandles.push_back(NewCommand);
+
+    if (phCommand) {
+      *phCommand = NewCommand;
+    }
+
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
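A note on `NodeParams.kernelParams` above: HIP, like CUDA, expects an array of `void *`, one entry per kernel argument, each pointing at that argument's storage; `hKernel->getArgIndices()` is assumed to return exactly that layout. A standalone sketch of the same marshalling against the plain HIP graph API (hypothetical kernel taking `(int, float *)`):

```cpp
#include <hip/hip_runtime.h>

// Sketch: kernelParams marshalling for a kernel taking (int N, float *Data).
// Each element of Args points at one argument's storage; HIP reads the
// argument values through these pointers when the node executes.
hipError_t addKernelNodeSketch(hipGraph_t Graph, hipFunction_t Func, int N,
                               float *Data, hipGraphNode_t *NodeOut) {
  void *Args[] = {&N, &Data};

  hipKernelNodeParams Params{};
  Params.func = Func;
  Params.gridDim = dim3(static_cast<unsigned>((N + 255) / 256), 1, 1);
  Params.blockDim = dim3(256, 1, 1);
  Params.sharedMemBytes = 0;
  Params.kernelParams = Args; // array of pointers to the argument values
  Params.extra = nullptr;

  // No dependencies here; the adapter passes DepsList instead.
  return hipGraphAddKernelNode(NodeOut, Graph, nullptr, 0, &Params);
}
```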
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp(
-    ur_exp_command_buffer_handle_t, void *, const void *, size_t, uint32_t,
-    const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc,
+    size_t size, uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(
+        &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(),
+        pDst, pSrc, size, hipMemcpyHostToHost));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t, size_t,
-    size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem,
+    ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+  UR_ASSERT(size + dstOffset <= std::get<BufferMem>(hDstMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
+  UR_ASSERT(size + srcOffset <= std::get<BufferMem>(hSrcMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto Src = std::get<BufferMem>(hSrcMem->Mem)
+                   .getPtrWithOffset(hCommandBuffer->Device, srcOffset);
+    auto Dst = std::get<BufferMem>(hDstMem->Mem)
+                   .getPtrWithOffset(hCommandBuffer->Device, dstOffset);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(
+        &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(),
+        Dst, Src, size, hipMemcpyDeviceToDevice));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t,
-    ur_rect_offset_t, ur_rect_offset_t, ur_rect_region_t, size_t, size_t,
-    size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem,
+    ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin,
+    ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch,
+    size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto SrcPtr =
+        std::get<BufferMem>(hSrcMem->Mem).getPtr(hCommandBuffer->Device);
+    auto DstPtr =
+        std::get<BufferMem>(hDstMem->Mem).getPtr(hCommandBuffer->Device);
+    hipMemcpy3DParms NodeParams = {};
+
+    setCopyRectParams(region, SrcPtr, hipMemoryTypeDevice, srcOrigin,
+                      srcRowPitch, srcSlicePitch, DstPtr, hipMemoryTypeDevice,
+                      dstOrigin, dstRowPitch, dstSlicePitch, NodeParams);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                         DepsList.data(), DepsList.size(),
+                                         &NodeParams));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t,
-    const void *, uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
+    size_t offset, size_t size, const void *pSrc,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto Dst = std::get<BufferMem>(hBuffer->Mem)
+                   .getPtrWithOffset(hCommandBuffer->Device, offset);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(
+        &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(),
+        Dst, pSrc, size, hipMemcpyHostToDevice));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t, void *,
-    uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
+    size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto Src = std::get<BufferMem>(hBuffer->Mem)
+                   .getPtrWithOffset(hCommandBuffer->Device, offset);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(
+        &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(),
+        pDst, Src, size, hipMemcpyDeviceToHost));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t,
-    ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *,
-    uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
+    ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto DstPtr =
+        std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
+    hipMemcpy3DParms NodeParams = {};
+
+    setCopyRectParams(region, pSrc, hipMemoryTypeHost, hostOffset, hostRowPitch,
+                      hostSlicePitch, DstPtr, hipMemoryTypeDevice, bufferOffset,
+                      bufferRowPitch, bufferSlicePitch, NodeParams);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                         DepsList.data(), DepsList.size(),
+                                         &NodeParams));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t,
-    ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *,
-    uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
+    ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pDst,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    auto SrcPtr =
+        std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
+    hipMemcpy3DParms NodeParams = {};
+
+    setCopyRectParams(region, SrcPtr, hipMemoryTypeDevice, bufferOffset,
+                      bufferRowPitch, bufferSlicePitch, pDst, hipMemoryTypeHost,
+                      hostOffset, hostRowPitch, hostSlicePitch, NodeParams);
+
+    UR_CHECK_ERROR(hipGraphAddMemcpyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                         DepsList.data(), DepsList.size(),
+                                         &NodeParams));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
-    ur_exp_command_buffer_handle_t, const void *, size_t,
-    ur_usm_migration_flags_t, uint32_t,
-    const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
+    size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  // Prefetch cmd is not supported by HIP Graph.
+  // We implement it as an empty node to enforce dependencies.
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    // Add an empty node to preserve dependencies
+    UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                        DepsList.data(), DepsList.size()));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+
+    setErrorMessage("Prefetch hint ignored and replaced with empty node as "
+                    "prefetch is not supported by HIP Graph backend",
+                    UR_RESULT_SUCCESS);
+    return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
-    ur_exp_command_buffer_handle_t, const void *, size_t, ur_usm_advice_flags_t,
-    uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, const void * /* Mem */,
+    size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  // Mem-Advise cmd is not supported by HIP Graph.
+  // We implement it as an empty node to enforce dependencies.
+  hipGraphNode_t GraphNode;
+  std::vector<hipGraphNode_t> DepsList;
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  {
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                   pSyncPointWaitList, DepsList),
+            Result);
+
+    if (Result != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+  }
+
+  try {
+    // Add an empty node to preserve dependencies
+    UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
+                                        DepsList.data(), DepsList.size()));
+
+    // Get sync point and register the node with it.
+    *pSyncPoint = hCommandBuffer->addSyncPoint(
+        std::make_shared<hipGraphNode_t>(GraphNode));
+
+    setErrorMessage("Memory advice ignored and replaced with empty node as "
+                    "memory advice is not supported by HIP Graph backend",
+                    UR_RESULT_SUCCESS);
+    return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
-    ur_exp_command_buffer_handle_t, ur_mem_handle_t, const void *, size_t,
-    size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
+    const void *pPattern, size_t patternSize, size_t offset, size_t size,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+  auto ArgsAreMultiplesOfPatternSize =
+      (offset % patternSize == 0) && (size % patternSize == 0);
+
+  auto PatternIsValid = (pPattern != nullptr);
+
+  auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
+                            (patternSize > 0); // is a positive power of two
+  UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
+                PatternSizeIsValid,
+            UR_RESULT_ERROR_INVALID_SIZE);
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+
+  auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+                       .getPtrWithOffset(hCommandBuffer->Device, offset);
+
+  return enqueueCommandBufferFillHelper(
+      hCommandBuffer, &DstDevice, hipMemoryTypeDevice, pPattern, patternSize,
+      size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
 }
 
 UR_APIEXPORT ur_result_t
UR_APICALL urCommandBufferAppendUSMFillExp(
-    ur_exp_command_buffer_handle_t, void *, const void *, size_t, size_t,
-    uint32_t, const ur_exp_command_buffer_sync_point_t *,
-    ur_exp_command_buffer_sync_point_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, void *pPtr,
+    const void *pPattern, size_t patternSize, size_t size,
+    uint32_t numSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
+
+  auto PatternIsValid = (pPattern != nullptr);
+
+  auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
+                            (patternSize > 0); // is a positive power of two
+
+  UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
+            UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+  UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);
+  return enqueueCommandBufferFillHelper(
+      hCommandBuffer, pPtr, hipMemoryTypeUnified, pPattern, patternSize, size,
+      numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
-    ur_exp_command_buffer_handle_t, ur_queue_handle_t, uint32_t,
-    const ur_event_handle_t *, ur_event_handle_t *) {
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for HIP adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  ur_result_t Result = UR_RESULT_SUCCESS;
+
+  try {
+    std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+    ScopedContext Active(hQueue->getDevice());
+    uint32_t StreamToken;
+    ur_stream_quard Guard;
+    hipStream_t HIPStream = hQueue->getNextComputeStream(
+        numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
+
+    if ((Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                    phEventWaitList)) != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+
+    if (phEvent) {
+      RetImplEvent = std::unique_ptr<ur_event_handle_t_>(
+          ur_event_handle_t_::makeNative(UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
+                                         hQueue, HIPStream, StreamToken));
+      UR_CHECK_ERROR(RetImplEvent->start());
+    }
+
+    // Launch graph
+    UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream));
+
+    if (phEvent) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      *phEvent = RetImplEvent.release();
+    }
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+
+  return Result;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL
-urCommandBufferRetainCommandExp(ur_exp_command_buffer_command_handle_t) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp(
+    ur_exp_command_buffer_command_handle_t hCommand) {
+  hCommand->incrementExternalReferenceCount();
+  hCommand->incrementInternalReferenceCount();
+  return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL
-urCommandBufferReleaseCommandExp(ur_exp_command_buffer_command_handle_t) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
+    ur_exp_command_buffer_command_handle_t hCommand) {
+  hCommand->decrementExternalReferenceCount();
+  return commandHandleReleaseInternal(hCommand);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
-    ur_exp_command_buffer_command_handle_t,
-    const
 ur_exp_command_buffer_update_kernel_launch_desc_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_command_handle_t hCommand,
+    const ur_exp_command_buffer_update_kernel_launch_desc_t
+        *pUpdateKernelLaunch) {
+  // Update requires command-buffer to be finalized
+  ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer;
+  if (!CommandBuffer->HIPGraphExec) {
+    return UR_RESULT_ERROR_INVALID_OPERATION;
+  }
+
+  // Update requires command-buffer to be created with update enabled
+  if (!CommandBuffer->IsUpdatable) {
+    return UR_RESULT_ERROR_INVALID_OPERATION;
+  }
+
+  // Kernel corresponding to the command to update
+  ur_kernel_handle_t Kernel = hCommand->Kernel;
+  ur_device_handle_t Device = CommandBuffer->Device;
+
+  // Update pointer arguments to the kernel
+  uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs;
+  const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList =
+      pUpdateKernelLaunch->pNewPointerArgList;
+  for (uint32_t i = 0; i < NumPointerArgs; i++) {
+    const auto &PointerArgDesc = ArgPointerList[i];
+    uint32_t ArgIndex = PointerArgDesc.argIndex;
+    const void *ArgValue = PointerArgDesc.pNewPointerArg;
+
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    try {
+      Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue);
+    } catch (ur_result_t Err) {
+      Result = Err;
+      return Result;
+    }
+  }
+
+  // Update memobj arguments to the kernel
+  uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs;
+  const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList =
+      pUpdateKernelLaunch->pNewMemObjArgList;
+  for (uint32_t i = 0; i < NumMemobjArgs; i++) {
+    const auto &MemobjArgDesc = ArgMemobjList[i];
+    uint32_t ArgIndex = MemobjArgDesc.argIndex;
+    ur_mem_handle_t ArgValue = MemobjArgDesc.hNewMemObjArg;
+
+    ur_result_t Result = UR_RESULT_SUCCESS;
+    try {
+      if (ArgValue == nullptr) {
+        Kernel->setKernelArg(ArgIndex, 0, nullptr);
+      } else {
+        void *HIPPtr = std::get<BufferMem>(ArgValue->Mem).getVoid(Device);
+        Kernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr);
+      }
+    } catch (ur_result_t Err) {
+      Result = Err;
+      return Result;
+    }
+  }
+
+  // Update value arguments to the kernel
+  uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs;
+  const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList =
+      pUpdateKernelLaunch->pNewValueArgList;
+  for (uint32_t i = 0; i < NumValueArgs; i++) {
+    const auto &ValueArgDesc = ArgValueList[i];
+    uint32_t ArgIndex = ValueArgDesc.argIndex;
+    size_t ArgSize = ValueArgDesc.argSize;
+    const void *ArgValue = ValueArgDesc.pNewValueArg;
+
+    ur_result_t Result = UR_RESULT_SUCCESS;
+
+    try {
+      Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
+    } catch (ur_result_t Err) {
+      Result = Err;
+      return Result;
+    }
+  }
+
+  // Set the updated ND range
+  const uint32_t NewWorkDim = pUpdateKernelLaunch->newWorkDim;
+  if (NewWorkDim != 0) {
+    UR_ASSERT(NewWorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+    UR_ASSERT(NewWorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+    hCommand->WorkDim = NewWorkDim;
+  }
+
+  if (pUpdateKernelLaunch->pNewGlobalWorkOffset) {
+    hCommand->setGlobalOffset(pUpdateKernelLaunch->pNewGlobalWorkOffset);
+  }
+
+  if (pUpdateKernelLaunch->pNewGlobalWorkSize) {
+    hCommand->setGlobalSize(pUpdateKernelLaunch->pNewGlobalWorkSize);
+  }
+
+  if (pUpdateKernelLaunch->pNewLocalWorkSize) {
+    hCommand->setLocalSize(pUpdateKernelLaunch->pNewLocalWorkSize);
+  }
+
+  size_t *GlobalWorkOffset = hCommand->GlobalWorkOffset;
+  size_t *GlobalWorkSize = hCommand->GlobalWorkSize;
+
+  const bool ProvidedLocalSize = hCommand->LocalWorkSize[0] != 0 ||
+                                 hCommand->LocalWorkSize[1] != 0 ||
+                                 hCommand->LocalWorkSize[2] != 0;
+  // If no local work size is provided, pass nullptr to setKernelParams so
+  // it can guess the local work size.
+  size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr;
+  uint32_t WorkDim = hCommand->WorkDim;
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless the user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+  hipFunction_t HIPFunc = Kernel->get();
+  auto Result = setKernelParams(Device, WorkDim, GlobalWorkOffset,
+                                GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc,
+                                ThreadsPerBlock, BlocksPerGrid);
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  hipKernelNodeParams &Params = hCommand->Params;
+
+  Params.func = HIPFunc;
+  Params.gridDim.x = BlocksPerGrid[0];
+  Params.gridDim.y = BlocksPerGrid[1];
+  Params.gridDim.z = BlocksPerGrid[2];
+  Params.blockDim.x = ThreadsPerBlock[0];
+  Params.blockDim.y = ThreadsPerBlock[1];
+  Params.blockDim.z = ThreadsPerBlock[2];
+  Params.sharedMemBytes = Kernel->getLocalSize();
+  Params.kernelParams = const_cast<void **>(Kernel->getArgIndices().data());
+
+  hipGraphNode_t Node = *(hCommand->Node);
+  hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec;
+  UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params));
+  return UR_RESULT_SUCCESS;
 }
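From the application side, this update path takes a descriptor of new arguments and ND-range values for a previously recorded command. A rough usage sketch (only the descriptor fields this patch reads are set; other fields such as `stype`/`pNext` are left zero-initialized here, which is an assumption):

```cpp
#include <ur_api.h>

// Sketch: swap one USM pointer argument and the 1D global size of a recorded
// kernel command. The command-buffer must have been created with
// isUpdatable = true and already finalized, per the checks above.
void updateRecordedKernel(ur_exp_command_buffer_command_handle_t hCommand,
                          void *NewBuffer, size_t NewGlobalSize) {
  ur_exp_command_buffer_update_pointer_arg_desc_t PtrArg{};
  PtrArg.argIndex = 0;                // first kernel argument
  PtrArg.pNewPointerArg = &NewBuffer; // points AT the new USM pointer

  ur_exp_command_buffer_update_kernel_launch_desc_t Update{};
  Update.numNewPointerArgs = 1;
  Update.pNewPointerArgList = &PtrArg;
  Update.newWorkDim = 1;
  Update.pNewGlobalWorkSize = &NewGlobalSize;

  urCommandBufferUpdateKernelLaunchExp(hCommand, &Update);
}
```

Note that `pNewPointerArg` must point at the pointer value, not be the pointer itself, since the adapter copies `sizeof(void *)` bytes from it into the argument storage.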
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp(
-    ur_exp_command_buffer_handle_t, ur_exp_command_buffer_info_t, size_t,
-    void *, size_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_handle_t hCommandBuffer,
+    ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue,
+    size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  switch (propName) {
+  case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT:
+    return ReturnValue(hCommandBuffer->getExternalReferenceCount());
+  default:
+    assert(!"Command-buffer info request not implemented");
+  }
+
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp(
-    ur_exp_command_buffer_command_handle_t,
-    ur_exp_command_buffer_command_info_t, size_t, void *, size_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_command_buffer_command_handle_t hCommand,
+    ur_exp_command_buffer_command_info_t propName, size_t propSize,
+    void *pPropValue, size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  switch (propName) {
+  case UR_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT:
+    return ReturnValue(hCommand->getExternalReferenceCount());
+  default:
+    assert(!"Command-buffer command info request not implemented");
+  }
+
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
 }
diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp
index 96d0e8e34c..f1b3e32bfb 100644
--- a/source/adapters/hip/command_buffer.hpp
+++ b/source/adapters/hip/command_buffer.hpp
@@ -9,7 +9,312 @@
 //===----------------------------------------------------------------------===//
 
 #include <ur_api.h>
+#include <ur/ur.hpp>
 
-/// Stub implementation of command-buffers for HIP
+#include "context.hpp"
+#include <hip/hip_runtime.h>
+#include <memory>
 
-struct ur_exp_command_buffer_handle_t_ {};
+static inline const char *getUrResultString(ur_result_t Result) {
+  switch (Result) {
+  case UR_RESULT_SUCCESS:
+    return "UR_RESULT_SUCCESS";
+  case UR_RESULT_ERROR_INVALID_OPERATION:
+    return "UR_RESULT_ERROR_INVALID_OPERATION";
+  case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
+    return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
+  case UR_RESULT_ERROR_INVALID_QUEUE:
+    return "UR_RESULT_ERROR_INVALID_QUEUE";
+  case UR_RESULT_ERROR_INVALID_VALUE:
+    return "UR_RESULT_ERROR_INVALID_VALUE";
+  case UR_RESULT_ERROR_INVALID_CONTEXT:
+    return "UR_RESULT_ERROR_INVALID_CONTEXT";
+  case UR_RESULT_ERROR_INVALID_PLATFORM:
+    return "UR_RESULT_ERROR_INVALID_PLATFORM";
+  case UR_RESULT_ERROR_INVALID_BINARY:
+    return "UR_RESULT_ERROR_INVALID_BINARY";
+  case UR_RESULT_ERROR_INVALID_PROGRAM:
+    return "UR_RESULT_ERROR_INVALID_PROGRAM";
+  case UR_RESULT_ERROR_INVALID_SAMPLER:
+    return "UR_RESULT_ERROR_INVALID_SAMPLER";
+  case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
+    return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
+  case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
+    return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
+  case UR_RESULT_ERROR_INVALID_EVENT:
+    return "UR_RESULT_ERROR_INVALID_EVENT";
+  case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
+    return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
+  case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
+    return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
+  case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
+    return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
+  case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
+    return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
+  case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
+    return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
+  case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
+    return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
+  case UR_RESULT_ERROR_INVALID_DEVICE:
+    return "UR_RESULT_ERROR_INVALID_DEVICE";
+  case UR_RESULT_ERROR_DEVICE_LOST:
+    return "UR_RESULT_ERROR_DEVICE_LOST";
+  case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
+    return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
+  case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
+    return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
+  case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
+    return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
+  case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
+    return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
+  case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
+    return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
+  case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
+  case UR_RESULT_ERROR_INVALID_KERNEL:
+    return "UR_RESULT_ERROR_INVALID_KERNEL";
+  case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
+  case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
+    return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
+  case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+    return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+  case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
+    return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
+  case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
+    return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
+  case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
+    return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
+  case UR_RESULT_ERROR_UNINITIALIZED:
+    return "UR_RESULT_ERROR_UNINITIALIZED";
+  case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
+    return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
+  case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
+    return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
+  case UR_RESULT_ERROR_OUT_OF_RESOURCES:
+    return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
+  case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
+    return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
+  case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
+    return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
+  case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
+    return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
+  case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
+    return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
+  case UR_RESULT_ERROR_INVALID_ARGUMENT:
+    return "UR_RESULT_ERROR_INVALID_ARGUMENT";
+  case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
+    return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
+  case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
+    return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
+  case UR_RESULT_ERROR_INVALID_NULL_POINTER:
+    return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
+  case UR_RESULT_ERROR_INVALID_SIZE:
+    return "UR_RESULT_ERROR_INVALID_SIZE";
+  case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
+    return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
+  case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
+    return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
+  case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
+    return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
+  case UR_RESULT_ERROR_INVALID_ENUMERATION:
+    return "UR_RESULT_ERROR_INVALID_ENUMERATION";
+  case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
+    return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
+  case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
+    return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
+  case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
+    return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
+  case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
+    return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
+  case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
+    return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
+  case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
+  case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
+  case UR_RESULT_ERROR_PROGRAM_UNLINKED:
+    return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
+  case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
+    return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
+  case UR_RESULT_ERROR_INVALID_HOST_PTR:
+    return "UR_RESULT_ERROR_INVALID_HOST_PTR";
+  case UR_RESULT_ERROR_INVALID_USM_SIZE:
+    return "UR_RESULT_ERROR_INVALID_USM_SIZE";
+  case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
+    return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
+  case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
+    return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
+  default:
+    return "UR_RESULT_ERROR_UNKNOWN";
+  }
+}
+
+// Trace an internal UR call
+#define UR_TRACE(Call)                                                         \
+  {                                                                            \
+    ur_result_t Result;                                                        \
+    UR_CALL(Call, Result);                                                     \
+  }
+
+// Trace an internal UR call and return the result to the user.
+#define UR_CALL(Call, Result)                                                  \
+  {                                                                            \
+    if (PrintTrace)                                                            \
+      fprintf(stderr, "UR ---> %s\n", #Call);                                  \
+    Result = (Call);                                                           \
+    if (PrintTrace)                                                            \
+      fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result));   \
+  }
+
+// Handle to a kernel command.
+//
+// Struct that stores all the information related to a kernel command in a
+// command-buffer, such that the command can be recreated. When handles can
+// be returned from other command types this struct will need to be
+// refactored.
+struct ur_exp_command_buffer_command_handle_t_ {
+  ur_exp_command_buffer_command_handle_t_(
+      ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
+      std::shared_ptr<hipGraphNode_t> &&Node, hipKernelNodeParams Params,
+      uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr,
+      const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr);
+
+  void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) {
+    const size_t CopySize = sizeof(size_t) * WorkDim;
+    std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
+    if (WorkDim < 3) {
+      const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+      std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize);
+    }
+  }
+
+  void setGlobalSize(const size_t *GlobalWorkSizePtr) {
+    const size_t CopySize = sizeof(size_t) * WorkDim;
+    std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+    if (WorkDim < 3) {
+      const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+      std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
+    }
+  }
+
+  void setLocalSize(const size_t *LocalWorkSizePtr) {
+    const size_t CopySize = sizeof(size_t) * WorkDim;
+    std::memcpy(LocalWorkSize, LocalWorkSizePtr, CopySize);
+    if (WorkDim < 3) {
+      const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+      std::memset(LocalWorkSize + WorkDim, 0, ZeroSize);
+    }
+  }
+
+  uint32_t incrementInternalReferenceCount() noexcept {
+    return ++RefCountInternal;
+  }
+  uint32_t decrementInternalReferenceCount() noexcept {
+    return --RefCountInternal;
+  }
+
+  uint32_t incrementExternalReferenceCount() noexcept {
+    return ++RefCountExternal;
+  }
+  uint32_t decrementExternalReferenceCount() noexcept {
+    return --RefCountExternal;
+  }
+  uint32_t getExternalReferenceCount() const noexcept {
+    return RefCountExternal;
+  }
+
+  ur_exp_command_buffer_handle_t CommandBuffer;
+  ur_kernel_handle_t Kernel;
+  std::shared_ptr<hipGraphNode_t> Node;
+  hipKernelNodeParams Params;
+
+  uint32_t WorkDim;
+  size_t GlobalWorkOffset[3];
+  size_t GlobalWorkSize[3];
+  size_t LocalWorkSize[3];
+
+private:
+  std::atomic_uint32_t RefCountInternal;
+  std::atomic_uint32_t RefCountExternal;
+};
+
+struct ur_exp_command_buffer_handle_t_ {
+
+  ur_exp_command_buffer_handle_t_(ur_context_handle_t hContext,
+                                  ur_device_handle_t hDevice, bool IsUpdatable);
+
+  ~ur_exp_command_buffer_handle_t_();
+
+  void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
+                         std::shared_ptr<hipGraphNode_t> HIPNode) {
+    SyncPoints[SyncPoint] = HIPNode;
+    NextSyncPoint++;
+  }
+
+  ur_exp_command_buffer_sync_point_t getNextSyncPoint() const {
+    return NextSyncPoint;
+  }
+
+  // Helper to register next sync point
+  // @param HIPNode Node to register as next sync point
+  // @return The sync point identifier now associated with the node
+  ur_exp_command_buffer_sync_point_t
+  addSyncPoint(std::shared_ptr<hipGraphNode_t> HIPNode) {
+    ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint;
+    registerSyncPoint(SyncPoint, HIPNode);
+    return SyncPoint;
+  }
+  uint32_t incrementInternalReferenceCount() noexcept {
+    return ++RefCountInternal;
+  }
+  uint32_t decrementInternalReferenceCount() noexcept {
+    return --RefCountInternal;
+  }
+  uint32_t getInternalReferenceCount() const noexcept {
+    return RefCountInternal;
+  }
+
+  uint32_t incrementExternalReferenceCount() noexcept {
+    return ++RefCountExternal;
+  }
+  uint32_t decrementExternalReferenceCount() noexcept {
+    return --RefCountExternal;
+  }
+  uint32_t getExternalReferenceCount() const noexcept {
+    return RefCountExternal;
+  }
+
+  // UR context associated with this command-buffer
+  ur_context_handle_t Context;
+  // Device associated with this command buffer
+  ur_device_handle_t Device;
+  // Whether commands in the command-buffer can be updated
+  bool IsUpdatable;
+  // HIP Graph handle
+  hipGraph_t HIPGraph;
+  // HIP Graph Exec handle
+  hipGraphExec_t HIPGraphExec;
+  // Atomic variables counting the number of references to this command-buffer;
+  // using std::atomic prevents data races when incrementing/decrementing.
+  std::atomic_uint32_t RefCountInternal;
+  std::atomic_uint32_t RefCountExternal;
+
+  // Map of sync points to HIP graph nodes
+  std::unordered_map<ur_exp_command_buffer_sync_point_t,
+                     std::shared_ptr<hipGraphNode_t>>
+      SyncPoints;
+  // Next sync_point value (may need to consider ways to reuse values if
+  // 32-bits is not enough)
+  ur_exp_command_buffer_sync_point_t NextSyncPoint;
+
+  // Handles to individual commands in the command-buffer
+  std::vector<ur_exp_command_buffer_command_handle_t> CommandHandles;
+};
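To make the `SyncPoints` machinery concrete: each append call registers its new graph node under the next sync-point value, and later appends pass earlier sync-points in their wait lists, which `getNodesFromSyncPoints()` resolves back to `hipGraphNode_t` dependencies. A hedged usage sketch (assumes a valid command-buffer, kernel, and USM allocation; error handling elided):

```cpp
#include <cstdint>
#include <ur_api.h>

// Sketch: record "fill, then kernel" with an explicit dependency edge. The
// kernel's wait list carries the fill's sync-point, so the HIP graph gets a
// fill-node -> kernel-node edge when the kernel node is added.
void recordFillThenKernel(ur_exp_command_buffer_handle_t CmdBuf,
                          ur_kernel_handle_t hKernel, void *Ptr, size_t Size) {
  const uint32_t Pattern = 0xDEADBEEF; // hypothetical 4-byte fill pattern
  ur_exp_command_buffer_sync_point_t FillDone, KernelDone;

  urCommandBufferAppendUSMFillExp(CmdBuf, Ptr, &Pattern, sizeof(Pattern), Size,
                                  0, nullptr, &FillDone);

  size_t Offset[3] = {0, 0, 0};
  size_t Global[3] = {Size / sizeof(uint32_t), 1, 1};
  urCommandBufferAppendKernelLaunchExp(CmdBuf, hKernel, 1, Offset, Global,
                                       nullptr, /*numSyncPoints=*/1, &FillDone,
                                       &KernelDone, nullptr);
}
```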
+  uint32_t incrementInternalReferenceCount() noexcept {
+    return ++RefCountInternal;
+  }
+  uint32_t decrementInternalReferenceCount() noexcept {
+    return --RefCountInternal;
+  }
+  uint32_t getInternalReferenceCount() const noexcept {
+    return RefCountInternal;
+  }
+
+  uint32_t incrementExternalReferenceCount() noexcept {
+    return ++RefCountExternal;
+  }
+  uint32_t decrementExternalReferenceCount() noexcept {
+    return --RefCountExternal;
+  }
+  uint32_t getExternalReferenceCount() const noexcept {
+    return RefCountExternal;
+  }
+
+  // UR context associated with this command-buffer
+  ur_context_handle_t Context;
+  // Device associated with this command-buffer
+  ur_device_handle_t Device;
+  // Whether commands in the command-buffer can be updated
+  bool IsUpdatable;
+  // HIP Graph handle
+  hipGraph_t HIPGraph;
+  // HIP Graph Exec handle
+  hipGraphExec_t HIPGraphExec;
+  // Atomic variables counting the number of references to this
+  // command-buffer; using std::atomic prevents data races when
+  // incrementing/decrementing.
+  std::atomic_uint32_t RefCountInternal;
+  std::atomic_uint32_t RefCountExternal;
+
+  // Map of sync points to the HIP graph node of the associated command
+  std::unordered_map<ur_exp_command_buffer_sync_point_t,
+                     std::shared_ptr<hipGraphNode_t>>
+      SyncPoints;
+  // Next sync point value (may need to consider ways to reuse values if
+  // 32 bits is not enough)
+  ur_exp_command_buffer_sync_point_t NextSyncPoint;
+
+  // Handles to individual commands in the command-buffer
+  std::vector<ur_exp_command_buffer_command_handle_t> CommandHandles;
+};
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index bc67fcee71..7c9142f3c7 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -539,6 +539,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     // native asserts are in progress
     std::string SupportedExtensions = "";
     SupportedExtensions += "pi_ext_intel_devicelib_assert ";
+    // Report support for the UR command-buffer experimental feature
+    SupportedExtensions += "ur_exp_command_buffer ";
     SupportedExtensions += " ";
 
     hipDeviceProp_t Props;
@@ -843,7 +845,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
 
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
   case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
-    return ReturnValue(false);
+    return ReturnValue(true);
 
   default:
     break;
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index c4b1b86045..24ba905688 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "enqueue.hpp"
 #include "common.hpp"
 #include "context.hpp"
 #include "event.hpp"
@@ -17,8 +18,6 @@
 
 extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
 
-namespace {
-
 ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
                               uint32_t NumEventsInWaitList,
                               const ur_event_handle_t *EventWaitList) {
@@ -68,6 +67,8 @@ void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
   }
 }
 
+namespace {
+
 ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
                             ur_usm_advice_flags_t URAdviceFlags,
                             hipDevice_t Device) {
@@ -310,68 +311,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t MaxWorkGroupSize = 0u;
-  size_t MaxThreadsPerBlock[3] = {};
-  bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr);
-
-  {
-    MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX();
-    MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY();
-    MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ();
-
-    MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize();
-
-    // The MaxWorkGroupSize = 1024 for AMD GPU
-    // The MaxThreadsPerBlock = {1024, 1024, 1024}
-
-    if (ProvidedLocalWorkGroupSize) {
-      auto isValid = [&](int dim) {
-        UR_ASSERT(pLocalWorkSize[dim] <= MaxThreadsPerBlock[dim],
-                  UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
-        // Checks that local work sizes are a divisor of the global work sizes
-        // which includes that the local work sizes are neither larger than the
-        // global work sizes and not 0.
-        UR_ASSERT(pLocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
-        UR_ASSERT((pGlobalWorkSize[dim] % pLocalWorkSize[dim]) == 0,
-                  UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
-        ThreadsPerBlock[dim] = pLocalWorkSize[dim];
-        return UR_RESULT_SUCCESS;
-      };
-
-      for (size_t dim = 0; dim < workDim; dim++) {
-        auto err = isValid(dim);
-        if (err != UR_RESULT_SUCCESS)
-          return err;
-      }
-    } else {
-      simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize,
-                               MaxThreadsPerBlock, hKernel);
-    }
-  }
-
-  UR_ASSERT(MaxWorkGroupSize >= size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] *
-                                       ThreadsPerBlock[2]),
-            UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
-
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-  for (size_t i = 0; i < workDim; i++) {
-    BlocksPerGrid[i] =
-        (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i];
-  }
-
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
   try {
     ur_device_handle_t Dev = hQueue->getDevice();
+
+    hipFunction_t HIPFunc = hKernel->get();
+    UR_CHECK_ERROR(setKernelParams(Dev, workDim, pGlobalWorkOffset,
+                                   pGlobalWorkSize, pLocalWorkSize, hKernel,
+                                   HIPFunc, ThreadsPerBlock, BlocksPerGrid));
+
     ScopedContext Active(Dev);
 
     uint32_t StreamToken;
     ur_stream_quard Guard;
     hipStream_t HIPStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
-    hipFunction_t HIPFunc = hKernel->get();
 
     if (DepEvents.size()) {
       UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(),
@@ -385,22 +343,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       }
     }
 
-    // Set the implicit global offset parameter if kernel has offset variant
-    if (hKernel->getWithOffsetParameter()) {
-      std::uint32_t hip_implicit_offset[3] = {0, 0, 0};
-      if (pGlobalWorkOffset) {
-        for (size_t i = 0; i < workDim; i++) {
-          hip_implicit_offset[i] =
-              static_cast<std::uint32_t>(pGlobalWorkOffset[i]);
-          if (pGlobalWorkOffset[i] != 0) {
-            HIPFunc = hKernel->getWithOffsetParameter();
-          }
-        }
-      }
-      hKernel->setImplicitOffsetArg(sizeof(hip_implicit_offset),
-                                    hip_implicit_offset);
-    }
-
     auto ArgIndices = hKernel->getArgIndices();
 
     if (phEvent) {
@@ -424,30 +366,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       MemMigrationLocks.clear();
     }
 
-    // Set local mem max size if env var is present
-    static const char *LocalMemSzPtrUR =
-        std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
-    static const char *LocalMemSzPtrPI =
-        std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
-    static const char *LocalMemSzPtr =
-        LocalMemSzPtrUR ? LocalMemSzPtrUR
-                        : (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr);
-
-    if (LocalMemSzPtr) {
-      int DeviceMaxLocalMem = Dev->getDeviceMaxLocalMem();
-      static const int EnvVal = std::atoi(LocalMemSzPtr);
-      if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) {
-        setErrorMessage(LocalMemSzPtrUR ? "Invalid value specified for "
-                                          "UR_HIP_MAX_LOCAL_MEM_SIZE"
-                                        : "Invalid value specified for "
-                                          "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
-                        UR_RESULT_ERROR_ADAPTER_SPECIFIC);
-        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
-      }
-      UR_CHECK_ERROR(hipFuncSetAttribute(
-          HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, EnvVal));
-    }
-
     UR_CHECK_ERROR(hipModuleLaunchKernel(
         HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
@@ -1824,3 +1742,156 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
     uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+// Helper to compute kernel parameters from workload dimensions.
+// @param [in] Device handle to the target Device
+// @param [in] WorkDim workload dimension
+// @param [in] GlobalWorkOffset pointer to the workload global offsets
+// @param [in] GlobalWorkSize pointer to the workload global sizes
+// @param [in] LocalWorkSize pointer to the workload local sizes
+// @param [inout] Kernel handle to the kernel
+// @param [inout] HIPFunc handle to the HIP function attached to the kernel
+// @param [out] ThreadsPerBlock Number of threads per block we should run
+// @param [out] BlocksPerGrid Number of blocks per grid we should run
+ur_result_t
+setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
+                const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+                const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
+                hipFunction_t &HIPFunc, size_t (&ThreadsPerBlock)[3],
+                size_t (&BlocksPerGrid)[3]) {
+  size_t MaxWorkGroupSize = 0;
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  try {
+    ScopedContext Active(Device);
+    {
+      size_t MaxThreadsPerBlock[3] = {
+          static_cast<size_t>(Device->getMaxBlockDimX()),
+          static_cast<size_t>(Device->getMaxBlockDimY()),
+          static_cast<size_t>(Device->getMaxBlockDimZ())};
+
+      MaxWorkGroupSize = Device->getMaxWorkGroupSize();
+
+      if (LocalWorkSize != nullptr) {
+        auto isValid = [&](int dim) {
+          UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim],
+                    UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+          // Checks that the local work sizes are a divisor of the global work
+          // sizes, which implies that they are neither larger than the global
+          // work sizes nor 0.
+          UR_ASSERT(LocalWorkSize[dim] != 0,
+                    UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+          UR_ASSERT((GlobalWorkSize[dim] % LocalWorkSize[dim]) == 0,
+                    UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+          ThreadsPerBlock[dim] = LocalWorkSize[dim];
+          return UR_RESULT_SUCCESS;
+        };
+
+        for (size_t dim = 0; dim < WorkDim; dim++) {
+          auto err = isValid(dim);
+          if (err != UR_RESULT_SUCCESS)
+            return err;
+        }
+      } else {
+        simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
+                                 MaxThreadsPerBlock, Kernel);
+      }
+    }
+
+    UR_ASSERT(MaxWorkGroupSize >=
+                  size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] *
+                         ThreadsPerBlock[2]),
+              UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+
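+    // BlocksPerGrid is computed with a rounding-up division so the grid
+    // covers the whole global range. E.g. (illustrative values) with a
+    // guessed ThreadsPerBlock[0] = 32 and GlobalWorkSize[0] = 1000:
+    // BlocksPerGrid[0] = (1000 + 31) / 32 = 32 blocks, i.e. 32 * 32 =
+    // 1024 >= 1000 work-items are covered.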
"Invalid value specified for " + "UR_HIP_MAX_LOCAL_MEM_SIZE" + : "Invalid value specified for " + "SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(hipFuncSetAttribute( + HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, EnvVal)); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, + const hipMemoryType SrcType, ur_rect_offset_t SrcOffset, + size_t SrcRowPitch, size_t SrcSlicePitch, void *DstPtr, + const hipMemoryType DstType, ur_rect_offset_t DstOffset, + size_t DstRowPitch, size_t DstSlicePitch, + hipMemcpy3DParms &Params) { + // Set all params to 0 first + std::memset(&Params, 0, sizeof(hipMemcpy3DParms)); + + SrcRowPitch = (!SrcRowPitch) ? Region.width + SrcOffset.x : SrcRowPitch; + SrcSlicePitch = (!SrcSlicePitch) + ? ((Region.height + SrcOffset.y) * SrcRowPitch) + : SrcSlicePitch; + DstRowPitch = (!DstRowPitch) ? Region.width + DstOffset.x : DstRowPitch; + DstSlicePitch = (!DstSlicePitch) + ? ((Region.height + DstOffset.y) * DstRowPitch) + : DstSlicePitch; + + Params.extent.depth = Region.depth; + Params.extent.height = Region.height; + Params.extent.width = Region.width; + + Params.srcPtr.ptr = const_cast(SrcPtr); + Params.srcPtr.pitch = SrcRowPitch; + Params.srcPtr.xsize = SrcRowPitch; + Params.srcPtr.ysize = SrcSlicePitch / SrcRowPitch; + Params.srcPos.x = SrcOffset.x; + Params.srcPos.y = SrcOffset.y; + Params.srcPos.z = SrcOffset.z; + + Params.dstPtr.ptr = const_cast(DstPtr); + Params.dstPtr.pitch = DstRowPitch; + Params.dstPtr.xsize = DstRowPitch; + Params.dstPtr.ysize = DstSlicePitch / DstRowPitch; + Params.dstPos.x = DstOffset.x; + Params.dstPos.y = DstOffset.y; + Params.dstPos.z = DstOffset.z; + + Params.kind = (SrcType == hipMemoryTypeDevice + ? (DstType == hipMemoryTypeDevice ? hipMemcpyDeviceToDevice + : hipMemcpyDeviceToHost) + : (DstType == hipMemoryTypeDevice ? hipMemcpyHostToDevice + : hipMemcpyHostToHost)); +} diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp new file mode 100644 index 0000000000..c84b47d479 --- /dev/null +++ b/source/adapters/hip/enqueue.hpp @@ -0,0 +1,37 @@ +//===--------- enqueue.hpp - HIP Adapter ---------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
+
+  Params.extent.depth = Region.depth;
+  Params.extent.height = Region.height;
+  Params.extent.width = Region.width;
+
+  Params.srcPtr.ptr = const_cast<void *>(SrcPtr);
+  Params.srcPtr.pitch = SrcRowPitch;
+  Params.srcPtr.xsize = SrcRowPitch;
+  Params.srcPtr.ysize = SrcSlicePitch / SrcRowPitch;
+  Params.srcPos.x = SrcOffset.x;
+  Params.srcPos.y = SrcOffset.y;
+  Params.srcPos.z = SrcOffset.z;
+
+  Params.dstPtr.ptr = const_cast<void *>(DstPtr);
+  Params.dstPtr.pitch = DstRowPitch;
+  Params.dstPtr.xsize = DstRowPitch;
+  Params.dstPtr.ysize = DstSlicePitch / DstRowPitch;
+  Params.dstPos.x = DstOffset.x;
+  Params.dstPos.y = DstOffset.y;
+  Params.dstPos.z = DstOffset.z;
+
+  Params.kind = (SrcType == hipMemoryTypeDevice
+                     ? (DstType == hipMemoryTypeDevice ? hipMemcpyDeviceToDevice
+                                                       : hipMemcpyDeviceToHost)
+                     : (DstType == hipMemoryTypeDevice ? hipMemcpyHostToDevice
+                                                       : hipMemcpyHostToHost));
+}
diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp
new file mode 100644
index 0000000000..c84b47d479
--- /dev/null
+++ b/source/adapters/hip/enqueue.hpp
@@ -0,0 +1,37 @@
+//===--------- enqueue.hpp - HIP Adapter ---------------------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cassert>
+#include <hip/hip_runtime.h>
+#include <ur_api.h>
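+
+// Note: the declarations below are shared helpers; enqueue.cpp uses them for
+// immediate submission, and command_buffer.cpp reuses them when recording the
+// equivalent operations into a HIP graph, so both paths compute launch and
+// copy parameters the same way.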
+
+ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue,
+                              hipStream_t Stream, uint32_t NumEventsInWaitList,
+                              const ur_event_handle_t *EventWaitList);
+
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3],
+                        ur_kernel_handle_t Kernel, uint32_t LocalSize);
+
+ur_result_t
+setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
+                const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+                const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
+                hipFunction_t &HIPFunc, size_t (&ThreadsPerBlock)[3],
+                size_t (&BlocksPerGrid)[3]);
+
+void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
+                       const hipMemoryType SrcType, ur_rect_offset_t SrcOffset,
+                       size_t SrcRowPitch, size_t SrcSlicePitch, void *DstPtr,
+                       const hipMemoryType DstType, ur_rect_offset_t DstOffset,
+                       size_t DstRowPitch, size_t DstSlicePitch,
+                       hipMemcpy3DParms &Params);
diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
index ea5295dc6b..e7fac99800 100644
--- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
+++ b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
@@ -82,11 +82,12 @@ TEST_P(BufferFillCommandTest, UpdateParameters) {
     };
 
     // Set argument index 2 as new value to fill (index 1 is buffer accessor)
+    const uint32_t arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 4 : 2;
     uint32_t new_val = 33;
     ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(new_val),                                            // argSize
         nullptr,                                                    // pProperties
         &new_val,                                                   // hArgValue
@@ -217,10 +218,11 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) {
         &output_update_desc));
 
     uint32_t new_val = 33;
+    const uint32_t arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 4 : 2;
    ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(new_val),                                            // argSize
         nullptr,                                                    // pProperties
         &new_val,                                                   // hArgValue
@@ -280,11 +282,12 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) {
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 
+    const uint32_t arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 4 : 2;
     uint32_t first_val = 33;
     ur_exp_command_buffer_update_value_arg_desc_t first_input_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(first_val),                                          // argSize
         nullptr,                                                    // pProperties
         &first_val,                                                 // hArgValue
@@ -313,7 +316,7 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) {
     ur_exp_command_buffer_update_value_arg_desc_t second_input_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(second_val),                                         // argSize
         nullptr,                                                    // pProperties
         &second_val,                                                // hArgValue
@@ -356,11 +359,12 @@ TEST_P(BufferFillCommandTest, OverrideArgList) {
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 
     ur_exp_command_buffer_update_value_arg_desc_t input_descs[2];
+    const uint32_t arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 4 : 2;
     uint32_t first_val = 33;
     input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(first_val),                                          // argSize
         nullptr,                                                    // pProperties
         &first_val,                                                 // hArgValue
@@ -370,7 +374,7 @@ TEST_P(BufferFillCommandTest, OverrideArgList) {
     input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext
-        2,                                                          // argIndex
+        arg_index,                                                  // argIndex
         sizeof(second_val),                                         // argSize
         nullptr,                                                    // pProperties
         &second_val,                                                // hArgValue
diff --git a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp
index 879b3a9bc6..b29ad8c6c5 100644
--- a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp
+++ b/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp
@@ -29,29 +29,56 @@ struct BufferSaxpyKernelTest
                                          0, nullptr, nullptr));
         }
 
-        // Index 0 is output buffer
-        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffers[0]));
-        // Index 1 is output accessor
-        struct {
-            size_t offsets[1] = {0};
-        } accessor;
-        ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(accessor), nullptr,
-                                           &accessor));
-
-        // Index 2 is A
-        ASSERT_SUCCESS(urKernelSetArgValue(kernel, 2, sizeof(A), nullptr, &A));
-        // Index 3 is X buffer
-        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 3, nullptr, buffers[1]));
-
-        // Index 4 is X buffer accessor
-        ASSERT_SUCCESS(urKernelSetArgValue(kernel, 4, sizeof(accessor), nullptr,
-                                           &accessor));
-        // Index 5 is Y buffer
-        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 5, nullptr, buffers[2]));
-
-        // Index 6 is Y buffer accessor
-        ASSERT_SUCCESS(urKernelSetArgValue(kernel, 6, sizeof(accessor), nullptr,
-                                           &accessor));
+        // Argument index, incremented as arguments are added to the kernel
+        size_t current_arg_index = 0;
+        // Index 0 is the output buffer on both HIP and non-HIP
+        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index++,
+                                            nullptr, buffers[0]));
+
+        // Lambda to add accessor arguments depending on the backend.
+        // HIP has 3 offset parameters where other backends only have 1; the
+        // resulting argument layout is summarised after the lambda.
+        auto addAccessorArgs = [&]() {
+            if (backend == UR_PLATFORM_BACKEND_HIP) {
+                size_t val = 0;
+                ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++,
+                                                   sizeof(size_t), nullptr,
+                                                   &val));
+                ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++,
+                                                   sizeof(size_t), nullptr,
+                                                   &val));
+                ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++,
+                                                   sizeof(size_t), nullptr,
+                                                   &val));
+            } else {
+                struct {
+                    size_t offsets[1] = {0};
+                } accessor;
+                ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++,
+                                                   sizeof(accessor), nullptr,
+                                                   &accessor));
+            }
+        };
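+
+        // For reference, the argument layout this test sets up is:
+        //
+        //   argument         HIP index   non-HIP index
+        //   output buffer    0           0
+        //   output accessor  1-3         1
+        //   A                4           2
+        //   X buffer         5           3
+        //   X accessor       6-8         4
+        //   Y buffer         9           5
+        //   Y accessor       10-12       6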
+
+        // Indices 1-3 on HIP and index 1 on non-HIP are the output accessor
+        addAccessorArgs();
+
+        // Index 4 on HIP and 2 on non-HIP is A
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++,
+                                           sizeof(A), nullptr, &A));
+
+        // Index 5 on HIP and 3 on non-HIP is the X buffer
+        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index++,
+                                            nullptr, buffers[1]));
+
+        // Indices 6-8 on HIP and index 4 on non-HIP are the X buffer accessor
+        addAccessorArgs();
+
+        // Index 9 on HIP and 5 on non-HIP is the Y buffer
+        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index++,
+                                            nullptr, buffers[2]));
+
+        // Indices 10-12 on HIP and index 6 on non-HIP are the Y buffer
+        // accessor
+        addAccessorArgs();
 
         // Append kernel command to command-buffer and close command-buffer
         ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
@@ -120,30 +147,34 @@ TEST_P(BufferSaxpyKernelTest, UpdateParameters) {
     Validate(buffers[0], buffers[1], buffers[2], A, global_size);
 
     ur_exp_command_buffer_update_memobj_arg_desc_t new_input_descs[2];
-    // New X at index 3
+
+    // Index 5 on HIP and 3 on non-HIP is the X buffer
+    const uint32_t x_arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 5 : 3;
     new_input_descs[0] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC, // stype
         nullptr,                                                     // pNext
-        3,                                                           // argIndex
+        x_arg_index,                                                 // argIndex
         nullptr,                                                     // pProperties
         buffers[3],                                                  // hArgValue
     };
 
-    // New Y at index 5
+    // Index 9 on HIP and 5 on non-HIP is the Y buffer
+    const uint32_t y_arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 9 : 5;
     new_input_descs[1] = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC, // stype
         nullptr,                                                     // pNext
-        5,                                                           // argIndex
+        y_arg_index,                                                 // argIndex
         nullptr,                                                     // pProperties
        buffers[4],                                                  // hArgValue
     };
 
-    // A at index 2
+    // Index 4 on HIP and 2 on non-HIP is A
+    const uint32_t a_arg_index = (backend == UR_PLATFORM_BACKEND_HIP) ? 4 : 2;
     uint32_t new_A = 33;
     ur_exp_command_buffer_update_value_arg_desc_t new_A_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
         nullptr,                                                    // pNext,
-        2,                                                          // argIndex
+        a_arg_index,                                                // argIndex
         sizeof(new_A),                                              // argSize
         nullptr,                                                    // pProperties
         &new_A,                                                     // hArgValue
diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h
index 4e9bff35f9..c8a198224b 100644
--- a/test/conformance/exp_command_buffer/fixtures.h
+++ b/test/conformance/exp_command_buffer/fixtures.h
@@ -59,6 +59,9 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest {
     void SetUp() override {
         UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp());
 
+        ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                         sizeof(backend), &backend, nullptr));
+
         size_t returned_size;
         ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0,
                                        nullptr, &returned_size));
@@ -97,6 +100,7 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest {
 
     ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr;
     ur_bool_t updatable_command_buffer_support = false;
+    ur_platform_backend_t backend;
 };
 
 struct urUpdatableCommandBufferExpExecutionTest