From 86f96f044313701ebab6eee2b45f0bb3149245be Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 13 Oct 2023 16:43:55 +0100 Subject: [PATCH] Refactor unions to using std::variant --- source/adapters/cuda/enqueue.cpp | 78 ++++----- source/adapters/cuda/kernel.cpp | 6 +- source/adapters/cuda/memory.cpp | 61 ++++--- source/adapters/cuda/memory.hpp | 287 +++++++++++++++---------------- source/adapters/hip/enqueue.cpp | 83 ++++----- source/adapters/hip/kernel.cpp | 11 +- source/adapters/hip/memory.cpp | 76 ++++---- source/adapters/hip/memory.hpp | 247 +++++++++++++------------- 8 files changed, 416 insertions(+), 433 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 7bead1c499..1022be1f09 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -514,7 +514,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -562,7 +562,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -606,9 +606,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(size + dstOffset <= hBufferDst->Mem.BufferMem.getSize(), + UR_ASSERT(size + dstOffset <= std::get(hBufferDst->Mem).getSize(), 
UR_RESULT_ERROR_INVALID_SIZE); - UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), + UR_ASSERT(size + srcOffset <= std::get(hBufferSrc->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; @@ -628,8 +628,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; - auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; + auto Src = std::get(hBufferSrc->Mem).get() + srcOffset; + auto Dst = std::get(hBufferDst->Mem).get() + dstOffset; UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); @@ -654,8 +654,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); - CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); + CUdeviceptr SrcPtr = std::get(hBufferSrc->Mem).get(); + CUdeviceptr DstPtr = std::get(hBufferDst->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -726,7 +726,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(size + offset <= hBuffer->Mem.BufferMem.getSize(), + UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; @@ -745,7 +745,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->start()); } - auto DstDevice = hBuffer->Mem.BufferMem.get() + offset; + auto DstDevice = std::get(hBuffer->Mem).get() + offset; auto N = size / patternSize; // pattern size in bytes @@ -892,7 +892,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( Result = enqueueEventsWait(hQueue, CuStream, 
numEventsInWaitList, phEventWaitList); - CUarray Array = hImage->Mem.SurfaceMem.getArray(); + CUarray Array = std::get(hImage->Mem).getArray(); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -902,7 +902,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); std::unique_ptr RetImplEvent{nullptr}; if (phEvent) { @@ -964,7 +964,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray Array = hImage->Mem.SurfaceMem.getArray(); + CUarray Array = std::get(hImage->Mem).getArray(); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -982,7 +982,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); @@ -1023,8 +1023,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == - hImageDst->Mem.SurfaceMem.getImageType(), + UR_ASSERT(std::get(hImageSrc->Mem).getImageType() == + std::get(hImageDst->Mem).getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); ur_result_t Result = UR_RESULT_SUCCESS; @@ -1035,8 +1035,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( Result = enqueueEventsWait(hQueue, CuStream, 
numEventsInWaitList, phEventWaitList); - CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); - CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); + CUarray SrcArray = std::get(hImageSrc->Mem).getArray(); + CUarray DstArray = std::get(hImageDst->Mem).getArray(); CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); @@ -1065,7 +1065,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); + ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getImageType(); if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, SrcByteOffsetX, BytesToCopy)); @@ -1108,22 +1108,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_event_handle_t *phEvent, void **ppRetMap) { UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), + UR_ASSERT(offset + size <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); + auto &BufferImpl = std::get(hBuffer->Mem); ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; const bool IsPinned = - hBuffer->Mem.BufferMem.MemAllocMode == - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr; // Currently no support for overlapping regions - if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + if (BufferImpl.getMapPtr() != nullptr) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } // Allocate a pointer in the host to store the mapped information - auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(size, offset, mapFlags); - *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags); + *ppRetMap = BufferImpl.getMapPtr(); if (HostPtr) { Result = UR_RESULT_SUCCESS; } @@ -1168,21 +1168,21 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, + UR_ASSERT(std::get(hMem->Mem).getMapPtr() != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, + UR_ASSERT(std::get(hMem->Mem).getMapPtr() == pMappedPtr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - const bool IsPinned = - hMem->Mem.BufferMem.MemAllocMode == - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + const bool IsPinned = std::get(hMem->Mem).MemAllocMode == + BufferMem::AllocMode::AllocHostPtr; - if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { + if (!IsPinned && + (std::get(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE)) { // Pinned host memory is only on host so it doesn't need to be written to. Result = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(), - hMem->Mem.BufferMem.getMapSize(), pMappedPtr, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hMem, true, std::get(hMem->Mem).getMapOffset(), + std::get(hMem->Mem).getMapSize(), pMappedPtr, + numEventsInWaitList, phEventWaitList, phEvent); } else { ScopedContext Active(hQueue->getContext()); @@ -1203,7 +1203,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( } } - hMem->Mem.BufferMem.unmap(pMappedPtr); + std::get(hMem->Mem).unmap(pMappedPtr); return Result; } @@ -1502,11 +1502,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = 
UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -1549,11 +1549,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; try { diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 425cf637e7..eaaa3ef368 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -304,7 +304,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); + &arrayDesc, std::get(hArgValue->Mem).getArray())); if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_HALF && @@ -314,10 +314,10 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); + CUsurfObject CuSurf = std::get(hArgValue->Mem).getSurface(); hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); } else { - CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); + CUdeviceptr CuPtr = std::get(hArgValue->Mem).get(); hKernel->setKernelArg(argIndex, 
sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 041e55d9d1..824ab1f580 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -44,22 +44,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( CUdeviceptr Ptr = 0; auto HostPtr = pProperties ? pProperties->pHost : nullptr; - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { UR_CHECK_ERROR( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; + AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + AllocMode = BufferMem::AllocMode::AllocHostPtr; } else { UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; + AllocMode = BufferMem::AllocMode::CopyIn; } } @@ -121,21 +120,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { ScopedContext Active(MemObjPtr->getContext()); if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { - switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: - UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); + auto &BufferImpl = std::get(MemObjPtr->Mem); + switch (BufferImpl.MemAllocMode) { + case BufferMem::AllocMode::CopyIn: + 
case BufferMem::AllocMode::Classic: + UR_CHECK_ERROR(cuMemFree(BufferImpl.Ptr)); break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: - UR_CHECK_ERROR(cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); + case BufferMem::AllocMode::UseHostPtr: + UR_CHECK_ERROR(cuMemHostUnregister(BufferImpl.HostPtr)); break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); + case BufferMem::AllocMode::AllocHostPtr: + UR_CHECK_ERROR(cuMemFreeHost(BufferImpl.HostPtr)); }; } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { - UR_CHECK_ERROR( - cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); - UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); + auto &SurfaceImpl = std::get(MemObjPtr->Mem); + UR_CHECK_ERROR(cuSurfObjectDestroy(SurfaceImpl.getSurface())); + UR_CHECK_ERROR(cuArrayDestroy(SurfaceImpl.getArray())); } } catch (ur_result_t Err) { @@ -163,8 +163,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { - *phNativeMem = - reinterpret_cast(hMem->Mem.BufferMem.get()); + *phNativeMem = reinterpret_cast( + std::get(hMem->Mem).get()); return UR_RESULT_SUCCESS; } @@ -183,8 +183,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, case UR_MEM_INFO_SIZE: { try { size_t AllocSize = 0; - UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, - hMemory->Mem.BufferMem.Ptr)); + UR_CHECK_ERROR(cuMemGetAddressRange( + nullptr, &AllocSize, std::get(hMemory->Mem).Ptr)); return ReturnValue(AllocSize); } catch (ur_result_t Err) { return Err; @@ -443,25 +443,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + auto &BufferImpl = std::get(hBuffer->Mem); + 
assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); - UR_ASSERT( - ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Retained indirectly due to retaining parent buffer below. ur_context_handle_t Context = hBuffer->Context; - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; - assert(hBuffer->Mem.BufferMem.Ptr != - ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); - ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = - hBuffer->Mem.BufferMem.Ptr + pRegion->origin; + assert(BufferImpl.Ptr != BufferMem::native_type{0}); + BufferMem::native_type Ptr = BufferImpl.Ptr + pRegion->origin; void *HostPtr = nullptr; - if (hBuffer->Mem.BufferMem.HostPtr) { - HostPtr = - static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; + if (BufferImpl.HostPtr) { + HostPtr = static_cast(BufferImpl.HostPtr) + pRegion->origin; } std::unique_ptr MemObj{nullptr}; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index e87acc859b..e60e415d39 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -12,9 +12,137 @@ #include #include #include +#include #include "common.hpp" +// Handler for plain, pointer-based CUDA allocations +struct BufferMem { + using native_type = CUdeviceptr; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem_handle_t Parent; + // CUDA handler for the pointer + native_type Ptr; + + /// Pointer associated with this device on the host + void *HostPtr; + /// Size of the allocation in bytes + size_t Size; + /// Size of the active mapped region. 
+ size_t MapSize; + /// Offset of the active mapped region. + size_t MapOffset; + /// Pointer to the active mapped region, if any + void *MapPtr; + /// Original flags for the mapped region + ur_map_flags_t MapFlags; + + /** AllocMode + * classic: Just a normal buffer allocated on the device via cuda malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr, + } MemAllocMode; + + BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr, + void *HostPtr, size_t Size) + : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0}, + MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE}, + MemAllocMode{Mode} {}; + + native_type get() const noexcept { return Ptr; } + + size_t getSize() const noexcept { return Size; } + + void *getMapPtr() const noexcept { return MapPtr; } + + size_t getMapSize() const noexcept { return MapSize; } + + size_t getMapOffset() const noexcept { return MapOffset; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the CUDA allocation. + void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapSize = Size; + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; + } else { + // TODO: Allocate only what is needed based on the offset + MapPtr = static_cast(malloc(this->getSize())); + } + return MapPtr; + } + + /// Detach the allocation from the host memory. 
+ void unmap(void *) noexcept { + assert(MapPtr != nullptr); + + if (MapPtr != HostPtr) { + free(MapPtr); + } + MapPtr = nullptr; + MapSize = 0; + MapOffset = 0; + } + + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; + } +}; + +// Handler data for surface object (i.e. Images) +struct SurfaceMem { + CUarray Array; + CUsurfObject SurfObj; + ur_mem_type_t ImageType; + + SurfaceMem(CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType, + void *HostPtr) + : Array{Array}, SurfObj{Surf}, ImageType{ImageType} { + (void)HostPtr; + } + + CUarray getArray() const noexcept { return Array; } + + CUsurfObject getSurface() const noexcept { return SurfObj; } + + ur_mem_type_t getImageType() const noexcept { return ImageType; } +}; + +// For sampled/unsampled images +struct ImageMem { + CUarray Array; + void *Handle; + ur_mem_type_t ImageType; + ur_sampler_handle_t Sampler; + + ImageMem(CUarray Array, void *Handle, ur_mem_type_t ImageType, + ur_sampler_handle_t Sampler) + : Array{Array}, Handle{Handle}, ImageType{ImageType}, Sampler{Sampler} {}; + + CUarray get_array() const noexcept { return Array; } + + void *get_handle() const noexcept { return Handle; } + + ur_mem_type_t get_image_type() const noexcept { return ImageType; } + + ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } +}; + /// UR Mem mapping to CUDA memory allocations, both data and texture/surface. /// \brief Represents non-SVM allocations on the CUDA backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. @@ -35,136 +163,16 @@ struct ur_mem_handle_t_ { /// In CUDA their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces. /// This union allows implementation to use either from the same handler. 
- union MemImpl { - // Handler for plain, pointer-based CUDA allocations - struct BufferMem { - using native_type = CUdeviceptr; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - ur_mem_handle_t Parent; - // CUDA handler for the pointer - native_type Ptr; - - /// Pointer associated with this device on the host - void *HostPtr; - /// Size of the allocation in bytes - size_t Size; - /// Size of the active mapped region. - size_t MapSize; - /// Offset of the active mapped region. - size_t MapOffset; - /// Pointer to the active mapped region, if any - void *MapPtr; - /// Original flags for the mapped region - ur_map_flags_t MapFlags; - - /** AllocMode - * classic: Just a normal buffer allocated on the device via cuda malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class AllocMode { - Classic, - UseHostPtr, - CopyIn, - AllocHostPtr, - } MemAllocMode; - - native_type get() const noexcept { return Ptr; } - - size_t getSize() const noexcept { return Size; } - - void *getMapPtr() const noexcept { return MapPtr; } - - size_t getMapSize() const noexcept { return MapSize; } - - size_t getMapOffset() const noexcept { return MapOffset; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the CUDA allocation. 
- void *mapToPtr(size_t Size, size_t Offset, - ur_map_flags_t Flags) noexcept { - assert(MapPtr == nullptr); - MapSize = Size; - MapOffset = Offset; - MapFlags = Flags; - if (HostPtr) { - MapPtr = static_cast(HostPtr) + Offset; - } else { - // TODO: Allocate only what is needed based on the offset - MapPtr = static_cast(malloc(this->getSize())); - } - return MapPtr; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(MapPtr != nullptr); - - if (MapPtr != HostPtr) { - free(MapPtr); - } - MapPtr = nullptr; - MapSize = 0; - MapOffset = 0; - } - - ur_map_flags_t getMapFlags() const noexcept { - assert(MapPtr != nullptr); - return MapFlags; - } - } BufferMem; - - // Handler data for surface object (i.e. Images) - struct SurfaceMem { - CUarray Array; - CUsurfObject SurfObj; - ur_mem_type_t ImageType; - - CUarray getArray() const noexcept { return Array; } - - CUsurfObject getSurface() const noexcept { return SurfObj; } - - ur_mem_type_t getImageType() const noexcept { return ImageType; } - } SurfaceMem; - - struct ImageMem { - CUarray Array; - void *Handle; - ur_mem_type_t ImageType; - ur_sampler_handle_t Sampler; - - CUarray get_array() const noexcept { return Array; } - - void *get_handle() const noexcept { return Handle; } - - ur_mem_type_t get_image_type() const noexcept { return ImageType; } - - ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } - } ImageMem; - } Mem; + std::variant Mem; /// Constructs the UR mem handler for a non-typed allocation ("buffer") ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, - ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, + ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode, CUdeviceptr Ptr, void *HostPtr, size_t Size) - : Context{Context}, RefCount{1}, MemType{Type::Buffer}, MemFlags{ - MemFlags} { - Mem.BufferMem.Ptr = Ptr; - Mem.BufferMem.Parent = Parent; - Mem.BufferMem.HostPtr = HostPtr; - Mem.BufferMem.Size = Size; - 
Mem.BufferMem.MapSize = 0; - Mem.BufferMem.MapOffset = 0; - Mem.BufferMem.MapPtr = nullptr; - Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; - Mem.BufferMem.MemAllocMode = Mode; + : Context{Context}, RefCount{1}, MemType{Type::Buffer}, + MemFlags{MemFlags}, Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} { if (isSubBuffer()) { - urMemRetain(Mem.BufferMem.Parent); + urMemRetain(std::get(Mem).Parent); } else { urContextRetain(Context); } @@ -174,43 +182,30 @@ struct ur_mem_handle_t_ { ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUsurfObject Surf, ur_mem_flags_t MemFlags, ur_mem_type_t ImageType, void *HostPtr) - : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{ - MemFlags} { - (void)HostPtr; - - Mem.SurfaceMem.Array = Array; - Mem.SurfaceMem.SurfObj = Surf; - Mem.SurfaceMem.ImageType = ImageType; + : Context{Context}, RefCount{1}, MemType{Type::Surface}, + MemFlags{MemFlags}, Mem{SurfaceMem{Array, Surf, ImageType, HostPtr}} { urContextRetain(Context); } /// Constructs the UR allocation for an unsampled image object ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Surface} { - - Mem.ImageMem.Array = Array; - Mem.ImageMem.Handle = (void *)Surf; - Mem.ImageMem.ImageType = ImageType; - Mem.ImageMem.Sampler = nullptr; + : Context{Context}, RefCount{1}, MemType{Type::Surface}, + Mem{ImageMem{Array, (void *)Surf, ImageType, nullptr}} { urContextRetain(Context); } /// Constructs the UR allocation for a sampled image object ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex, ur_sampler_handle_t Sampler, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Texture} { - - Mem.ImageMem.Array = Array; - Mem.ImageMem.Handle = (void *)Tex; - Mem.ImageMem.ImageType = ImageType; - Mem.ImageMem.Sampler = Sampler; + : Context{Context}, RefCount{1}, MemType{Type::Texture}, + Mem{ImageMem{Array, (void 
*)Tex, ImageType, Sampler}} { urContextRetain(Context); } ~ur_mem_handle_t_() { if (isBuffer() && isSubBuffer()) { - urMemRelease(Mem.BufferMem.Parent); + urMemRelease(std::get(Mem).Parent); return; } urContextRelease(Context); @@ -219,7 +214,7 @@ struct ur_mem_handle_t_ { bool isBuffer() const noexcept { return MemType == Type::Buffer; } bool isSubBuffer() const noexcept { - return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); + return (isBuffer() && (std::get(Mem).Parent != nullptr)); } bool isImage() const noexcept { return MemType == Type::Surface; } diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index f7c378fcfc..95292ad00d 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -112,9 +112,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_CHECK_ERROR(RetImplEvent->start()); } - UR_CHECK_ERROR( - hipMemcpyHtoDAsync(hBuffer->Mem.BufferMem.getWithOffset(offset), - const_cast(pSrc), size, HIPStream)); + UR_CHECK_ERROR(hipMemcpyHtoDAsync( + std::get(hBuffer->Mem).getWithOffset(offset), + const_cast(pSrc), size, HIPStream)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -159,7 +159,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( } UR_CHECK_ERROR(hipMemcpyDtoHAsync( - pDst, hBuffer->Mem.BufferMem.getWithOffset(offset), size, HIPStream)); + pDst, std::get(hBuffer->Mem).getWithOffset(offset), size, + HIPStream)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -518,7 +519,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_SUCCESS; - void *DevPtr = hBuffer->Mem.BufferMem.getVoid(); + void *DevPtr = std::get(hBuffer->Mem).getVoid(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -566,7 +567,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { 
ur_result_t Result = UR_RESULT_SUCCESS; - void *DevPtr = hBuffer->Mem.BufferMem.getVoid(); + void *DevPtr = std::get(hBuffer->Mem).getVoid(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -610,9 +611,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), + UR_ASSERT(size + srcOffset <= std::get(hBufferSrc->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - UR_ASSERT(size + dstOffset <= hBufferDst->Mem.BufferMem.getSize(), + UR_ASSERT(size + dstOffset <= std::get(hBufferDst->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; @@ -634,8 +635,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - auto Src = hBufferSrc->Mem.BufferMem.getWithOffset(srcOffset); - auto Dst = hBufferDst->Mem.BufferMem.getWithOffset(dstOffset); + auto Src = std::get(hBufferSrc->Mem).getWithOffset(srcOffset); + auto Dst = std::get(hBufferDst->Mem).getWithOffset(dstOffset); UR_CHECK_ERROR(hipMemcpyDtoDAsync(Dst, Src, size, Stream)); @@ -660,8 +661,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - void *SrcPtr = hBufferSrc->Mem.BufferMem.getVoid(); - void *DstPtr = hBufferDst->Mem.BufferMem.getVoid(); + void *SrcPtr = std::get(hBufferSrc->Mem).getVoid(); + void *DstPtr = std::get(hBufferDst->Mem).getVoid(); std::unique_ptr RetImplEvent{nullptr}; try { @@ -733,7 +734,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(size + 
offset <= hBuffer->Mem.BufferMem.getSize(), + UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); @@ -770,7 +771,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->start()); } - auto DstDevice = hBuffer->Mem.BufferMem.getWithOffset(offset); + auto DstDevice = std::get(hBuffer->Mem).getWithOffset(offset); auto N = size / patternSize; // pattern size in bytes @@ -904,7 +905,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( phEventWaitList); } - hipArray *Array = hImage->Mem.SurfaceMem.getArray(); + hipArray *Array = std::get(hImage->Mem).getArray(); hipArray_Format Format; size_t NumChannels; @@ -915,7 +916,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; - auto ImgType = hImage->Mem.SurfaceMem.getImageType(); + auto ImgType = std::get(hImage->Mem).getImageType(); size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height}; size_t SrcOffset[3] = {ByteOffsetX, origin.y, origin.z}; @@ -972,7 +973,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( phEventWaitList); } - hipArray *Array = hImage->Mem.SurfaceMem.getArray(); + hipArray *Array = std::get(hImage->Mem).getArray(); hipArray_Format Format; size_t NumChannels; @@ -983,7 +984,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; - auto ImgType = hImage->Mem.SurfaceMem.getImageType(); + auto ImgType = std::get(hImage->Mem).getImageType(); size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height}; size_t DstOffset[3] = {ByteOffsetX, origin.y, origin.z}; @@ -1029,8 +1030,8 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueMemImageCopy( UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == - hImageDst->Mem.SurfaceMem.getImageType(), + UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getImageType() == + std::get<SurfaceMem>(hImageDst->Mem).getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); ur_result_t Result = UR_RESULT_SUCCESS; @@ -1043,12 +1044,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( phEventWaitList); } - hipArray *SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + hipArray *SrcArray = std::get<SurfaceMem>(hImageSrc->Mem).getArray(); hipArray_Format SrcFormat; size_t SrcNumChannels; getArrayDesc(SrcArray, SrcFormat, SrcNumChannels); - hipArray *DstArray = hImageDst->Mem.SurfaceMem.getArray(); + hipArray *DstArray = std::get<SurfaceMem>(hImageDst->Mem).getArray(); hipArray_Format DstFormat; size_t DstNumChannels; getArrayDesc(DstArray, DstFormat, DstNumChannels); @@ -1064,7 +1065,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * DstNumChannels; size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.depth; - auto ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); + auto ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getImageType(); size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.width}; size_t SrcOffset[3] = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; @@ -1111,22 +1112,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_event_handle_t *phEvent, void **ppRetMap) { UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), + auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem); + UR_ASSERT(offset + size <= BufferImpl.getSize(), UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; const bool IsPinned = - hBuffer->Mem.BufferMem.MemAllocMode ==
- ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr; // Currently no support for overlapping regions - if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + if (BufferImpl.getMapPtr() != nullptr) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } // Allocate a pointer in the host to store the mapped information - auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(size, offset, mapFlags); - *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags); + *ppRetMap = std::get<BufferMem>(hBuffer->Mem).getMapPtr(); if (HostPtr) { Result = UR_RESULT_SUCCESS; } @@ -1171,23 +1172,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, + UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, + UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - const bool IsPinned = - hMem->Mem.BufferMem.MemAllocMode == - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + const bool IsPinned = std::get<BufferMem>(hMem->Mem).MemAllocMode == + BufferMem::AllocMode::AllocHostPtr; - if (!IsPinned && ((hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE) || - (hMem->Mem.BufferMem.getMapFlags() & - UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { + if (!IsPinned && + ((std::get<BufferMem>(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) || + (std::get<BufferMem>(hMem->Mem).getMapFlags() & + UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { // Pinned host memory is only on host so it doesn't need to be written to.
Result = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(), - hMem->Mem.BufferMem.getMapSize(), pMappedPtr, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hMem, true, std::get<BufferMem>(hMem->Mem).getMapOffset(), + std::get<BufferMem>(hMem->Mem).getMapSize(), pMappedPtr, + numEventsInWaitList, phEventWaitList, phEvent); } else { ScopedContext Active(hQueue->getDevice()); @@ -1208,7 +1209,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( } } - hMem->Mem.BufferMem.unmap(pMappedPtr); + std::get<BufferMem>(hMem->Mem).unmap(pMappedPtr); return Result; } diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index b433d3a3b4..936589401e 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -272,7 +272,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( ur_result_t Result = UR_RESULT_SUCCESS; try { if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { - auto array = hArgValue->Mem.SurfaceMem.getArray(); + auto array = std::get<SurfaceMem>(hArgValue->Mem).getArray(); hipArray_Format Format; size_t NumChannels; getArrayDesc(array, Format, NumChannels); @@ -283,12 +283,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( "UR HIP kernels only support images with channel types int32, " "uint32, float, and half."); } - hipSurfaceObject_t hipSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hipSurfaceObject_t hipSurf = + std::get<SurfaceMem>(hArgValue->Mem).getSurface(); hKernel->setKernelArg(argIndex, sizeof(hipSurf), (void *)&hipSurf); - } else - - { - void *HIPPtr = hArgValue->Mem.BufferMem.getVoid(); + } else { + void *HIPPtr = std::get<BufferMem>(hArgValue->Mem).getVoid(); hKernel->setKernelArg(argIndex, sizeof(void *), (void *)&HIPPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 0decb6e089..41cb2b94d0 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -35,24 +35,24 @@ UR_APIEXPORT ur_result_t UR_APICALL
urMemRelease(ur_mem_handle_t hMem) { ScopedContext Active(uniqueMemObj->getContext()->getDevice()); if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { - switch (uniqueMemObj->Mem.BufferMem.MemAllocMode) { - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: - UR_CHECK_ERROR(hipFree((void *)uniqueMemObj->Mem.BufferMem.Ptr)); + auto &hBuffer = std::get<BufferMem>(uniqueMemObj->Mem); + switch (hBuffer.MemAllocMode) { + case BufferMem::AllocMode::CopyIn: + case BufferMem::AllocMode::Classic: + UR_CHECK_ERROR(hipFree((void *)hBuffer.Ptr)); break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: - UR_CHECK_ERROR(hipHostUnregister(uniqueMemObj->Mem.BufferMem.HostPtr)); + case BufferMem::AllocMode::UseHostPtr: + UR_CHECK_ERROR(hipHostUnregister(hBuffer.HostPtr)); break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: - UR_CHECK_ERROR(hipFreeHost(uniqueMemObj->Mem.BufferMem.HostPtr)); + case BufferMem::AllocMode::AllocHostPtr: + UR_CHECK_ERROR(hipFreeHost(hBuffer.HostPtr)); }; } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { - UR_CHECK_ERROR( - hipDestroySurfaceObject(uniqueMemObj->Mem.SurfaceMem.getSurface())); - auto Array = uniqueMemObj->Mem.SurfaceMem.getArray(); - UR_CHECK_ERROR(hipFreeArray(Array)); + auto &hImage = std::get<SurfaceMem>(uniqueMemObj->Mem); + UR_CHECK_ERROR(hipDestroySurfaceObject(hImage.getSurface())); + UR_CHECK_ERROR(hipFreeArray(hImage.getArray())); } } catch (ur_result_t Err) { @@ -103,30 +103,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ScopedContext Active(hContext->getDevice()); void *Ptr; auto pHost = pProperties ?
pProperties->pHost : nullptr; - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { UR_CHECK_ERROR(hipHostRegister(pHost, size, hipHostRegisterMapped)); UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; + AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { UR_CHECK_ERROR(hipHostMalloc(&pHost, size)); UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + AllocMode = BufferMem::AllocMode::AllocHostPtr; } else { UR_CHECK_ERROR(hipMalloc(&Ptr, size)); if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; + AllocMode = BufferMem::AllocMode::CopyIn; } } if (Result == UR_RESULT_SUCCESS) { ur_mem_handle_t parentBuffer = nullptr; - auto DevPtr = - reinterpret_cast<ur_mem_handle_t_::MemImpl::BufferMem::native_type>( - Ptr); + auto DevPtr = reinterpret_cast<BufferMem::native_type>(Ptr); auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{ hContext, parentBuffer, flags, AllocMode, DevPtr, pHost, size}); if (URMemObj != nullptr) { @@ -192,24 +189,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - UR_ASSERT( - ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem); + UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Retained indirectly due to retaining parent buffer below.
ur_context_handle_t Context = hBuffer->Context; - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; - UR_ASSERT(hBuffer->Mem.BufferMem.Ptr != - ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}, + UR_ASSERT(BufferImpl.Ptr != BufferMem::native_type{0}, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = - hBuffer->Mem.BufferMem.getWithOffset(pRegion->origin); + BufferMem::native_type Ptr = BufferImpl.getWithOffset(pRegion->origin); void *HostPtr = nullptr; - if (hBuffer->Mem.BufferMem.HostPtr) { - HostPtr = - static_cast<char *>(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; + if (BufferImpl.HostPtr) { + HostPtr = static_cast<char *>(BufferImpl.HostPtr) + pRegion->origin; } ReleaseGuard<ur_mem_handle_t> ReleaseGuard(hBuffer); @@ -251,8 +244,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, case UR_MEM_INFO_SIZE: { try { size_t AllocSize = 0; - UR_CHECK_ERROR(hipMemGetAddressRange(nullptr, &AllocSize, - hMemory->Mem.BufferMem.Ptr)); + UR_CHECK_ERROR(hipMemGetAddressRange( + nullptr, &AllocSize, std::get<BufferMem>(hMemory->Mem).Ptr)); return ReturnValue(AllocSize); } catch (ur_result_t Err) { return Err; @@ -278,24 +271,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { #if defined(__HIP_PLATFORM_NVIDIA__) - if (sizeof(ur_mem_handle_t_::MemImpl::BufferMem::native_type) > - sizeof(ur_native_handle_t)) { + if (sizeof(BufferMem::native_type) > sizeof(ur_native_handle_t)) { // Check that all the upper bits that cannot be represented by // ur_native_handle_t are empty. // NOTE: The following shift might trigger a warning, but the check in the // if above makes sure that this does not underflow.
- ur_mem_handle_t_::MemImpl::BufferMem::native_type UpperBits = - hMem->Mem.BufferMem.get() >> (sizeof(ur_native_handle_t) * CHAR_BIT); + BufferMem::native_type UpperBits = std::get<BufferMem>(hMem->Mem).get() >> + (sizeof(ur_native_handle_t) * CHAR_BIT); if (UpperBits) { // Return an error if any of the remaining bits is non-zero. return UR_RESULT_ERROR_INVALID_MEM_OBJECT; } } - *phNativeMem = - reinterpret_cast<ur_native_handle_t>(hMem->Mem.BufferMem.get()); + *phNativeMem = reinterpret_cast<ur_native_handle_t>( + std::get<BufferMem>(hMem->Mem).get()); #elif defined(__HIP_PLATFORM_AMD__) - *phNativeMem = - reinterpret_cast<ur_native_handle_t>(hMem->Mem.BufferMem.get()); + *phNativeMem = reinterpret_cast<ur_native_handle_t>( + std::get<BufferMem>(hMem->Mem).get()); #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 823450cb42..2732b22a6e 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -11,6 +11,119 @@ #include "common.hpp" #include <cassert> +#include <variant> + +// Handler for plain, pointer-based HIP allocations +struct BufferMem { + using native_type = hipDeviceptr_t; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem_handle_t Parent; + // HIP handler for the pointer + native_type Ptr; + + /// Pointer associated with this device on the host + void *HostPtr; + /// Size of the allocation in bytes + size_t Size; + /// Size of the active mapped region. + size_t MapSize; + /// Offset of the active mapped region.
+ size_t MapOffset; + /// Pointer to the active mapped region, if any + void *MapPtr; + /// Original flags for the mapped region + ur_map_flags_t MapFlags; + + /** AllocMode + * Classic: Just a normal buffer allocated on the device via hip malloc + * UseHostPtr: Use an address on the host for the device + * CopyIn: The data for the device comes from the host but the host + pointer is not available later for re-use + * AllocHostPtr: Uses pinned-memory allocation + */ + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr + } MemAllocMode; + + BufferMem(ur_mem_handle_t Parent, AllocMode Mode, hipDeviceptr_t Ptr, + void *HostPtr, size_t Size) + : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0}, + MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE}, + MemAllocMode{Mode} {}; + + native_type get() const noexcept { return Ptr; } + + native_type getWithOffset(size_t Offset) const noexcept { + return reinterpret_cast<native_type>(reinterpret_cast<uint8_t *>(Ptr) + + Offset); + } + + void *getVoid() const noexcept { return reinterpret_cast<void *>(Ptr); } + + size_t getSize() const noexcept { return Size; } + + void *getMapPtr() const noexcept { return MapPtr; } + + size_t getMapSize() const noexcept { return MapSize; } + + size_t getMapOffset() const noexcept { return MapOffset; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the HIP allocation. + /// + void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapSize = Size; + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast<char *>(HostPtr) + Offset; + } else { + // TODO: Allocate only what is needed based on the offset + MapPtr = static_cast<void *>(malloc(this->getSize())); + } + return MapPtr; + } + + /// Detach the allocation from the host memory.
+ void unmap(void *) noexcept { + assert(MapPtr != nullptr); + + if (MapPtr != HostPtr) { + free(MapPtr); + } + MapPtr = nullptr; + MapSize = 0; + MapOffset = 0; + } + + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; + } +}; + +// Handler data for surface object (i.e. Images) +struct SurfaceMem { + hipArray *Array; + hipSurfaceObject_t SurfObj; + ur_mem_type_t ImageType; + + SurfaceMem(hipArray *Array, hipSurfaceObject_t Surf, ur_mem_type_t ImageType) + : Array{Array}, SurfObj{Surf}, ImageType{ImageType} {}; + + hipArray *getArray() const noexcept { return Array; } + + hipSurfaceObject_t getSurface() const noexcept { return SurfObj; } + + ur_mem_type_t getImageType() const noexcept { return ImageType; } +}; /// UR Mem mapping to HIP memory allocations, both data and texture/surface. /// \brief Represents non-SVM allocations on the HIP backend. @@ -37,128 +150,16 @@ struct ur_mem_handle_t_ { /// In HIP their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces. /// This union allows implementation to use either from the same handler. - union MemImpl { - // Handler for plain, pointer-based HIP allocations - struct BufferMem { - using native_type = hipDeviceptr_t; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - ur_mem Parent; - // HIP handler for the pointer - native_type Ptr; - - /// Pointer associated with this device on the host - void *HostPtr; - /// Size of the allocation in bytes - size_t Size; - /// Size of the active mapped region. - size_t MapSize; - /// Offset of the active mapped region. 
- size_t MapOffset; - /// Pointer to the active mapped region, if any - void *MapPtr; - /// Original flags for the mapped region - ur_map_flags_t MapFlags; - - /** AllocMode - * Classic: Just a normal buffer allocated on the device via hip malloc - * UseHostPtr: Use an address on the host for the device - * CopyIn: The data for the device comes from the host but the host - pointer is not available later for re-use - * AllocHostPtr: Uses pinned-memory allocation - */ - enum class AllocMode { - Classic, - UseHostPtr, - CopyIn, - AllocHostPtr - } MemAllocMode; - - native_type get() const noexcept { return Ptr; } - - native_type getWithOffset(size_t Offset) const noexcept { - return reinterpret_cast<native_type>(reinterpret_cast<uint8_t *>(Ptr) + - Offset); - } - - void *getVoid() const noexcept { return reinterpret_cast<void *>(Ptr); } - - size_t getSize() const noexcept { return Size; } - - void *getMapPtr() const noexcept { return MapPtr; } - - size_t getMapSize() const noexcept { return MapSize; } - - size_t getMapOffset() const noexcept { return MapOffset; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the HIP allocation. - /// - void *mapToPtr(size_t Size, size_t Offset, - ur_map_flags_t Flags) noexcept { - assert(MapPtr == nullptr); - MapSize = Size; - MapOffset = Offset; - MapFlags = Flags; - if (HostPtr) { - MapPtr = static_cast<char *>(HostPtr) + Offset; - } else { - // TODO: Allocate only what is needed based on the offset - MapPtr = static_cast<void *>(malloc(this->getSize())); - } - return MapPtr; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(MapPtr != nullptr); - - if (MapPtr != HostPtr) { - free(MapPtr); - } - MapPtr = nullptr; - MapSize = 0; - MapOffset = 0; - } - - ur_map_flags_t getMapFlags() const noexcept { - assert(MapPtr != nullptr); - return MapFlags; - } - } BufferMem; - - // Handler data for surface object (i.e.
Images) - struct SurfaceMem { - hipArray *Array; - hipSurfaceObject_t SurfObj; - ur_mem_type_t ImageType; - - hipArray *getArray() const noexcept { return Array; } - - hipSurfaceObject_t getSurface() const noexcept { return SurfObj; } - - ur_mem_type_t getImageType() const noexcept { return ImageType; } - } SurfaceMem; - } Mem; + std::variant<BufferMem, SurfaceMem> Mem; /// Constructs the UR MEM handler for a non-typed allocation ("buffer") ur_mem_handle_t_(ur_context Ctxt, ur_mem Parent, ur_mem_flags_t MemFlags, - MemImpl::BufferMem::AllocMode Mode, hipDeviceptr_t Ptr, - void *HostPtr, size_t Size) - : Context{Ctxt}, RefCount{1}, MemType{Type::Buffer}, MemFlags{MemFlags} { - Mem.BufferMem.Ptr = Ptr; - Mem.BufferMem.Parent = Parent; - Mem.BufferMem.HostPtr = HostPtr; - Mem.BufferMem.Size = Size; - Mem.BufferMem.MapSize = 0; - Mem.BufferMem.MapOffset = 0; - Mem.BufferMem.MapPtr = nullptr; - Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; - Mem.BufferMem.MemAllocMode = Mode; + BufferMem::AllocMode Mode, hipDeviceptr_t Ptr, void *HostPtr, + size_t Size) + : Context{Ctxt}, RefCount{1}, MemType{Type::Buffer}, MemFlags{MemFlags}, + Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} { if (isSubBuffer()) { - urMemRetain(Mem.BufferMem.Parent); + urMemRetain(std::get<BufferMem>(Mem).Parent); } else { urContextRetain(Context); } @@ -167,16 +168,14 @@ struct ur_mem_handle_t_ { /// Constructs the UR allocation for an Image object ur_mem_handle_t_(ur_context Ctxt, hipArray *Array, hipSurfaceObject_t Surf, ur_mem_flags_t MemFlags, ur_mem_type_t ImageType, void *) - : Context{Ctxt}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags} { - Mem.SurfaceMem.Array = Array; - Mem.SurfaceMem.ImageType = ImageType; - Mem.SurfaceMem.SurfObj = Surf; + : Context{Ctxt}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags}, + Mem{SurfaceMem{Array, Surf, ImageType}} { urContextRetain(Context); } ~ur_mem_handle_t_() { if (isBuffer() && isSubBuffer()) { - urMemRelease(Mem.BufferMem.Parent); + urMemRelease(std::get<BufferMem>(Mem).Parent);
return; } urContextRelease(Context); @@ -185,7 +184,7 @@ struct ur_mem_handle_t_ { bool isBuffer() const noexcept { return MemType == Type::Buffer; } bool isSubBuffer() const noexcept { - return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); + return (isBuffer() && (std::get<BufferMem>(Mem).Parent != nullptr)); } bool isImage() const noexcept { return MemType == Type::Surface; }