diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 7c1de98837..6f99941095 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -1160,27 +1160,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
             UR_RESULT_ERROR_INVALID_SIZE);

   auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
-  ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT;
-  const bool IsPinned =
-      BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
+  auto MapPtr = BufferImpl.mapToPtr(size, offset, mapFlags);

-  // Currently no support for overlapping regions
-  if (BufferImpl.getMapPtr() != nullptr) {
-    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  if (!MapPtr) {
+    return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
   }

-  // Allocate a pointer in the host to store the mapped information
-  auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
-  *ppRetMap = BufferImpl.getMapPtr();
-  if (HostPtr) {
-    Result = UR_RESULT_SUCCESS;
-  }
+  const bool IsPinned =
+      BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;

+  ur_result_t Result = UR_RESULT_SUCCESS;
   if (!IsPinned &&
       ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) {
     // Pinned host memory is already on host so it doesn't need to be read.
     Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
-                                    HostPtr, numEventsInWaitList,
+                                    MapPtr, numEventsInWaitList,
                                     phEventWaitList, phEvent);
   } else {
     ScopedContext Active(hQueue->getContext());
@@ -1201,6 +1195,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
       }
     }
   }
+  *ppRetMap = MapPtr;

   return Result;
 }
@@ -1213,23 +1208,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  ur_result_t Result = UR_RESULT_SUCCESS;
   UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer,
             UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  auto &BufferImpl = std::get<BufferMem>(hMem->Mem);

-  const bool IsPinned = std::get<BufferMem>(hMem->Mem).MemAllocMode ==
-                        BufferMem::AllocMode::AllocHostPtr;
+  auto *Map = BufferImpl.getMapDetails(pMappedPtr);
+  UR_ASSERT(Map != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT);

-  if (!IsPinned &&
-      (std::get<BufferMem>(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE)) {
+  const bool IsPinned =
+      BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  if (!IsPinned && (Map->getMapFlags() & UR_MAP_FLAG_WRITE)) {
     // Pinned host memory is only on host so it doesn't need to be written to.
     Result = urEnqueueMemBufferWrite(
-        hQueue, hMem, true, std::get<BufferMem>(hMem->Mem).getMapOffset(),
-        std::get<BufferMem>(hMem->Mem).getMapSize(), pMappedPtr,
+        hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr,
         numEventsInWaitList, phEventWaitList, phEvent);
   } else {
     ScopedContext Active(hQueue->getContext());
@@ -1250,8 +1243,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
       }
     }
   }
+  BufferImpl.unmap(pMappedPtr);

-  std::get<BufferMem>(hMem->Mem).unmap(pMappedPtr);
   return Result;
 }

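With the per-pointer bookkeeping introduced in memory.hpp below, urEnqueueMemBufferMap can now hand out several live mappings of one buffer; the removed branch used to reject a second map with UR_RESULT_ERROR_UNSUPPORTED_FEATURE. A minimal usage sketch in the style of the SuccessMultiMaps conformance test, assuming `queue`, `buffer`, and `size` (the buffer's byte size) are valid:

    void *map_lo = nullptr;
    void *map_hi = nullptr;
    // Two non-overlapping mappings may now be live at the same time.
    ASSERT_SUCCESS(urEnqueueMemBufferMap(queue, buffer, /*blocking*/ true,
                                         UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE,
                                         0, size / 2, 0, nullptr, nullptr,
                                         &map_lo));
    ASSERT_SUCCESS(urEnqueueMemBufferMap(queue, buffer, /*blocking*/ true,
                                         UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE,
                                         size / 2, size / 2, 0, nullptr,
                                         nullptr, &map_hi));
    // Each pointer has its own BufferMap entry, so the unmap order is free.
    ASSERT_SUCCESS(urEnqueueMemUnmap(queue, buffer, map_hi, 0, nullptr, nullptr));
    ASSERT_SUCCESS(urEnqueueMemUnmap(queue, buffer, map_lo, 0, nullptr, nullptr));
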
diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp
index 3a9a7e2d6b..b727fc1551 100644
--- a/source/adapters/cuda/memory.hpp
+++ b/source/adapters/cuda/memory.hpp
@@ -18,89 +18,107 @@
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
-  using native_type = CUdeviceptr;
-  // If this allocation is a sub-buffer (i.e., a view on an existing
-  // allocation), this is the pointer to the parent handler structure
-  ur_mem_handle_t Parent;
-  // CUDA handler for the pointer
-  native_type Ptr;
+  struct BufferMap {
+    /// Size of the active mapped region.
+    size_t MapSize;
+    /// Offset of the active mapped region.
+    size_t MapOffset;
+    /// Original flags for the mapped region
+    ur_map_flags_t MapFlags;
+    /// Allocated host memory used exclusively for this map.
+    std::unique_ptr<unsigned char[]> MapMem;

-  /// Pointer associated with this device on the host
-  void *HostPtr;
-  /// Size of the allocation in bytes
-  size_t Size;
-  /// Size of the active mapped region.
-  size_t MapSize;
-  /// Offset of the active mapped region.
-  size_t MapOffset;
-  /// Pointer to the active mapped region, if any
-  void *MapPtr;
-  /// Original flags for the mapped region
-  ur_map_flags_t MapFlags;
+    BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
+        : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+          MapMem(nullptr) {}
+
+    BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags,
+              std::unique_ptr<unsigned char[]> &MapMem)
+        : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+          MapMem(std::move(MapMem)) {}
+
+    size_t getMapSize() const noexcept { return MapSize; }
+
+    size_t getMapOffset() const noexcept { return MapOffset; }
+
+    ur_map_flags_t getMapFlags() const noexcept { return MapFlags; }
+  };

   /** AllocMode
    * classic: Just a normal buffer allocated on the device via cuda malloc
    * use_host_ptr: Use an address on the host for the device
-   * copy_in: The data for the device comes from the host but the host
-     pointer is not available later for re-use
-   * alloc_host_ptr: Uses pinned-memory allocation
-   */
+   * copy_in: The data for the device comes from the host but the host
+   * pointer is not available later for re-use
+   * alloc_host_ptr: Uses pinned-memory allocation
+   */
   enum class AllocMode {
     Classic,
     UseHostPtr,
     CopyIn,
     AllocHostPtr,
-  } MemAllocMode;
+  };
+
+  using native_type = CUdeviceptr;
+
+  /// If this allocation is a sub-buffer (i.e., a view on an existing
+  /// allocation), this is the pointer to the parent handler structure
+  ur_mem_handle_t Parent;
+  /// CUDA handler for the pointer
+  native_type Ptr;
+  /// Pointer associated with this device on the host
+  void *HostPtr;
+  /// Size of the allocation in bytes
+  size_t Size;
+  /// A map that contains all the active mappings for this buffer.
+  std::unordered_map<void *, BufferMap> PtrToBufferMap;
+
+  AllocMode MemAllocMode;

   BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
             void *HostPtr, size_t Size)
-      : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0},
-        MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE},
-        MemAllocMode{Mode} {};
+      : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
+        PtrToBufferMap{}, MemAllocMode{Mode} {};

   native_type get() const noexcept { return Ptr; }

   size_t getSize() const noexcept { return Size; }

-  void *getMapPtr() const noexcept { return MapPtr; }
-
-  size_t getMapSize() const noexcept { return MapSize; }
-
-  size_t getMapOffset() const noexcept { return MapOffset; }
+  BufferMap *getMapDetails(void *Map) {
+    auto details = PtrToBufferMap.find(Map);
+    if (details != PtrToBufferMap.end()) {
+      return &details->second;
+    }
+    return nullptr;
+  }

   /// Returns a pointer to data visible on the host that contains
   /// the data on the device associated with this allocation.
   /// The offset is used to index into the CUDA allocation.
-  void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept {
-    assert(MapPtr == nullptr);
-    MapSize = Size;
-    MapOffset = Offset;
-    MapFlags = Flags;
-    if (HostPtr) {
-      MapPtr = static_cast<char *>(HostPtr) + Offset;
+  void *mapToPtr(size_t MapSize, size_t MapOffset,
+                 ur_map_flags_t MapFlags) noexcept {
+
+    void *MapPtr = nullptr;
+    if (HostPtr == nullptr) {
+      /// If HostPtr is invalid, we need to create a Mapping that owns its own
+      /// memory on the host.
+      auto MapMem = std::make_unique<unsigned char[]>(MapSize);
+      MapPtr = MapMem.get();
+      PtrToBufferMap.insert(
+          {MapPtr, BufferMap(MapSize, MapOffset, MapFlags, MapMem)});
     } else {
-      // TODO: Allocate only what is needed based on the offset
-      MapPtr = static_cast<char *>(malloc(this->getSize()));
+      /// However, if HostPtr already has valid memory (e.g. pinned
+      /// allocation), we can just use that memory for the mapping.
+      MapPtr = static_cast<char *>(HostPtr) + MapOffset;
+      PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)});
     }
     return MapPtr;
   }

   /// Detach the allocation from the host memory.
-  void unmap(void *) noexcept {
-    assert(MapPtr != nullptr);
-
-    if (MapPtr != HostPtr) {
-      free(MapPtr);
-    }
-    MapPtr = nullptr;
-    MapSize = 0;
-    MapOffset = 0;
-  }
-
-  ur_map_flags_t getMapFlags() const noexcept {
+  void unmap(void *MapPtr) noexcept {
     assert(MapPtr != nullptr);
-    return MapFlags;
+    PtrToBufferMap.erase(MapPtr);
   }
 };

 // Handler data for surface object (i.e. Images)
 struct SurfaceMem {
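The ownership rule in mapToPtr/unmap is easiest to see in isolation: a mapping either borrows a slice of the buffer's existing HostPtr or owns a fresh host allocation that is freed when its entry is erased. Below is a standalone model with hypothetical names (MappingTable, Entry), no CUDA required; note that, as in the patch, the table is keyed by the returned host pointer, so two mappings that resolve to the same address (e.g. the same offset into a pinned buffer) would collide on one entry:

    #include <cassert>
    #include <cstddef>
    #include <memory>
    #include <unordered_map>

    struct MappingTable {
      struct Entry {
        size_t Size;
        size_t Offset;
        std::unique_ptr<unsigned char[]> Owned; // null when borrowing HostPtr
      };

      void *HostPtr = nullptr; // non-null models the pinned/use-host-ptr case
      std::unordered_map<void *, Entry> Active;

      void *map(size_t Size, size_t Offset) {
        void *Ptr = nullptr;
        if (HostPtr == nullptr) {
          // Owning mapping: host memory lives exactly as long as the entry.
          auto Mem = std::make_unique<unsigned char[]>(Size);
          Ptr = Mem.get();
          Active.insert({Ptr, Entry{Size, Offset, std::move(Mem)}});
        } else {
          // Borrowing mapping: reuse the pre-existing host allocation.
          Ptr = static_cast<unsigned char *>(HostPtr) + Offset;
          Active.insert({Ptr, Entry{Size, Offset, nullptr}});
        }
        return Ptr;
      }

      void unmap(void *Ptr) {
        assert(Active.count(Ptr) == 1);
        Active.erase(Ptr); // owned memory is freed here; borrowed is untouched
      }
    };
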
diff --git a/test/conformance/enqueue/enqueue_adapter_cuda.match b/test/conformance/enqueue/enqueue_adapter_cuda.match
index 8fe2045d2c..9b57269f3d 100644
--- a/test/conformance/enqueue/enqueue_adapter_cuda.match
+++ b/test/conformance/enqueue/enqueue_adapter_cuda.match
@@ -2,7 +2,6 @@
 {{OPT}}urEnqueueMemBufferCopyRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
 {{OPT}}urEnqueueMemBufferFillTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___size__256__patternSize__256
 {{OPT}}urEnqueueMemBufferFillTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___size__1024__patternSize__256
-{{OPT}}urEnqueueMemBufferMapTest.SuccessMultiMaps/NVIDIA_CUDA_BACKEND___{{.*}}_
 {{OPT}}urEnqueueMemBufferReadRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
 {{OPT}}urEnqueueMemBufferWriteRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
 {{OPT}}urEnqueueMemImageCopyTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}___1D
diff --git a/test/conformance/enqueue/urEnqueueMemBufferMap.cpp b/test/conformance/enqueue/urEnqueueMemBufferMap.cpp
index 06b348e218..eb06724139 100644
--- a/test/conformance/enqueue/urEnqueueMemBufferMap.cpp
+++ b/test/conformance/enqueue/urEnqueueMemBufferMap.cpp
@@ -194,8 +194,8 @@ TEST_P(urEnqueueMemBufferMapTestWithParam, SuccessMultiMaps) {
     for (size_t i = 0; i < map_count; ++i) {
         map_a[i] = 42;
     }
-    for (size_t i = map_count; i < count; ++i) {
-        map_a[i] = 24;
+    for (size_t i = 0; i < map_count; ++i) {
+        map_b[i] = 24;
     }
     ASSERT_SUCCESS(
         urEnqueueMemUnmap(queue, buffer, map_a, 0, nullptr, nullptr));
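The corrected loop writes the second pattern through map_b instead of indexing past the end of map_a's mapped region. A hedged sketch of the read-back that would confirm both halves after the maps are released, assuming the test fixture's queue, buffer, count, and map_count plus gtest's ASSERT_EQ and <vector>:

    std::vector<uint32_t> out(count);
    ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, /*blocking*/ true, 0,
                                          count * sizeof(uint32_t), out.data(),
                                          0, nullptr, nullptr));
    for (size_t i = 0; i < count; ++i) {
        ASSERT_EQ(out[i], i < map_count ? 42u : 24u);
    }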