Skip to content

Commit

Permalink
Merge pull request #1220 from fabiomestre/fabio/cuda_multimap
Browse files Browse the repository at this point in the history
[CUDA] Add support for multiple active mappings
  • Loading branch information
kbenzie authored Apr 10, 2024
2 parents 08b3e8f + 834e643 commit 6e76c98
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 81 deletions.
43 changes: 18 additions & 25 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,27 +1160,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
UR_RESULT_ERROR_INVALID_SIZE);

auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT;
const bool IsPinned =
BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
auto MapPtr = BufferImpl.mapToPtr(size, offset, mapFlags);

// Currently no support for overlapping regions
if (BufferImpl.getMapPtr() != nullptr) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
if (!MapPtr) {
return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}

// Allocate a pointer in the host to store the mapped information
auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
*ppRetMap = BufferImpl.getMapPtr();
if (HostPtr) {
Result = UR_RESULT_SUCCESS;
}
const bool IsPinned =
BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;

ur_result_t Result = UR_RESULT_SUCCESS;
if (!IsPinned &&
((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) {
// Pinned host memory is already on host so it doesn't need to be read.
Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
HostPtr, numEventsInWaitList,
MapPtr, numEventsInWaitList,
phEventWaitList, phEvent);
} else {
ScopedContext Active(hQueue->getContext());
Expand All @@ -1201,6 +1195,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
}
}
}
*ppRetMap = MapPtr;

return Result;
}
Expand All @@ -1213,23 +1208,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
ur_result_t Result = UR_RESULT_SUCCESS;
UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
auto &BufferImpl = std::get<BufferMem>(hMem->Mem);

const bool IsPinned = std::get<BufferMem>(hMem->Mem).MemAllocMode ==
BufferMem::AllocMode::AllocHostPtr;
auto *Map = BufferImpl.getMapDetails(pMappedPtr);
UR_ASSERT(Map != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT);

if (!IsPinned &&
(std::get<BufferMem>(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE)) {
const bool IsPinned =
BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;

ur_result_t Result = UR_RESULT_SUCCESS;
if (!IsPinned && (Map->getMapFlags() & UR_MAP_FLAG_WRITE)) {
// Pinned host memory is only on host so it doesn't need to be written to.
Result = urEnqueueMemBufferWrite(
hQueue, hMem, true, std::get<BufferMem>(hMem->Mem).getMapOffset(),
std::get<BufferMem>(hMem->Mem).getMapSize(), pMappedPtr,
hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr,
numEventsInWaitList, phEventWaitList, phEvent);
} else {
ScopedContext Active(hQueue->getContext());
Expand All @@ -1250,8 +1243,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
}
}
}
BufferImpl.unmap(pMappedPtr);

std::get<BufferMem>(hMem->Mem).unmap(pMappedPtr);
return Result;
}

Expand Down
124 changes: 71 additions & 53 deletions source/adapters/cuda/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,89 +18,107 @@

// Handler for plain, pointer-based CUDA allocations
struct BufferMem {
using native_type = CUdeviceptr;

// If this allocation is a sub-buffer (i.e., a view on an existing
// allocation), this is the pointer to the parent handler structure
ur_mem_handle_t Parent;
// CUDA handler for the pointer
native_type Ptr;
struct BufferMap {
/// Size of the active mapped region.
size_t MapSize;
/// Offset of the active mapped region.
size_t MapOffset;
/// Original flags for the mapped region
ur_map_flags_t MapFlags;
/// Allocated host memory used exclusively for this map.
std::unique_ptr<unsigned char[]> MapMem;

/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
size_t Size;
/// Size of the active mapped region.
size_t MapSize;
/// Offset of the active mapped region.
size_t MapOffset;
/// Pointer to the active mapped region, if any
void *MapPtr;
/// Original flags for the mapped region
ur_map_flags_t MapFlags;
BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
: MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
MapMem(nullptr) {}

BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags,
std::unique_ptr<unsigned char[]> &MapMem)
: MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
MapMem(std::move(MapMem)) {}

size_t getMapSize() const noexcept { return MapSize; }

size_t getMapOffset() const noexcept { return MapOffset; }

ur_map_flags_t getMapFlags() const noexcept { return MapFlags; }
};

/** AllocMode
* classic: Just a normal buffer allocated on the device via cuda malloc
* use_host_ptr: Use an address on the host for the device
* copy_in: The data for the device comes from the host but the host
pointer is not available later for re-use
* alloc_host_ptr: Uses pinned-memory allocation
*/
 * copy_in: The data for the device comes from the host but the host
 * pointer is not available later for re-use
 * alloc_host_ptr: Uses pinned-memory allocation
 */
enum class AllocMode {
Classic,
UseHostPtr,
CopyIn,
AllocHostPtr,
} MemAllocMode;
};

using native_type = CUdeviceptr;

/// If this allocation is a sub-buffer (i.e., a view on an existing
/// allocation), this is the pointer to the parent handler structure
ur_mem_handle_t Parent;
/// CUDA handler for the pointer
native_type Ptr;
/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
size_t Size;
/// A map that contains all the active mappings for this buffer.
std::unordered_map<void *, BufferMap> PtrToBufferMap;

AllocMode MemAllocMode;

BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
void *HostPtr, size_t Size)
: Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0},
MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE},
MemAllocMode{Mode} {};
: Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
PtrToBufferMap{}, MemAllocMode{Mode} {};

native_type get() const noexcept { return Ptr; }

size_t getSize() const noexcept { return Size; }

void *getMapPtr() const noexcept { return MapPtr; }

size_t getMapSize() const noexcept { return MapSize; }

size_t getMapOffset() const noexcept { return MapOffset; }
BufferMap *getMapDetails(void *Map) {
auto details = PtrToBufferMap.find(Map);
if (details != PtrToBufferMap.end()) {
return &details->second;
}
return nullptr;
}

/// Returns a pointer to data visible on the host that contains
/// the data on the device associated with this allocation.
/// The offset is used to index into the CUDA allocation.
void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept {
assert(MapPtr == nullptr);
MapSize = Size;
MapOffset = Offset;
MapFlags = Flags;
if (HostPtr) {
MapPtr = static_cast<char *>(HostPtr) + Offset;
void *mapToPtr(size_t MapSize, size_t MapOffset,
ur_map_flags_t MapFlags) noexcept {

void *MapPtr = nullptr;
if (HostPtr == nullptr) {
// If HostPtr is null (no host allocation is associated with this buffer),
// create a mapping that owns its own memory on the host.
auto MapMem = std::make_unique<unsigned char[]>(MapSize);
MapPtr = MapMem.get();
PtrToBufferMap.insert(
{MapPtr, BufferMap(MapSize, MapOffset, MapFlags, MapMem)});
} else {
// TODO: Allocate only what is needed based on the offset
MapPtr = static_cast<void *>(malloc(this->getSize()));
// However, if HostPtr already points to valid host memory (e.g. a pinned
// allocation), that memory can be used directly for the mapping.
MapPtr = static_cast<char *>(HostPtr) + MapOffset;
PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)});
}
return MapPtr;
}

/// Detach the allocation from the host memory.
void unmap(void *) noexcept {
assert(MapPtr != nullptr);

if (MapPtr != HostPtr) {
free(MapPtr);
}
MapPtr = nullptr;
MapSize = 0;
MapOffset = 0;
}

ur_map_flags_t getMapFlags() const noexcept {
void unmap(void *MapPtr) noexcept {
assert(MapPtr != nullptr);
return MapFlags;
PtrToBufferMap.erase(MapPtr);
}
};

Expand Down
1 change: 0 additions & 1 deletion test/conformance/enqueue/enqueue_adapter_cuda.match
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
{{OPT}}urEnqueueMemBufferCopyRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
{{OPT}}urEnqueueMemBufferFillTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___size__256__patternSize__256
{{OPT}}urEnqueueMemBufferFillTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___size__1024__patternSize__256
{{OPT}}urEnqueueMemBufferMapTest.SuccessMultiMaps/NVIDIA_CUDA_BACKEND___{{.*}}_
{{OPT}}urEnqueueMemBufferReadRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
{{OPT}}urEnqueueMemBufferWriteRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}___1D
Expand Down
4 changes: 2 additions & 2 deletions test/conformance/enqueue/urEnqueueMemBufferMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ TEST_P(urEnqueueMemBufferMapTestWithParam, SuccessMultiMaps) {
for (size_t i = 0; i < map_count; ++i) {
map_a[i] = 42;
}
for (size_t i = map_count; i < count; ++i) {
map_a[i] = 24;
for (size_t i = 0; i < map_count; ++i) {
map_b[i] = 24;
}
ASSERT_SUCCESS(
urEnqueueMemUnmap(queue, buffer, map_a, 0, nullptr, nullptr));
Expand Down

0 comments on commit 6e76c98

Please sign in to comment.