[SYCL][UR][CUDA] Replace direct allocation calls with calls to USMProxyPool

Both direct and pooled allocation calls now go through the UMF pool API,
making calls to cuPointerGetAttributes when freeing memory unnecessary.
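
The resulting free path is visible in usm.cpp below: every allocation, direct or pooled, now comes from a UMF pool, so urUSMFree can look up the owning pool from the pointer instead of asking CUDA about it. A minimal sketch of that path, using only calls that appear in this commit (freeViaUMF is a hypothetical wrapper name; the real change is in urUSMFree):

ur_result_t freeViaUMF(void *pMem) {
  // The UMF tracker records which pool produced each allocation, so the
  // owning pool can be recovered from the pointer alone.
  if (umf_memory_pool_handle_t Pool = umfPoolByPtr(pMem)) {
    // The pool forwards the free to its memory provider, which already knows
    // whether cuMemFree or cuMemFreeHost applies.
    return umf::umf2urResult(umfPoolFree(Pool, pMem));
  }
  // The pointer was not allocated through any UMF pool.
  return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}

This is also why USMFreeImpl is split into device and host variants further down: the choice between cuMemFree and cuMemFreeHost moves into the per-type providers' freeImpl hooks instead of being made at free time with cuPointerGetAttributes.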
kswiecicki committed Dec 15, 2023
1 parent a63a56a commit dcb8ecd
Showing 4 changed files with 157 additions and 48 deletions.
33 changes: 33 additions & 0 deletions source/adapters/cuda/context.cpp
@@ -13,6 +13,37 @@

#include <cassert>

ur_result_t ur_context_handle_t_::initialize() {
auto Context = reinterpret_cast<ur_context_handle_t>(this);

ur_result_t Ret;
std::tie(Ret, ProxyPoolManager) =
usm::pool_manager<usm::pool_descriptor>::create();
if (Ret)
return Ret;

auto Device = Context->DeviceID;

// TODO: Replace this with appropriate usm::pool_descriptor 'create' static
// function.
usm::pool_descriptor Descs[] = {
{nullptr, Context, nullptr, UR_USM_TYPE_HOST, false},
{nullptr, Context, Device, UR_USM_TYPE_DEVICE, false},
{nullptr, Context, Device, UR_USM_TYPE_SHARED, false}};

for (auto &Desc : Descs) {
umf::pool_unique_handle_t ProxyPool = nullptr;
std::tie(Ret, ProxyPool) = createUMFPoolForDesc<USMProxyPool>(Desc);
if (Ret) {
throw UsmAllocationException(Ret);
}

ProxyPoolManager.addPool(Desc, ProxyPool);
}

return UR_RESULT_SUCCESS;
}

void ur_context_handle_t_::addPool(ur_usm_pool_handle_t Pool) {
std::lock_guard<std::mutex> Lock(Mutex);
PoolHandles.insert(Pool);
@@ -56,6 +87,8 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices,
try {
ContextPtr = std::unique_ptr<ur_context_handle_t_>(
new ur_context_handle_t_{*phDevices});
ContextPtr->initialize();

*phContext = ContextPtr.release();
} catch (ur_result_t Err) {
RetErr = Err;
6 changes: 6 additions & 0 deletions source/adapters/cuda/context.hpp
@@ -21,6 +21,7 @@
#include "device.hpp"

#include <umf/memory_pool.h>
#include <ur_pool_manager.hpp>

typedef void (*ur_context_extended_deleter_t)(void *user_data);

@@ -77,6 +78,9 @@ struct ur_context_handle_t_ {
native_type CUContext;
ur_device_handle_t DeviceID;
std::atomic_uint32_t RefCount;
// Stores pools designated for direct allocations, utilizing UMF tracking
// capabilities. These pools don't perform any pooling.
usm::pool_manager<usm::pool_descriptor> ProxyPoolManager;

ur_context_handle_t_(ur_device_handle_t_ *DevID)
: CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} {
@@ -85,6 +89,8 @@ struct ur_context_handle_t_ {

~ur_context_handle_t_() { urDeviceRelease(DeviceID); }

ur_result_t initialize();

void invokeExtendedDeleters() {
std::lock_guard<std::mutex> Guard(Mutex);
for (auto &Deleter : ExtendedDeleters) {
102 changes: 56 additions & 46 deletions source/adapters/cuda/usm.cpp
@@ -31,13 +31,15 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
(alignment == 0 || ((alignment & (alignment - 1)) == 0)),
UR_RESULT_ERROR_INVALID_VALUE);

if (!hPool) {
return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment);
}

std::optional<umf_memory_pool_handle_t> hPoolInternalOpt = std::nullopt;
usm::pool_descriptor Desc = {hPool, hContext, nullptr, UR_USM_TYPE_HOST,
false};
auto hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
if (hPool) {
hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
} else {
hPoolInternalOpt = hContext->ProxyPoolManager.getPool(Desc);
}

if (!hPoolInternalOpt.has_value()) {
// Internal error, every L0 context and usm pool should have Host, Device,
// Shared and SharedReadOnly UMF pools.
@@ -64,14 +66,15 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
(alignment == 0 || ((alignment & (alignment - 1)) == 0)),
UR_RESULT_ERROR_INVALID_VALUE);

if (!hPool) {
return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size,
alignment);
}

std::optional<umf_memory_pool_handle_t> hPoolInternalOpt = std::nullopt;
usm::pool_descriptor Desc = {hPool, hContext, hDevice, UR_USM_TYPE_DEVICE,
false};
auto hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
if (hPool) {
hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
} else {
hPoolInternalOpt = hContext->ProxyPoolManager.getPool(Desc);
}

if (!hPoolInternalOpt.has_value()) {
// Internal error, every L0 context and usm pool should have Host, Device,
// Shared and SharedReadOnly UMF pools.
@@ -98,14 +101,15 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
(alignment == 0 || ((alignment & (alignment - 1)) == 0)),
UR_RESULT_ERROR_INVALID_VALUE);

if (!hPool) {
return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
alignment);
}

std::optional<umf_memory_pool_handle_t> hPoolInternalOpt = std::nullopt;
usm::pool_descriptor Desc = {hPool, hContext, hDevice, UR_USM_TYPE_SHARED,
false};
auto hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
if (hPool) {
hPoolInternalOpt = hPool->PoolManager.getPool(Desc);
} else {
hPoolInternalOpt = hContext->ProxyPoolManager.getPool(Desc);
}

if (!hPoolInternalOpt.has_value()) {
// Internal error, every L0 context and usm pool should have Host, Device,
// Shared and SharedReadOnly UMF pools.
@@ -121,40 +125,13 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
return UR_RESULT_SUCCESS;
}

ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
ScopedContext Active(Context);
bool IsManaged;
unsigned int Type;
void *AttributeValues[2] = {&IsManaged, &Type};
CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
UR_CHECK_ERROR(cuPointerGetAttributes(2, Attributes, AttributeValues,
(CUdeviceptr)Pointer));
UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
// Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
// with cuMemFree
UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
} else {
// Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
UR_CHECK_ERROR(cuMemFreeHost(Pointer));
}
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}

/// USM: Frees the given USM pointer associated with the context.
///
UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
void *pMem) {
if (auto Pool = umfPoolByPtr(pMem))
return umf::umf2urResult(umfPoolFree(Pool, pMem));
return USMFreeImpl(hContext, pMem);
return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}

ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
@@ -216,6 +193,29 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
return UR_RESULT_SUCCESS;
}

ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
// Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
// with cuMemFree
UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}

ur_result_t USMHostFreeImpl(ur_context_handle_t Context, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
// Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
UR_CHECK_ERROR(cuMemFreeHost(Pointer));
} catch (ur_result_t Err) {
Result = Err;
}
return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL
urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
ur_usm_alloc_info_t propName, size_t propValueSize,
@@ -364,7 +364,7 @@ enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align,
enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) {
(void)Size;

auto Res = USMFreeImpl(Context, Ptr);
auto Res = freeImpl(Context, Ptr);
if (Res != UR_RESULT_SUCCESS) {
getLastStatusRef() = Res;
return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC;
@@ -392,12 +392,22 @@ ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
Alignment);
}

ur_result_t USMSharedMemoryProvider::freeImpl(ur_context_handle_t Context,
void *Ptr) {
return USMFreeImpl(Context, Ptr);
}

ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
Alignment);
}

ur_result_t USMDeviceMemoryProvider::freeImpl(ur_context_handle_t Context,
void *Ptr) {
return USMFreeImpl(Context, Ptr);
}

ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
64 changes: 62 additions & 2 deletions source/adapters/cuda/usm.hpp
@@ -60,10 +60,11 @@ class USMMemoryProvider {
ur_device_handle_t Device;
size_t MinPageSize;

// Internal allocation routine which must be implemented for each allocation
// type
// Internal allocation and deallocation routines which must be implemented for
// each allocation type
virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) = 0;
virtual ur_result_t freeImpl(ur_context_handle_t Context, void *Ptr) = 0;

public:
umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev);
@@ -93,6 +94,7 @@ class USMSharedMemoryProvider final : public USMMemoryProvider {
protected:
ur_result_t allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) override;
ur_result_t freeImpl(ur_context_handle_t Context, void *Ptr) override;
};

// Allocation routines for device memory type
@@ -103,6 +105,7 @@ class USMDeviceMemoryProvider final : public USMMemoryProvider {
protected:
ur_result_t allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) override;
ur_result_t freeImpl(ur_context_handle_t Context, void *Ptr) override;
};

// Allocation routines for host memory type
@@ -113,6 +116,7 @@ class USMHostMemoryProvider final : public USMMemoryProvider {
protected:
ur_result_t allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) override;
ur_result_t freeImpl(ur_context_handle_t Context, void *Ptr) override;
};

ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
@@ -129,3 +133,59 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_usm_host_mem_flags_t *Flags, size_t Size,
uint32_t Alignment);

// Simple proxy for memory allocations. It is used for the UMF tracking
// capabilities.
class USMProxyPool {
public:
umf_result_t initialize(umf_memory_provider_handle_t *Providers,
size_t NumProviders) noexcept {
std::ignore = NumProviders;

this->hProvider = Providers[0];
return UMF_RESULT_SUCCESS;
}
void *malloc(size_t Size) noexcept { return aligned_malloc(Size, 0); }
void *calloc(size_t Num, size_t Size) noexcept {
std::ignore = Num;
std::ignore = Size;

// Currently not needed
umf::getPoolLastStatusRef<USMProxyPool>() = UMF_RESULT_ERROR_NOT_SUPPORTED;
return nullptr;
}
void *realloc(void *Ptr, size_t Size) noexcept {
std::ignore = Ptr;
std::ignore = Size;

// Currently not needed
umf::getPoolLastStatusRef<USMProxyPool>() = UMF_RESULT_ERROR_NOT_SUPPORTED;
return nullptr;
}
void *aligned_malloc(size_t Size, size_t Alignment) noexcept {
void *Ptr = nullptr;
auto Ret = umfMemoryProviderAlloc(hProvider, Size, Alignment, &Ptr);
if (Ret != UMF_RESULT_SUCCESS) {
umf::getPoolLastStatusRef<USMProxyPool>() = Ret;
}
return Ptr;
}
size_t malloc_usable_size(void *Ptr) noexcept {
std::ignore = Ptr;

// Currently not needed
return 0;
}
enum umf_result_t free(void *Ptr) noexcept {
return umfMemoryProviderFree(hProvider, Ptr, 0);
}
enum umf_result_t get_last_allocation_error() {
return umf::getPoolLastStatusRef<USMProxyPool>();
}
umf_memory_provider_handle_t hProvider;
};

// Template helper function for creating USM pools for a given pool descriptor.
template <typename P, typename... Args>
std::pair<ur_result_t, umf::pool_unique_handle_t>
createUMFPoolForDesc(usm::pool_descriptor &Desc, Args &&...args);
