Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[uma] Improve reporting of pool/provider specific errors #640

Merged
merged 4 commits into from
Jun 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions source/common/uma_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ auto memoryProviderMakeUnique(Args &&...args) {

UMA_ASSIGN_OP(ops, T, alloc, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, free, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, get_last_result, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP_NORETURN(ops, T, get_last_native_error);
UMA_ASSIGN_OP(ops, T, get_recommended_page_size, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, get_min_page_size, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, purge_lazy, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, purge_force, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP_NORETURN(ops, T, get_name);
UMA_ASSIGN_OP(ops, T, get_name, "");

uma_memory_provider_handle_t hProvider = nullptr;
auto ret = umaMemoryProviderCreate(&ops, &argsTuple, &hProvider);
Expand Down Expand Up @@ -147,13 +147,19 @@ auto poolMakeUnique(uma_memory_provider_handle_t *providers,
UMA_ASSIGN_OP(ops, T, realloc, ((void *)nullptr));
UMA_ASSIGN_OP(ops, T, malloc_usable_size, ((size_t)0));
UMA_ASSIGN_OP_NORETURN(ops, T, free);
UMA_ASSIGN_OP(ops, T, get_last_result, UMA_RESULT_ERROR_UNKNOWN);
UMA_ASSIGN_OP(ops, T, get_last_allocation_error, UMA_RESULT_ERROR_UNKNOWN);

uma_memory_pool_handle_t hPool = nullptr;
auto ret = umaPoolCreate(&ops, providers, numProviders, &argsTuple, &hPool);
return std::pair<uma_result_t, pool_unique_handle_t>{
ret, pool_unique_handle_t(hPool, &umaPoolDestroy)};
}

template <typename Type> uma_result_t &getPoolLastStatusRef() {
static thread_local uma_result_t last_status = UMA_RESULT_SUCCESS;
return last_status;
}

} // namespace uma

#endif /* UMA_HELPERS_H */
38 changes: 26 additions & 12 deletions source/common/uma_pools/disjoint_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ static size_t AlignUp(size_t Val, size_t Alignment) {
return (Val + Alignment - 1) & (~(Alignment - 1));
}

struct MemoryProviderError {
uma_result_t code;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM
nit: perhaps a better name for this variable would be retCode? I think it may be more descriptive.

};

DisjointPoolConfig::DisjointPoolConfig()
: limits(std::make_shared<DisjointPoolConfig::SharedLimits>()) {}

Expand Down Expand Up @@ -332,8 +336,7 @@ static void *memoryProviderAlloc(uma_memory_provider_handle_t hProvider,
void *ptr;
auto ret = umaMemoryProviderAlloc(hProvider, size, alignment, &ptr);
if (ret != UMA_RESULT_SUCCESS) {
// TODO: Set last error appropriately
throw std::runtime_error("umaMemoryProviderAlloc");
throw MemoryProviderError{ret};
}
return ptr;
}
Expand All @@ -342,8 +345,7 @@ static void memoryProviderFree(uma_memory_provider_handle_t hProvider,
void *ptr) {
auto ret = umaMemoryProviderFree(hProvider, ptr, 0);
if (ret != UMA_RESULT_SUCCESS) {
// TODO: introduce custom exceptions? PI L0 relies on those
throw std::runtime_error("memoryProviderFree");
throw MemoryProviderError{ret};
}
}

Expand Down Expand Up @@ -373,7 +375,13 @@ Slab::~Slab() {
} catch (std::exception &e) {
std::cout << "DisjointPool: unexpected error: " << e.what() << "\n";
}
memoryProviderFree(bucket.getMemHandle(), MemPtr);

try {
memoryProviderFree(bucket.getMemHandle(), MemPtr);
} catch (MemoryProviderError &e) {
std::cout << "DisjointPool: error from memory provider: " << e.code
igchor marked this conversation as resolved.
Show resolved Hide resolved
<< "\n";
}
}

// Return the index of the first available chunk, SIZE_MAX otherwise
Expand Down Expand Up @@ -719,7 +727,7 @@ void Bucket::printStats(bool &TitlePrinted, const std::string &Label) {
}
}

void *DisjointPool::AllocImpl::allocate(size_t Size, bool &FromPool) {
void *DisjointPool::AllocImpl::allocate(size_t Size, bool &FromPool) try {
void *Ptr;

if (Size == 0) {
Expand All @@ -744,10 +752,13 @@ void *DisjointPool::AllocImpl::allocate(size_t Size, bool &FromPool) {
}

return Ptr;
} catch (MemoryProviderError &e) {
uma::getPoolLastStatusRef<DisjointPool>() = e.code;
return nullptr;
}

void *DisjointPool::AllocImpl::allocate(size_t Size, size_t Alignment,
bool &FromPool) {
bool &FromPool) try {
void *Ptr;

if (Size == 0) {
Expand Down Expand Up @@ -791,6 +802,9 @@ void *DisjointPool::AllocImpl::allocate(size_t Size, size_t Alignment,
}

return AlignPtrUp(Ptr, Alignment);
} catch (MemoryProviderError &e) {
uma::getPoolLastStatusRef<DisjointPool>() = e.code;
return nullptr;
}

Bucket &DisjointPool::AllocImpl::findBucket(size_t Size) {
Expand All @@ -805,7 +819,7 @@ Bucket &DisjointPool::AllocImpl::findBucket(size_t Size) {
return *(*It);
}

void DisjointPool::AllocImpl::deallocate(void *Ptr, bool &ToPool) {
void DisjointPool::AllocImpl::deallocate(void *Ptr, bool &ToPool) try {
auto *SlabPtr = AlignPtrDown(Ptr, SlabMinSize());

// Lock the map on read
Expand Down Expand Up @@ -848,6 +862,8 @@ void DisjointPool::AllocImpl::deallocate(void *Ptr, bool &ToPool) {
// to some slab with an entry in the map. So we find a slab
// but the range checks fail.
memoryProviderFree(getMemHandle(), Ptr);
} catch (MemoryProviderError &e) {
uma::getPoolLastStatusRef<DisjointPool>() = e.code;
}

void DisjointPool::AllocImpl::printStats(bool &TitlePrinted,
Expand Down Expand Up @@ -939,10 +955,8 @@ void DisjointPool::free(void *ptr) {
return;
}

enum uma_result_t DisjointPool::get_last_result(const char **ppMessage) {
// TODO: implement and return last error, we probably need something like
// https://github.com/oneapi-src/unified-runtime/issues/500 in UMA
return UMA_RESULT_ERROR_UNKNOWN;
enum uma_result_t DisjointPool::get_last_allocation_error() {
return uma::getPoolLastStatusRef<DisjointPool>();
}

DisjointPool::DisjointPool() {}
Expand Down
2 changes: 1 addition & 1 deletion source/common/uma_pools/disjoint_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class DisjointPool {
void *aligned_malloc(size_t size, size_t alignment);
size_t malloc_usable_size(void *);
void free(void *ptr);
enum uma_result_t get_last_result(const char **ppMessage);
enum uma_result_t get_last_allocation_error();

DisjointPool();
~DisjointPool();
Expand Down
1 change: 1 addition & 0 deletions source/common/unified_memory_allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(UMA_SOURCES
src/memory_pool.c
src/memory_provider.c
src/memory_tracker.cpp
src/memory_provider_get_last_failed.cpp
)

if(UMA_BUILD_SHARED_LIBRARY)
Expand Down
13 changes: 5 additions & 8 deletions source/common/unified_memory_allocation/include/uma/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,13 @@ enum uma_result_t {
UMA_RESULT_SUCCESS = 0, ///< Success
UMA_RESULT_ERROR_OUT_OF_HOST_MEMORY =
1, ///< Insufficient host memory to satisfy call,
UMA_RESULT_ERROR_POOL_SPECIFIC =
2, ///< A pool specific warning/error has been reported and can be
///< Retrieved via the umaPoolGetLastResult entry point.
UMA_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC =
3, ///< A provider specific warning/error has been reported and can be
///< Retrieved via the umaMemoryProviderGetLastResult entry point.
2, ///< A provider specific warning/error has been reported and can be
///< Retrieved via the umaMemoryProviderGetLastNativeError entry point.
UMA_RESULT_ERROR_INVALID_ARGUMENT =
4, ///< Generic error code for invalid arguments
UMA_RESULT_ERROR_INVALID_ALIGNMENT = 5, /// Invalid alignment of an argument
UMA_RESULT_ERROR_NOT_SUPPORTED = 6, /// Operation not supported
3, ///< Generic error code for invalid arguments
UMA_RESULT_ERROR_INVALID_ALIGNMENT = 4, /// Invalid alignment of an argument
UMA_RESULT_ERROR_NOT_SUPPORTED = 5, /// Operation not supported

UMA_RESULT_ERROR_UNKNOWN = 0x7ffffffe ///< Unknown or internal error
};
Expand Down
33 changes: 14 additions & 19 deletions source/common/unified_memory_allocation/include/uma/memory_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,28 +104,23 @@ void umaPoolFree(uma_memory_pool_handle_t hPool, void *ptr);
void umaFree(void *ptr);

///
/// \brief Retrieve string representation of the underlying pool specific
/// result reported by the last API that returned
/// UMA_RESULT_ERROR_POOL_SPECIFIC or NULL ptr (in case of allocation
/// functions). Allows for a pool independent way to
/// return a pool specific result.
/// \brief Retrieve uma_result_t representing the error of the last failed allocation
/// operation in this thread (malloc, calloc, realloc, aligned_malloc).
///
/// \details
/// - The string returned via the ppMessage is a NULL terminated C style
/// string.
/// - The string returned via the ppMessage is thread local.
/// - The memory in the string returned via the ppMessage is owned by the
/// adapter.
/// - The application may call this function from simultaneous threads.
/// - The implementation of this function should be lock-free.
/// * Implementations *must* store the error code in thread-local
/// storage prior to returning NULL from the allocation functions.
///
/// * If the last allocation/de-allocation operation succeeded, the value returned by
/// this function is unspecified.
///
/// * The application *may* call this function from simultaneous threads.
///
/// * The implementation of this function *should* be lock-free.
/// \param hPool specified memory hPool
/// \param ppMessage [out] pointer to a string containing provider specific
/// result in string representation.
/// \return UMA_RESULT_SUCCESS if the result being
/// reported is to be considered a warning. Any other result code
/// returned indicates that the adapter specific result is an error.
enum uma_result_t umaPoolGetLastResult(uma_memory_pool_handle_t hPool,
const char **ppMessage);
/// \return Error code desciribng the failure of the last failed allocation operation.
/// The value is undefined if the previous allocation was successful.
enum uma_result_t umaPoolGetLastAllocationError(uma_memory_pool_handle_t hPool);
igchor marked this conversation as resolved.
Show resolved Hide resolved

///
/// \brief Retrieve memory pool associated with a given ptr.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ struct uma_memory_pool_ops_t {
void *(*aligned_malloc)(void *pool, size_t size, size_t alignment);
size_t (*malloc_usable_size)(void *pool, void *ptr);
void (*free)(void *pool, void *);
enum uma_result_t (*get_last_result)(void *pool, const char **ppMessage);
enum uma_result_t (*get_last_allocation_error)(void *pool);
};

#ifdef __cplusplus
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,25 @@ enum uma_result_t umaMemoryProviderFree(uma_memory_provider_handle_t hProvider,
/// independent way to return a provider specific result.
///
/// \details
/// - The string returned via the ppMessage is a NULL terminated C style
/// string.
/// - The string returned via the ppMessage is thread local.
/// - The memory in the string returned via the ppMessage is owned by the
/// adapter.
/// - The application may call this function from simultaneous threads.
/// - The implementation of this function should be lock-free.
/// * Implementations *must* store the message and error code in thread-local
/// storage prior to returning UMA_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC.
///
/// * The message and error code will only be valid if a previously
/// called entry-point returned UMA_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC.
igchor marked this conversation as resolved.
Show resolved Hide resolved
///
/// * The memory pointed to by the C string returned in `ppMessage` is owned by
/// the adapter and *must* be null terminated.
///
/// * The application *may* call this function from simultaneous threads.
///
/// * The implementation of this function *should* be lock-free.
/// \param hProvider handle to the memory provider
/// \param ppMessage [out] pointer to a string containing provider specific
/// result in string representation
/// \return UMA_RESULT_SUCCESS if the result being reported is to be considered
/// a warning. Any other result code returned indicates that the
/// adapter specific result is an error.
enum uma_result_t
umaMemoryProviderGetLastResult(uma_memory_provider_handle_t hProvider,
const char **ppMessage);
/// \param pError [out] pointer to an integer where the adapter specific error code will be stored
void umaMemoryProviderGetLastNativeError(uma_memory_provider_handle_t hProvider,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if some pool uses several memory providers, e.g. one for data and another one for metadata?
what will be the user flow?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. User might need to know which (metadata or data) provider failed. I think we need to extend error codes to specify DATA_PROVIDER_ERROR and METADATA_PROVIDER_ERROR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not like the idea of introducing provider type-specific errors, especially if we consider changing the way how we specify provider type in the pool.

We can think of storing the handle to the provider that failed and remove hProvider argument from the umaMemoryProviderGetLastNativeError function.

In general, I would like to see some code-snippet that demonstrates how these API suppose to be used by client (e.g. UR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've extended the error codes in #641

Copy link
Member Author

@igchor igchor Jun 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we assume that umaMemoryProviderGetLastNativeError can only be called by the memory pool user, then we could probably replace the hProvider handle with a pool handle. I don't think we can remove the handle entirely because we need to know which function to call (from memory_provider_ops that is associated with that handle).

Still I think umaMemoryProviderGetLastNativeError might also be useful inside a memory pool implementation (e.g. when a pool always uses some specific metadata provider).

The code would look similar to what I proposed in the comment above (just with extra switch for metadata/data provider):

void* ptr = umaPoolMalloc(hPool, size);
if (!ptr) {
    uma_result_t ret = umaPoolGetLastAllocationError(hPool);
    uma_memory_provider_handle_t provider;
    if (ret == UMA_RESULT_ERROR_DATA_MEMORY_PROVIDER_SPECIFIC) {
        provider = umaPoolGetDataMemoryProvider(hPool):
    } else if (ret == UMA_RESULT_ERROR_METADATA_MEMORY_PROVIDER_SPECIFIC {
        provider = umaPoolGetMetadataMemoryProvider(hPool):
    } else {
        ....
    }
    const char *msg;
    int32_t native_error;
    umaMemoryProviderGetLastNativeError(provider, &msg, &native_error);
    // cast native_error to ze_result_t and handle it
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But then how would you know which provider failed? (i.e. which provider to pass to the umaMemoryProviderGetLastNativeError)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we decide not to merge #641 we need some other way of specyfing which memory provider is the one that failed. The simplest approach would be just to have only one provider per pool (as we had originally).

Copy link
Member Author

@igchor igchor Jun 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vinser52 Anyway, this PR does not introduce UMA_RESULT_ERROR_DATA_MEMORY_PROVIDER_SPECIFIC nor UMA_RESULT_ERROR_METADATA_MEMORY_PROVIDER_SPECIFIC so I think this discussion can continue on this thread: https://github.com/oneapi-src/unified-runtime/pull/641/files#r1244132297

Can this PR be merged or do you have any other comments?

Copy link
Contributor

@vinser52 vinser52 Jun 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, to interpret the native error code you should know the concrete implementation referred by the memory provider handle.

Also, I think it is useful to know which provider failed: whether it's metadata or data provider.

And it does not matter whether it DATA or METADATA memory provider. You just need to know which memory provider failed. For example, imagine the following situation:

  1. You have two memory providers (different implementations - different ops structure) and pass them to umaPoolCreate function.
  2. umaPoolMalloc call failed because one of the memory providers failed. Does it really matter whether failed memory provider was used as DATA or METADATA? One obvious solution could be just to have umaPoolGetLastFailedProvider function to get failed provider. It will help to avoid if-else if-...-else block.

But an even better idea is to look how similar problem solved in STL: std::error_code

Can this PR be merged or do you have any other comments?

It would be useful to understand how clients are going to use this native error? Do we have a code snippet from UR perspective? Let's discuss it by phone first.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I've implemented umaGetLastFailedProvider as a standalone function that does not accept a pool handle so that it can be implemented in memory_provider.c (as opposed to in each memory pool separately).

const char **ppMessage,
int32_t *pError);

///
/// \brief Retrieve recommended page size for a given allocation size.
Expand Down Expand Up @@ -136,8 +138,17 @@ umaMemoryProviderPurgeForce(uma_memory_provider_handle_t hProvider, void *ptr,
/// \brief Retrive name of a given memory provider.
/// \param hProvider handle to the memory provider
/// \param ppName [out] pointer to a string containing name of the provider
void umaMemoryProviderGetName(uma_memory_provider_handle_t hProvider,
const char **ppName);
const char *umaMemoryProviderGetName(uma_memory_provider_handle_t hProvider);

/// \brief Retrieve handle to the last memory provider that returned status other
/// than UMA_RESULT_SUCCESS on the calling thread.
///
/// \details Handle to the memory provider is stored in the thread local
/// storage. The handle is updated every time a memory provider
/// returns status other than UMA_RESULT_SUCCESS.
///
/// \return Handle to the memory provider
uma_memory_provider_handle_t umaGetLastFailedMemoryProvider();

#ifdef __cplusplus
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ struct uma_memory_provider_ops_t {
enum uma_result_t (*alloc)(void *provider, size_t size, size_t alignment,
void **ptr);
enum uma_result_t (*free)(void *provider, void *ptr, size_t size);
enum uma_result_t (*get_last_result)(void *provider,
const char **ppMessage);
void (*get_last_native_error)(void *provider, const char **ppMessage,
int32_t *pError);
enum uma_result_t (*get_recommended_page_size)(void *provider, size_t size,
size_t *pageSize);
enum uma_result_t (*get_min_page_size)(void *provider, void *ptr,
size_t *pageSize);
enum uma_result_t (*purge_lazy)(void *provider, void *ptr, size_t size);
enum uma_result_t (*purge_force)(void *provider, void *ptr, size_t size);
void (*get_name)(void *provider, const char **ppName);
const char *(*get_name)(void *provider);
};

#ifdef __cplusplus
Expand Down
6 changes: 3 additions & 3 deletions source/common/unified_memory_allocation/src/memory_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ void umaFree(void *ptr) {
}
}

enum uma_result_t umaPoolGetLastResult(uma_memory_pool_handle_t hPool,
const char **ppMessage) {
return hPool->ops.get_last_result(hPool->pool_priv, ppMessage);
enum uma_result_t
umaPoolGetLastAllocationError(uma_memory_pool_handle_t hPool) {
return hPool->ops.get_last_allocation_error(hPool->pool_priv);
}

uma_memory_pool_handle_t umaPoolByPtr(const void *ptr) {
Expand Down
Loading