From b3887e61d9ebc45b0d0c43aaec4c9772d3fab654 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 9 Oct 2023 10:58:25 +0100 Subject: [PATCH] Revert add prefetch for USM hip allocations a6b8fa66b537753415d24076f1025c040110c332 --- source/adapters/hip/context.hpp | 53 --------------------------------- source/adapters/hip/enqueue.cpp | 11 +------ source/adapters/hip/kernel.cpp | 2 +- source/adapters/hip/kernel.hpp | 15 ---------- 4 files changed, 2 insertions(+), 79 deletions(-) diff --git a/source/adapters/hip/context.hpp b/source/adapters/hip/context.hpp index 8191a05408..d8d0f5d304 100644 --- a/source/adapters/hip/context.hpp +++ b/source/adapters/hip/context.hpp @@ -10,7 +10,6 @@ #pragma once #include -#include #include "common.hpp" #include "device.hpp" @@ -106,61 +105,9 @@ struct ur_context_handle_t_ { ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool); - /// We need to keep track of USM mappings in AMD HIP, as certain extra - /// synchronization *is* actually required for correctness. - /// During kernel enqueue we must dispatch a prefetch for each kernel argument - /// that points to a USM mapping to ensure the mapping is correctly - /// populated on the device (https://github.com/intel/llvm/issues/7252). Thus, - /// we keep track of mappings in the context, and then check against them just - /// before the kernel is launched. The stream against which the kernel is - /// launched is not known until enqueue time, but the USM mappings can happen - /// at any time. Thus, they are tracked on the context used for the urUSM* - /// mapping. - /// - /// The three utility function are simple wrappers around a mapping from a - /// pointer to a size. - void addUSMMapping(void *Ptr, size_t Size) { - std::lock_guard Guard(Mutex); - assert(USMMappings.find(Ptr) == USMMappings.end() && - "mapping already exists"); - USMMappings[Ptr] = Size; - } - - void removeUSMMapping(const void *Ptr) { - std::lock_guard guard(Mutex); - auto It = USMMappings.find(Ptr); - if (It != USMMappings.end()) - USMMappings.erase(It); - } - - std::pair getUSMMapping(const void *Ptr) { - std::lock_guard Guard(Mutex); - auto It = USMMappings.find(Ptr); - // The simple case is the fast case... - if (It != USMMappings.end()) - return *It; - - // ... but in the failure case we have to fall back to a full scan to search - // for "offset" pointers in case the user passes in the middle of an - // allocation. We have to do some not-so-ordained-by-the-standard ordered - // comparisons of pointers here, but it'll work on all platforms we support. - uintptr_t PtrVal = (uintptr_t)Ptr; - for (std::pair Pair : USMMappings) { - uintptr_t BaseAddr = (uintptr_t)Pair.first; - uintptr_t EndAddr = BaseAddr + Pair.second; - if (PtrVal > BaseAddr && PtrVal < EndAddr) { - // If we've found something now, offset *must* be nonzero - assert(Pair.second); - return Pair; - } - } - return {nullptr, 0}; - } - private: std::mutex Mutex; std::vector ExtendedDeleters; - std::unordered_map USMMappings; std::set PoolHandles; }; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index f7c378fcfc..3af98b8104 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -257,7 +257,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( try { ur_device_handle_t Dev = hQueue->getDevice(); ScopedContext Active(Dev); - ur_context_handle_t Ctx = hQueue->getContext(); uint32_t StreamToken; ur_stream_quard Guard; @@ -265,14 +264,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); hipFunction_t HIPFunc = hKernel->get(); - hipDevice_t HIPDev = Dev->get(); - for (const void *P : hKernel->getPtrArgs()) { - auto [Addr, Size] = Ctx->getUSMMapping(P); - if (!Addr) - continue; - if (hipMemPrefetchAsync(Addr, Size, HIPDev, HIPStream) != hipSuccess) - return UR_RESULT_ERROR_INVALID_KERNEL_ARGS; - } Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList); @@ -314,7 +305,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( int DeviceMaxLocalMem = 0; UR_CHECK_ERROR(hipDeviceGetAttribute( &DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock, - HIPDev)); + Dev->get())); static const int EnvVal = std::atoi(LocalMemSzPtr); if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) { diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index b433d3a3b4..8bcc164093 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -255,7 +255,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_pointer_properties_t *, const void *pArgValue) { - hKernel->setKernelPtrArg(argIndex, sizeof(pArgValue), pArgValue); + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 3db9dce19a..f13478a69c 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -14,7 +14,6 @@ #include #include #include -#include #include "program.hpp" @@ -58,7 +57,6 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; args_index_t Indices; args_size_t OffsetPerIndex; - std::set PtrArgs; std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; @@ -179,19 +177,6 @@ struct ur_kernel_handle_t_ { Args.addArg(Index, Size, Arg); } - /// We track all pointer arguments to be able to issue prefetches at enqueue - /// time - void setKernelPtrArg(int Index, size_t Size, const void *PtrArg) { - Args.PtrArgs.insert(*static_cast(PtrArg)); - setKernelArg(Index, Size, PtrArg); - } - - bool isPtrArg(const void *ptr) { - return Args.PtrArgs.find(ptr) != Args.PtrArgs.end(); - } - - std::set &getPtrArgs() { return Args.PtrArgs; } - void setKernelLocalArg(int Index, size_t Size) { Args.addLocalArg(Index, Size); }