diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index c55d752410..b05d7c6348 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -22,8 +22,6 @@ namespace ur_sanitizer_layer { namespace { -constexpr auto kSPIR_DeviceSanitizerReportMem = "__DeviceSanitizerReportMem"; - uptr MemToShadow_CPU(uptr USM_SHADOW_BASE, uptr UPtr) { return USM_SHADOW_BASE + (UPtr >> 3); } @@ -348,11 +346,14 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); + auto KernelInfo = getKernelInfo(Kernel); + + UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -370,23 +371,12 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo) { - auto Program = GetProgram(Kernel); - ur_event_handle_t ReadEvent{}; - - // If kernel has defined SPIR_DeviceSanitizerReportMem, then we try to read it - // to host, but it's okay that it isn't defined + USMLaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback - auto Result = context.urDdiTable.Enqueue.pfnDeviceGlobalVariableRead( - Queue, Program, kSPIR_DeviceSanitizerReportMem, true, - sizeof(LaunchInfo.SPIR_DeviceSanitizerReportMem), 0, - &LaunchInfo.SPIR_DeviceSanitizerReportMem, 1, &Event, &ReadEvent); + auto Result = context.urDdiTable.Queue.pfnFinish(Queue); if (Result == UR_RESULT_SUCCESS) { - Event = ReadEvent; - - const auto &AH = LaunchInfo.SPIR_DeviceSanitizerReportMem; + const auto &AH = LaunchInfo.Data->SanitizerReport; if (!AH.Flag) { return UR_RESULT_SUCCESS; } @@ -627,13 +617,44 @@ ur_result_t SanitizerInterceptor::eraseDevice(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } +ur_result_t SanitizerInterceptor::insertKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return UR_RESULT_SUCCESS; + } + m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + m_KernelMap.erase(Kernel); + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::prepareLaunch( ur_context_handle_t Context, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Program = GetProgram(Kernel); do { + // Set launch info argument + auto ArgNums = GetKernelNumArgs(Kernel); + if (ArgNums) { + context.logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, + (void *)LaunchInfo.Data->LocalArgs); + ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, &LaunchInfo.Data); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to set launch info: {}", URes); + return URes; + } + } + // Write global variable to program auto EnqueueWriteGlobal = [Queue, Program](const char *Name, const void *Value, @@ -723,15 +744,17 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "LocalShadowMemorySize={})", NumWG, LocalMemorySize, LocalShadowMemorySize); - UR_CALL(EnqueueAllocateDevice(LocalShadowMemorySize, - LaunchInfo.LocalShadowOffset)); + UR_CALL(EnqueueAllocateDevice( + LocalShadowMemorySize, LaunchInfo.Data->LocalShadowOffset)); - LaunchInfo.LocalShadowOffsetEnd = - LaunchInfo.LocalShadowOffset + LocalShadowMemorySize - 1; + LaunchInfo.Data->LocalShadowOffsetEnd = + LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - + 1; - context.logger.info("ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.LocalShadowOffset, - (void *)LaunchInfo.LocalShadowOffsetEnd); + context.logger.info( + "ShadowMemory(Local, {} - {})", + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } } while (false); @@ -749,15 +772,61 @@ SanitizerInterceptor::findAllocInfoByAddress(uptr Address) { return --It; } -LaunchInfo::~LaunchInfo() { +ur_result_t USMLaunchInfo::initialize() { + UR_CALL(context.urDdiTable.Context.pfnRetain(Context)); + UR_CALL(context.urDdiTable.Device.pfnRetain(Device)); + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); + *Data = LaunchInfo{}; + return UR_RESULT_SUCCESS; +} + +ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { + auto NumArgs = KI.LocalArgs.size(); + if (NumArgs) { + Data->NumLocalArgs = NumArgs; + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, + (void **)&Data->LocalArgs)); + uint32_t i = 0; + for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { + Data->LocalArgs[i++] = ArgInfo; + context.logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, + ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + } + return UR_RESULT_SUCCESS; +} + +USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; - if (LocalShadowOffset) { - Result = - context.urDdiTable.USM.pfnFree(Context, (void *)LocalShadowOffset); + if (Data) { + auto Type = GetDeviceType(Device); + if (Type == DeviceType::GPU_PVC) { + if (Data->PrivateShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void *)Data->PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + if (Data->LocalShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void *)Data->LocalShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + } + if (Data->LocalArgs) { + Result = context.urDdiTable.USM.pfnFree(Context, + (void *)Data->LocalArgs); + assert(Result == UR_RESULT_SUCCESS); + } + Result = context.urDdiTable.USM.pfnFree(Context, (void *)Data); assert(Result == UR_RESULT_SUCCESS); } Result = context.urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); + Result = context.urDdiTable.Device.pfnRelease(Device); + assert(Result == UR_RESULT_SUCCESS); } } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index a691bee7b7..1a699df1f6 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -79,6 +79,26 @@ struct QueueInfo { } }; +struct KernelInfo { + ur_kernel_handle_t Handle; + + ur_shared_mutex Mutex; + // Need preserve the order of local arguments + std::map LocalArgs; + + explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRetain(Kernel); + assert(Result == UR_RESULT_SUCCESS); + } + + ~KernelInfo() { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + } +}; + struct ContextInfo { ur_context_handle_t Handle; @@ -107,31 +127,30 @@ struct ContextInfo { } }; -struct LaunchInfo { - uptr LocalShadowOffset = 0; - uptr LocalShadowOffsetEnd = 0; - DeviceSanitizerReport SPIR_DeviceSanitizerReportMem; +struct USMLaunchInfo { + LaunchInfo *Data; ur_context_handle_t Context = nullptr; + ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; const size_t *GlobalWorkOffset = nullptr; std::vector LocalWorkSize; uint32_t WorkDim = 0; - LaunchInfo(ur_context_handle_t Context, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, const size_t *GlobalWorkOffset, - uint32_t WorkDim) - : Context(Context), GlobalWorkSize(GlobalWorkSize), + USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) + : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { - [[maybe_unused]] auto Result = - context.urDdiTable.Context.pfnRetain(Context); - assert(Result == UR_RESULT_SUCCESS); if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } } - ~LaunchInfo(); + ~USMLaunchInfo(); + + ur_result_t initialize(); + ur_result_t updateKernelInfo(const KernelInfo &KI); }; struct DeviceGlobalInfo { @@ -158,12 +177,11 @@ class SanitizerInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -173,6 +191,9 @@ class SanitizerInterceptor { std::shared_ptr &CI); ur_result_t eraseDevice(ur_device_handle_t Device); + ur_result_t insertKernel(ur_kernel_handle_t Kernel); + ur_result_t eraseKernel(ur_kernel_handle_t Kernel); + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -181,6 +202,18 @@ class SanitizerInterceptor { return m_ContextMap[Context]; } + std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { + std::shared_lock Guard(m_DeviceMapMutex); + assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); + return m_DeviceMap[Device]; + } + + std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { + std::shared_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + return m_KernelMap[Kernel]; + } + private: ur_result_t updateShadowMemory(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, @@ -195,26 +228,23 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); - std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { - std::shared_lock Guard(m_DeviceMapMutex); - assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); - return m_DeviceMap[Device]; - } - private: std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; - std::unordered_map> m_DeviceMap; ur_shared_mutex m_DeviceMapMutex; + std::unordered_map> + m_KernelMap; + ur_shared_mutex m_KernelMapMutex; + /// Assumption: all USM chunks are allocated in one VA AllocationMap m_AllocationMap; ur_shared_mutex m_AllocationMapMutex; diff --git a/source/loader/layers/sanitizer/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan_libdevice.hpp index 46ddee4423..1c8ef24a9d 100644 --- a/source/loader/layers/sanitizer/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan_libdevice.hpp @@ -62,6 +62,23 @@ struct DeviceSanitizerReport { bool IsRecover = false; }; +struct LocalArgsInfo { + uint64_t Size = 0; + uint64_t SizeWithRedZone = 0; +}; + +struct LaunchInfo { + uintptr_t PrivateShadowOffset = + 0; // don't move this field, we use it in AddressSanitizerPass + + uintptr_t LocalShadowOffset = 0; + uintptr_t LocalShadowOffsetEnd = 0; + DeviceSanitizerReport SanitizerReport; + + uint32_t NumLocalArgs = 0; + LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex +}; + constexpr unsigned ASAN_SHADOW_SCALE = 3; constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << ASAN_SHADOW_SCALE; diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp index d5612100aa..1d43c512da 100644 --- a/source/loader/layers/sanitizer/common.hpp +++ b/source/loader/layers/sanitizer/common.hpp @@ -65,6 +65,41 @@ inline constexpr uptr ComputeRZLog(uptr user_requested_size) { return rz_log; } +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. +/// +/// Examples: +/// \code +/// alignTo(5, 8) = 8 +/// alignTo(17, 8) = 24 +/// alignTo(~0LL, 8) = 0 +/// alignTo(321, 255) = 510 +/// \endcode +inline uint64_t AlignTo(uint64_t Value, uint64_t Align) { + assert(Align != 0u && "Align can't be 0."); + return (Value + Align - 1) / Align * Align; +} + +inline uint64_t GetSizeAndRedzoneSizeForLocal(uint64_t Size, + uint64_t Granularity, + uint64_t Alignment) { + uint64_t Res = 0; + if (Size <= 4) { + Res = 16; + } else if (Size <= 16) { + Res = 32; + } else if (Size <= 128) { + Res = Size + 32; + } else if (Size <= 512) { + Res = Size + 64; + } else if (Size <= 4096) { + Res = Size + 128; + } else { + Res = Size + 256; + } + return AlignTo(std::max(Res, 2 * Granularity), Alignment); +} + // ================================================================ // Trace an internal UR call; returns in case of an error. diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 53ce5d1c1e..58f54c9338 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -272,8 +272,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( context.logger.debug("==== urEnqueueKernelLaunch"); - LaunchInfo LaunchInfo(GetContext(hQueue), pGlobalWorkSize, pLocalWorkSize, - pGlobalWorkOffset, workDim); + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); + UR_CALL(LaunchInfo.initialize()); UR_CALL(context.interceptor->preLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -283,8 +285,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, numEventsInWaitList, phEventWaitList, &hEvent); if (result == UR_RESULT_SUCCESS) { - UR_CALL(context.interceptor->postLaunchKernel(hKernel, hQueue, hEvent, - LaunchInfo)); + UR_CALL( + context.interceptor->postLaunchKernel(hKernel, hQueue, LaunchInfo)); } if (phEvent) { @@ -374,6 +376,90 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelCreate +__urdlllocal ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t hProgram, ///< [in] handle of the program instance + const char *pKernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *phKernel ///< [out] pointer to handle of kernel object created. +) { + auto pfnCreate = context.urDdiTable.Kernel.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelCreate"); + + UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); + UR_CALL(context.interceptor->insertKernel(*phKernel)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelRelease +__urdlllocal ur_result_t urKernelRelease( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release +) { + auto pfnRelease = context.urDdiTable.Kernel.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelRelease"); + UR_CALL(pfnRelease(hKernel)); + + if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { + uint32_t RefCount; + UR_CALL(context.urDdiTable.Kernel.pfnGetInfo( + hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount), + &RefCount, nullptr)); + if (RefCount == 1) { + UR_CALL(context.interceptor->eraseKernel(hKernel)); + } + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgLocal +__urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t + argSize, ///< [in] size of the local buffer to be allocated by the runtime + const ur_kernel_arg_local_properties_t + *pProperties ///< [in][optional] pointer to local buffer properties. +) { + auto pfnSetArgLocal = context.urDdiTable.Kernel.pfnSetArgLocal; + + if (nullptr == pfnSetArgLocal) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgLocal (argIndex={}, argSize={})", + argIndex, argSize); + + { + auto KI = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + // TODO: get local variable alignment + auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( + argSize, ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY); + KI->LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; + argSize = argSizeWithRZ; + } + + ur_result_t result = + pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -470,6 +556,38 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Kernel table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_kernel_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; + pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; + pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Enqueue table /// with current process' addresses @@ -570,6 +688,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Context); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetKernelProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Kernel); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetProgramProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Program);