From a5bb5d4d0c266174b9e724d41aa0cd197e3585e4 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 12 Feb 2024 16:26:47 +0000 Subject: [PATCH 1/7] Implemented native USM Fill and removed piextUSMEnqueueMemset symbol --- sycl/include/sycl/detail/cg.hpp | 2 +- sycl/include/sycl/detail/pi.def | 2 +- sycl/include/sycl/detail/pi.h | 27 +++++++------- sycl/include/sycl/handler.hpp | 10 +++--- sycl/plugins/cuda/pi_cuda.cpp | 12 +++---- sycl/plugins/hip/pi_hip.cpp | 12 +++---- sycl/plugins/level_zero/pi_level_zero.cpp | 23 ++++++------ sycl/plugins/native_cpu/pi_native_cpu.cpp | 12 +++---- sycl/plugins/opencl/pi_opencl.cpp | 12 +++---- sycl/plugins/unified_runtime/pi2ur.hpp | 14 ++++---- .../unified_runtime/pi_unified_runtime.cpp | 36 +++++++++---------- sycl/source/detail/graph_impl.hpp | 6 ++-- sycl/source/detail/memory_manager.cpp | 20 +++++------ sycl/source/detail/memory_manager.hpp | 6 ++-- sycl/source/detail/queue_impl.cpp | 4 +-- sycl/source/detail/scheduler/commands.cpp | 4 +-- sycl/source/handler.cpp | 10 ++++++ .../DiscardEvents/discard_events_usm.cpp | 14 ++++---- .../discard_events_usm_ooo_queue.cpp | 14 ++++---- sycl/test/abi/pi_cuda_symbol_check.dump | 2 +- sycl/test/abi/pi_hip_symbol_check.dump | 2 +- sycl/test/abi/pi_level_zero_symbol_check.dump | 2 +- sycl/test/abi/pi_opencl_symbol_check.dump | 4 +-- sycl/test/abi/sycl_symbols_linux.dump | 7 ++-- sycl/test/abi/sycl_symbols_windows.dump | 7 ++-- sycl/tools/xpti_helpers/usm_analyzer.hpp | 14 ++++---- sycl/unittests/SYCL2020/GetNativeOpenCL.cpp | 8 ++--- sycl/unittests/helpers/PiMockPlugin.hpp | 11 +++--- sycl/unittests/queue/USM.cpp | 6 ++-- sycl/unittests/queue/Wait.cpp | 5 +-- .../scheduler/InOrderQueueHostTaskDeps.cpp | 7 ++-- .../unittests/xpti_trace/QueueApiFailures.cpp | 20 +++++------ 32 files changed, 173 insertions(+), 162 deletions(-) diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index 5c2ffdc712c00..08edae99c506f 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ 
b/sycl/include/sycl/detail/cg.hpp @@ -299,7 +299,7 @@ class CGFillUSM : public CG { MPattern(std::move(Pattern)), MDst(DstPtr), MLength(Length) {} void *getDst() { return MDst; } size_t getLength() { return MLength; } - int getFill() { return MPattern[0]; } + const std::vector &getPattern() { return MPattern; } }; /// "Prefetch USM" command group class. diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 46a200e001231..ee7e4f3de4a90 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -128,7 +128,7 @@ _PI_API(piextUSMHostAlloc) _PI_API(piextUSMDeviceAlloc) _PI_API(piextUSMSharedAlloc) _PI_API(piextUSMFree) -_PI_API(piextUSMEnqueueMemset) +_PI_API(piextUSMEnqueueFill) _PI_API(piextUSMEnqueueMemcpy) _PI_API(piextUSMEnqueuePrefetch) _PI_API(piextUSMEnqueueMemAdvise) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 3c9076e09f66b..f693d116157dc 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -156,9 +156,10 @@ // piextEnqueueCooperativeKernelLaunch. // 15.46 Add piextGetGlobalVariablePointer // 15.47 Added PI_ERROR_FEATURE_UNSUPPORTED. +// 16.48 Replaced piextUSMEnqueueMemset with piextUSMEnqueueFill -#define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 47 +#define _PI_H_VERSION_MAJOR 16 +#define _PI_H_VERSION_MINOR 48 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -2020,22 +2021,22 @@ __SYCL_EXPORT pi_result piextUSMPitchedAlloc( /// \param ptr is the memory to be freed __SYCL_EXPORT pi_result piextUSMFree(pi_context context, void *ptr); -/// USM Memset API +/// USM Fill API /// /// \param queue is the queue to submit to -/// \param ptr is the ptr to memset -/// \param value is value to set. 
It is interpreted as an 8-bit value and the -/// upper -/// 24 bits are ignored -/// \param count is the size in bytes to memset +/// \param ptr is the ptr to fill +/// \param pattern is the ptr with the bytes of the pattern to set +/// \param patternSize is the size in bytes of the pattern to set +/// \param count is the size in bytes to fill /// \param num_events_in_waitlist is the number of events to wait on /// \param events_waitlist is an array of events to wait on /// \param event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueMemset(pi_queue queue, void *ptr, - pi_int32 value, size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event); +__SYCL_EXPORT pi_result piextUSMEnqueueFill(pi_queue queue, void *ptr, + const void *pattern, + size_t patternSize, size_t count, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event); /// USM Memcpy API /// diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 672f8da32c91c..4451bb7b6800c 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -2796,14 +2796,9 @@ class __SYCL_EXPORT handler { /// device copyable. /// \param Count is the number of times to fill Pattern into Ptr. template void fill(void *Ptr, const T &Pattern, size_t Count) { - throwIfActionIsCreated(); - setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); static_assert(is_device_copyable::value, "Pattern must be device copyable"); - parallel_for<__usmfill>(range<1>(Count), [=](id<1> Index) { - T *CastedPtr = static_cast(Ptr); - CastedPtr[Index] = Pattern; - }); + this->fill_impl(Ptr, &Pattern, sizeof(T), Count); } /// Prevents any commands submitted afterward to this queue from executing @@ -3613,6 +3608,9 @@ class __SYCL_EXPORT handler { }); } + // Implementation of USM fill using command for native fill. 
+ void fill_impl(void *Dest, const void *Value, size_t ValueSize, size_t Count); + // Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy. void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, size_t Width, size_t Height); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e6d395e758568..cc3ffa656a661 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -902,12 +902,12 @@ pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } -pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, - size_t Count, pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +pi_result piextUSMEnqueueFill(pi_queue Queue, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, pi_event *Event) { + return pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 2fbde10b77123..4a1fc086ceaf9 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -905,12 +905,12 @@ pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } -pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, - size_t Count, pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +pi_result 
piextUSMEnqueueFill(pi_queue Queue, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, pi_event *Event) { + return pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 26a1b104b3335..8019b5af76865 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -929,23 +929,22 @@ pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } -/// USM Memset API +/// USM Fill API /// /// @param Queue is the queue to submit to -/// @param Ptr is the ptr to memset -/// @param Value is value to set. It is interpreted as an 8-bit value and the -/// upper -/// 24 bits are ignored -/// @param Count is the size in bytes to memset +/// @param Ptr is the ptr to fill +/// @param Pattern is the ptr with the bytes of the pattern to set +/// @param PatternSize is the size in bytes of the pattern to set +/// @param Count is the size in bytes to fill /// @param NumEventsInWaitlist is the number of events to wait on /// @param EventsWaitlist is an array of events to wait on /// @param Event is the event that represents this operation -pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, - size_t Count, pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +pi_result piextUSMEnqueueFill(pi_queue Queue, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, pi_event *Event) { + return
pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 5174456e95a77..af7b7333da608 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -905,12 +905,12 @@ pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } -pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, - size_t Count, pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +pi_result piextUSMEnqueueFill(pi_queue Queue, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, pi_event *Event) { + return pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index e8a168b60445e..9ffcb0ce65f57 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -864,12 +864,12 @@ pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } -pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, - size_t Count, pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +pi_result 
piextUSMEnqueueFill(pi_queue Queue, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, pi_event *Event) { + return pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 14b7b4723c0dc..443a9c5a76cb4 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3856,11 +3856,12 @@ inline pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, return PI_SUCCESS; } -inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, - pi_int32 Value, size_t Count, - pi_uint32 NumEventsInWaitList, - const pi_event *EventsWaitList, - pi_event *OutEvent) { +inline pi_result piextUSMEnqueueFill(pi_queue Queue, void *Ptr, + const void *Pattern, size_t PatternSize, + size_t Count, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); if (!Ptr) { return PI_ERROR_INVALID_VALUE; @@ -3872,8 +3873,7 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, ur_event_handle_t *UREvent = reinterpret_cast(OutEvent); - size_t PatternSize = 1; - HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, &Value, Count, + HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, Pattern, Count, NumEventsInWaitList, UrEventsWaitList, UREvent)); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 8701d23027682..5d465cc10407e 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -437,24 +437,24 @@ __SYCL_EXPORT pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, ParamValueSizeRet); } 
-/// USM Memset API +/// USM Fill API /// -/// @param Queue is the queue to submit to -/// @param Ptr is the ptr to memset -/// @param Value is value to set. It is interpreted as an 8-bit value and the -/// upper -/// 24 bits are ignored -/// @param Count is the size in bytes to memset -/// @param NumEventsInWaitlist is the number of events to wait on -/// @param EventsWaitlist is an array of events to wait on -/// @param Event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, - pi_int32 Value, size_t Count, - pi_uint32 NumEventsInWaitlist, - const pi_event *EventsWaitlist, - pi_event *Event) { - return pi2ur::piextUSMEnqueueMemset( - Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +/// @param Queue is the queue to submit to +/// @param Ptr is the ptr to fill +/// @param Pattern is the ptr with the bytes of the pattern to set +/// @param PatternSize is the size in bytes of the pattern to set +/// @param Count is the size in bytes to fill +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +__SYCL_EXPORT pi_result piextUSMEnqueueFill(pi_queue Queue, void *Ptr, + const void *Pattern, + size_t PatternSize, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + return pi2ur::piextUSMEnqueueFill(Queue, Ptr, Pattern, PatternSize, Count, + NumEventsInWaitlist, EventsWaitlist, Event); } __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( @@ -1472,7 +1472,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piEnqueueMemBufferMap) _PI_API(piEnqueueMemUnmap) _PI_API(piEnqueueMemBufferFill) - _PI_API(piextUSMEnqueueMemset) + _PI_API(piextUSMEnqueueFill) _PI_API(piEnqueueMemBufferCopyRect) _PI_API(piEnqueueMemBufferCopy) _PI_API(piextUSMEnqueueMemcpy) diff --git
a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 73064b021edc5..9c725a4c73613 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -454,8 +454,10 @@ class node_impl { sycl::detail::CGFillUSM *FillUSM = static_cast(MCommandGroup.get()); Stream << "Dst: " << FillUSM->getDst() - << " Length: " << FillUSM->getLength() - << " Pattern: " << FillUSM->getFill() << "\\n"; + << " Length: " << FillUSM->getLength() << " Pattern: "; + for (auto byte : FillUSM->getPattern()) + Stream << byte; + Stream << "\\n"; } break; case sycl::detail::CG::CGTYPE::PrefetchUSM: diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index c0af141935bd5..e799533572a68 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -942,7 +942,7 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(!SrcQueue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); + "Host queue not supported in copy_usm."); if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -978,7 +978,7 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, } void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, - int Pattern, + const std::vector &Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { @@ -1001,14 +1001,14 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); const PluginPtr &Plugin = Queue->getPlugin(); - Plugin->call( - Queue->getHandleRef(), Mem, Pattern, Length, DepEvents.size(), - DepEvents.data(), OutEvent); + Plugin->call( + Queue->getHandleRef(), Mem, Pattern.data(), Pattern.size(), Length, + DepEvents.size(), 
DepEvents.data(), OutEvent); } // TODO: This function will remain until ABI-breaking change void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, - int Pattern, + const std::vector &Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent) { MemoryManager::fill_usm(Mem, Queue, Length, Pattern, DepEvents, OutEvent, @@ -1677,7 +1677,8 @@ void MemoryManager::ext_oneapi_copy_usm_cmd_buffer( void MemoryManager::ext_oneapi_fill_usm_cmd_buffer( sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, - size_t Len, int Pattern, std::vector Deps, + size_t Len, const std::vector &Pattern, + std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { if (!DstMem) @@ -1685,10 +1686,9 @@ void MemoryManager::ext_oneapi_fill_usm_cmd_buffer( PI_ERROR_INVALID_VALUE); const PluginPtr &Plugin = Context->getPlugin(); - // Pattern is interpreted as an unsigned char so pattern size is always 1. - size_t PatternSize = 1; + Plugin->call( - CommandBuffer, DstMem, &Pattern, PatternSize, Len, Deps.size(), + CommandBuffer, DstMem, Pattern.data(), Pattern.size(), Len, Deps.size(), Deps.data(), OutSyncPoint); } diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 1d2800bf9dadc..858e7232380ac 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -172,14 +172,14 @@ class __SYCL_EXPORT MemoryManager { sycl::detail::pi::PiEvent *OutEvent); static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, - int Pattern, + const std::vector &Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl); // TODO: This function will remain until ABI-breaking change static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, - int Pattern, + const std::vector &Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent); @@ -319,7 +319,7 @@ class __SYCL_EXPORT 
MemoryManager { static void ext_oneapi_fill_usm_cmd_buffer( sycl::detail::ContextImplPtr Context, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, - size_t Len, int Pattern, + size_t Len, const std::vector &Pattern, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 321cc48b29769..28a5b61a5c82e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -121,11 +121,11 @@ event queue_impl::memset(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif - + const std::vector Pattern{static_cast(Value)}; return submitMemOpHelper( Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self, - Count, Value); + Count, Pattern); } void report(const code_location &CodeLoc) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index efc553cdb97e2..85b867319b1db 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2847,7 +2847,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get(); MemoryManager::ext_oneapi_fill_usm_cmd_buffer( MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(), - Fill->getLength(), Fill->getFill(), std::move(MSyncPointDeps), + Fill->getLength(), Fill->getPattern(), std::move(MSyncPointDeps), &OutSyncPoint); MEvent->setSyncPoint(OutSyncPoint); return PI_SUCCESS; @@ -3041,7 +3041,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { case CG::CGTYPE::FillUSM: { CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get(); MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(), - Fill->getFill(), std::move(RawEvents), Event, + Fill->getPattern(), std::move(RawEvents), Event, 
MEvent); return PI_SUCCESS; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5b40f1b3b07c3..83de1c17bb888 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -971,6 +971,16 @@ void handler::mem_advise(const void *Ptr, size_t Count, int Advice) { setType(detail::CG::AdviseUSM); } +void handler::fill_impl(void *Dest, const void *Value, size_t ValueSize, + size_t Count) { + throwIfActionIsCreated(); + MDstPtr = Dest; + MPattern.resize(ValueSize); + std::memcpy(MPattern.data(), Value, ValueSize); + MLength = Count * ValueSize; + setType(detail::CG::FillUSM); +} + void handler::ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, size_t Width, size_t Height) { diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp index 48ab65c68896c..269706b10321d 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp @@ -12,7 +12,7 @@ // Since it is a warning it is safe to ignore for this test. 
// // Everything that follows TestQueueOperations() -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // @@ -20,8 +20,7 @@ // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // -// Q.fill don't use piEnqueueMemBufferFill -// CHECK: ---> piEnqueueKernelLaunch( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // @@ -47,7 +46,7 @@ // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // // RegularQueue -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -57,7 +56,7 @@ // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // // Everything that follows TestQueueOperationsViaSubmit() -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // @@ -65,8 +64,7 @@ // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // -// Q.fill don't use piEnqueueMemBufferFill -// CHECK: ---> piEnqueueKernelLaunch( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // @@ -92,7 +90,7 @@ // CHECK-NEXT: pi_event * : {{0|0000000000000000}}[ nullptr ] // // RegularQueue -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp index 96d53a632beb6..fb7555bab9d9f 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp +++ 
b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp @@ -12,7 +12,7 @@ // Since it is a warning it is safe to ignore for this test. // // Everything that follows TestQueueOperations() -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -22,8 +22,7 @@ // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS // -// Q.fill don't use piEnqueueMemBufferFill -// CHECK: ---> piEnqueueKernelLaunch( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -58,7 +57,7 @@ // CHECK: ---> pi_result : PI_SUCCESS // // RegularQueue -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -69,7 +68,7 @@ // CHECK: ---> pi_result : PI_SUCCESS // // Everything that follows TestQueueOperationsViaSubmit() -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -79,8 +78,7 @@ // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS // -// Q.fill don't use piEnqueueMemBufferFill -// CHECK: ---> piEnqueueKernelLaunch( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS @@ -115,7 +113,7 @@ // CHECK: ---> pi_result : PI_SUCCESS // // RegularQueue -// CHECK: ---> piextUSMEnqueueMemset( +// CHECK: ---> piextUSMEnqueueFill( // CHECK: pi_event * : // CHECK-NOT: pi_event * : {{0|0000000000000000}}[ nullptr ] // CHECK: ---> pi_result : PI_SUCCESS diff --git 
a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index ec83769469dcd..a81c70cb3a17c 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -151,11 +151,11 @@ piextQueueCreateWithNativeHandle piextQueueGetNativeHandle piextSignalExternalSemaphore piextUSMDeviceAlloc +piextUSMEnqueueFill piextUSMEnqueueFill2D piextUSMEnqueueMemAdvise piextUSMEnqueueMemcpy piextUSMEnqueueMemcpy2D -piextUSMEnqueueMemset piextUSMEnqueueMemset2D piextUSMEnqueuePrefetch piextUSMFree diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 975e3315c0197..e748e4054e816 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -151,11 +151,11 @@ piextQueueCreateWithNativeHandle piextQueueGetNativeHandle piextSignalExternalSemaphore piextUSMDeviceAlloc +piextUSMEnqueueFill piextUSMEnqueueFill2D piextUSMEnqueueMemAdvise piextUSMEnqueueMemcpy piextUSMEnqueueMemcpy2D -piextUSMEnqueueMemset piextUSMEnqueueMemset2D piextUSMEnqueuePrefetch piextUSMFree diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 336e1cd3cdd8e..97c23450e95ba 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -150,11 +150,11 @@ piextQueueCreateWithNativeHandle piextQueueGetNativeHandle piextSignalExternalSemaphore piextUSMDeviceAlloc +piextUSMEnqueueFill piextUSMEnqueueFill2D piextUSMEnqueueMemAdvise piextUSMEnqueueMemcpy piextUSMEnqueueMemcpy2D -piextUSMEnqueueMemset piextUSMEnqueueMemset2D piextUSMEnqueuePrefetch piextUSMFree diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index fa7c7a2dc0525..0d3d00c76c52e 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -81,8 +81,8 @@ piSamplerGetInfo piSamplerRelease piSamplerRetain piTearDown 
-piextCommandBufferAdviseUSM piextBindlessImageSamplerCreate +piextCommandBufferAdviseUSM piextCommandBufferCreate piextCommandBufferFillUSM piextCommandBufferFinalize @@ -150,11 +150,11 @@ piextQueueCreateWithNativeHandle piextQueueGetNativeHandle piextSignalExternalSemaphore piextUSMDeviceAlloc +piextUSMEnqueueFill piextUSMEnqueueFill2D piextUSMEnqueueMemAdvise piextUSMEnqueueMemcpy piextUSMEnqueueMemcpy2D -piextUSMEnqueueMemset piextUSMEnqueueMemset2D piextUSMEnqueuePrefetch piextUSMFree diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index dffee1588a04a..57cf0f03051f1 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3941,7 +3941,7 @@ _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2D_cmd_bufferESt10shared_ptr _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2H_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjPcjSC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyH2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPcjNS0_5rangeILi3EEENS0_2idILi3EEEjPvjSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager30ext_oneapi_copy_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEPKvP22_pi_ext_command_buffermPvSt6vectorIjSaIjEEPj -_ZN4sycl3_V16detail13MemoryManager30ext_oneapi_fill_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPvmiSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager30ext_oneapi_fill_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPvmRKSt6vectorIcSaIcEES9_IjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager32ext_oneapi_advise_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPKvm14_pi_mem_adviceSt6vectorIjSaIjEEPj 
_ZN4sycl3_V16detail13MemoryManager34ext_oneapi_prefetch_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPvmSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager3mapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEENS0_6access4modeEjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ @@ -3954,8 +3954,8 @@ _ZN4sycl3_V16detail13MemoryManager7releaseESt10shared_ptrINS1_12context_implEEPN _ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_RKS5_INS1_10event_implEE -_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_ -_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_RKS4_INS1_10event_implEE +_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmRKSt6vectorIcSaIcEES7_IP9_pi_eventSaISD_EEPSD_ +_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmRKSt6vectorIcSaIcEES7_IP9_pi_eventSaISD_EEPSD_RKS4_INS1_10event_implEE _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc _ZN4sycl3_V16detail13make_platformEmNS0_7backendE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE @@ -4165,6 +4165,7 @@ _ZN4sycl3_V17handler6memcpyEPvPKvm _ZN4sycl3_V17handler6memsetEPvim _ZN4sycl3_V17handler8finalizeEv _ZN4sycl3_V17handler8prefetchEPKvm +_ZN4sycl3_V17handler9fill_implEPvPKvmm _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb diff --git 
a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2f180817725d4..a3035337d2fcf 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1058,7 +1058,7 @@ ?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z ?ext_oneapi_fill_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAX_KPEBDIV?$range@$02@34@6V?$id@$02@34@IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z -?ext_oneapi_fill_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KHV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_fill_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KAEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_get_composite_devices@platform@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ ?ext_oneapi_get_graph@queue@_V1@sycl@@QEBA?AV?$command_graph@$0A@@experimental@oneapi@ext@23@XZ @@ -1109,8 +1109,9 @@ ?fill@MemoryManager@detail@_V1@sycl@@SAXPEAVSYCLMemObjI@234@PEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KPEBDIV?$range@$02@34@5V?$id@$02@34@IV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@7@AEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@7@@Z ?fill_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22AEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z 
?fill_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22AEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z -?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KHV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z -?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KHV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z +?fill_impl@handler@_V1@sycl@@AEAAXPEAXPEBX_K2@Z +?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KAEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z +?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KAEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z ?finalize@handler@_V1@sycl@@AEAA?AVevent@23@XZ ?finalize@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$command_graph@$00@34567@AEBVproperty_list@67@@Z ?finalizeImpl@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXXZ diff --git a/sycl/tools/xpti_helpers/usm_analyzer.hpp b/sycl/tools/xpti_helpers/usm_analyzer.hpp index 9cdf27dd98ddc..1eee3474c51ed 100644 --- a/sycl/tools/xpti_helpers/usm_analyzer.hpp +++ b/sycl/tools/xpti_helpers/usm_analyzer.hpp @@ -214,8 +214,8 @@ class USMAnalyzer { USMAnalyzer::handleUSMSharedAlloc); ArgHandlerPreCall.set_piextUSMFree(USMAnalyzer::handleUSMFree); ArgHandlerPreCall.set_piMemBufferCreate(USMAnalyzer::handleMemBufferCreate); - ArgHandlerPreCall.set_piextUSMEnqueueMemset( - 
USMAnalyzer::handleUSMEnqueueMemset); + ArgHandlerPreCall.set_piextUSMEnqueueFill( + USMAnalyzer::handleUSMEnqueueFill); ArgHandlerPreCall.set_piextUSMEnqueueMemcpy( USMAnalyzer::handleUSMEnqueueMemcpy); ArgHandlerPreCall.set_piextUSMEnqueuePrefetch( @@ -350,11 +350,11 @@ class USMAnalyzer { } } - static void handleUSMEnqueueMemset(const pi_plugin &, - std::optional, pi_queue, - void *ptr, pi_int32, size_t numBytes, - pi_uint32, const pi_event *, pi_event *) { - CheckPointerValidness("input parameter", ptr, numBytes, "memset"); + static void handleUSMEnqueueFill(const pi_plugin &, std::optional, + pi_queue, void *ptr, const void *, size_t, + size_t numBytes, pi_uint32, const pi_event *, + pi_event *) { + CheckPointerValidness("input parameter", ptr, numBytes, "fill"); } static void handleUSMEnqueueMemcpy(const pi_plugin &, diff --git a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp index 27967973c1363..3860833256048 100644 --- a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp +++ b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp @@ -75,9 +75,9 @@ pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name, return PI_SUCCESS; } -static pi_result redefinedUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, - pi_uint32, const pi_event *, - pi_event *event) { +static pi_result redefinedUSMEnqueueMemset(pi_queue, void *, const void *, + size_t, size_t, pi_uint32, + const pi_event *, pi_event *event) { *event = reinterpret_cast(new int{}); return PI_SUCCESS; } @@ -97,7 +97,7 @@ TEST(GetNative, GetNativeHandle) { Mock.redefineBefore(redefinedMemRetain); Mock.redefineBefore( redefinedMemBufferCreate); - Mock.redefineBefore( + Mock.redefineBefore( redefinedUSMEnqueueMemset); context Context(Plt); diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index be777f23df239..ce1ed45c7aa6d 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ 
-1199,11 +1199,12 @@ inline pi_result mock_piextUSMFree(pi_context context, void *ptr) { return PI_SUCCESS; } -inline pi_result mock_piextUSMEnqueueMemset(pi_queue queue, void *ptr, - pi_int32 value, size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { +inline pi_result mock_piextUSMEnqueueFill(pi_queue queue, void *ptr, + const void *pattern, + size_t patternSize, size_t count, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event) { *event = createDummyHandle(); return PI_SUCCESS; } diff --git a/sycl/unittests/queue/USM.cpp b/sycl/unittests/queue/USM.cpp index 30cdd2b37393e..03ff8c538d7ed 100644 --- a/sycl/unittests/queue/USM.cpp +++ b/sycl/unittests/queue/USM.cpp @@ -48,8 +48,8 @@ pi_result redefinedUSMEnqueueMemcpyAfter(pi_queue, pi_bool, void *, return PI_SUCCESS; } -pi_result redefinedUSMEnqueueMemsetAfter(pi_queue, void *, pi_int32, size_t, - pi_uint32, const pi_event *, +pi_result redefinedUSMEnqueueMemsetAfter(pi_queue, void *, const void *, size_t, + size_t, pi_uint32, const pi_event *, pi_event *Event) { // Set MEMSET to the event produced by the original USMEnqueueMemcpy MEMSET = *Event; @@ -64,7 +64,7 @@ TEST(USM, NoOpPreservesDependencyChain) { redefinedEnqueueEventsWaitAfter); Mock.redefineAfter( redefinedUSMEnqueueMemcpyAfter); - Mock.redefineAfter( + Mock.redefineAfter( redefinedUSMEnqueueMemsetAfter); context Ctx{Plt.get_devices()[0]}; diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 8b2d72055d847..f9ec3fb6083ac 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -37,7 +37,8 @@ pi_result redefinedQueueCreateEx(pi_context context, pi_device device, return PI_SUCCESS; } -pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, +pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, + const void *Pattern, size_t PatternSize, size_t Count, pi_uint32 Num_events_in_waitlist, 
const pi_event *Events_waitlist, @@ -88,7 +89,7 @@ TEST(QueueWait, QueueWaitTest) { Mock.redefineBefore( redefinedQueueCreateEx); Mock.redefineBefore(redefinedQueueFinish); - Mock.redefineBefore( + Mock.redefineBefore( redefinedUSMEnqueueMemset); Mock.redefineBefore(redefinedEventsWait); Mock.redefineBefore( diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 8693ff5e4c52b..07d36d479d3c0 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -61,7 +61,8 @@ inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, ExecutedCommands.push_back({CommandType::KERNEL, EventsCount}); return PI_SUCCESS; } -inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, +inline pi_result customextUSMEnqueueMemset(pi_queue, void *, const void *, + size_t, size_t, pi_uint32 EventsCount, const pi_event *, pi_event *) { ExecutedCommands.push_back({CommandType::MEMSET, EventsCount}); @@ -73,7 +74,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { sycl::unittest::PiMock Mock; Mock.redefineBefore( customEnqueueKernelLaunch); - Mock.redefineBefore( + Mock.redefineBefore( customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); @@ -126,7 +127,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { sycl::unittest::PiMock Mock; Mock.redefineBefore( customEnqueueKernelLaunch); - Mock.redefineBefore( + Mock.redefineBefore( customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); diff --git a/sycl/unittests/xpti_trace/QueueApiFailures.cpp b/sycl/unittests/xpti_trace/QueueApiFailures.cpp index 88c5fae49394c..c634f2dfce299 100644 --- a/sycl/unittests/xpti_trace/QueueApiFailures.cpp +++ b/sycl/unittests/xpti_trace/QueueApiFailures.cpp @@ -145,7 +145,8 @@ TEST_F(QueueApiFailures, QueueSingleTask) { EXPECT_FALSE(queryReceivedNotifications(TraceType, Message)); } -pi_result 
redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, +pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, + const void *Pattern, size_t PatternSize, size_t Count, pi_uint32 Num_events_in_waitlist, const pi_event *Events_waitlist, @@ -154,7 +155,7 @@ pi_result redefinedUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, } TEST_F(QueueApiFailures, QueueMemset) { - MockPlugin.redefine( + MockPlugin.redefine( redefinedUSMEnqueueMemset); MockPlugin.redefine( redefinedPluginGetLastError); @@ -241,18 +242,17 @@ TEST_F(QueueApiFailures, QueueCopy) { EXPECT_FALSE(queryReceivedNotifications(TraceType, Message)); } -pi_result redefinedEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, - const void *Pattern, size_t PatternSize, - size_t Offset, size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, - pi_event *Event) { +pi_result redefinedUSMEnqueueFill(pi_queue Queue, void *Ptr, + const void *Pattern, size_t PatternSize, + size_t Count, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } TEST_F(QueueApiFailures, QueueFill) { - MockPlugin.redefine( - redefinedEnqueueMemBufferFill); + MockPlugin.redefine( + redefinedUSMEnqueueFill); MockPlugin.redefine( redefinedPluginGetLastError); sycl::queue Q; From bd59ffced44c53848dea9ca4f746b53f2fb6bd29 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 28 Feb 2024 13:04:38 -0500 Subject: [PATCH 2/7] Added overload to fill usm and pointed to UR PR tag --- sycl/plugins/unified_runtime/CMakeLists.txt | 10 ++-------- sycl/source/detail/memory_manager.cpp | 16 +++++++++++++++- sycl/source/detail/memory_manager.hpp | 9 ++++++++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 0eebfa5a02014..6678d0f16829b 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ 
b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -81,14 +81,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f - # Merge: ca5c3421 69c43b45 - # Author: Kenneth Benzie (Benie) - # Date: Tue Mar 19 21:00:20 2024 +0000 - # Merge pull request #1326 from hdelan/refactor-guess-local-worksize - # [CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter - set(UNIFIED_RUNTIME_TAG ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f) + set(UNIFIED_RUNTIME_REPO "https://github.com/konradkusiak97/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG URChangesForFillCombined) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index e799533572a68..4cad9353c7b9b 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1008,9 +1008,23 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, // TODO: This function will remain until ABI-breaking change void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, - const std::vector &Pattern, + int Pattern, + std::vector DepEvents, + sycl::detail::pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + std::vector vecPattern(sizeof(Pattern)); + std::memcpy(vecPattern.data(), &Pattern, sizeof(Pattern)); + MemoryManager::fill_usm(Mem, Queue, Length, Pattern, DepEvents, OutEvent, + OutEventImpl); +} + +// TODO: This function will remain until ABI-breaking change +void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, + int Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + std::vector vecPattern(sizeof(Pattern)); + 
std::memcpy(vecPattern.data(), &Pattern, sizeof(Pattern)); MemoryManager::fill_usm(Mem, Queue, Length, Pattern, DepEvents, OutEvent, nullptr); // OutEventImpl); } diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 858e7232380ac..0636fe6f3d1c0 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -179,7 +179,14 @@ class __SYCL_EXPORT MemoryManager { // TODO: This function will remain until ABI-breaking change static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, - const std::vector &Pattern, + int Pattern, + std::vector DepEvents, + sycl::detail::pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl); + + // TODO: This function will remain until ABI-breaking change + static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, + int Pattern, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent); From c0a5439f500accc603a1dea1a91e4bbfdc3b473a Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Thu, 29 Feb 2024 18:00:51 +0000 Subject: [PATCH 3/7] Fixed handling ABI breaking interfaces --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/source/detail/memory_manager.cpp | 4 +- sycl/source/handler.cpp | 3 +- sycl/test/abi/sycl_symbols_linux.dump | 3 +- sycl/unittests/Extensions/CommandGraph.cpp | 2547 +++++++++++++++++++ 5 files changed, 2554 insertions(+), 5 deletions(-) create mode 100644 sycl/unittests/Extensions/CommandGraph.cpp diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 6678d0f16829b..387458e7ee8ba 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -82,7 +82,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/konradkusiak97/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG URChangesForFillCombined) + set(UNIFIED_RUNTIME_TAG e400d8f37e6ccef65ff62491b417fc43b8c659ce) 
if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 4cad9353c7b9b..9b8d94d557f4c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1014,7 +1014,7 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, const detail::EventImplPtr &OutEventImpl) { std::vector vecPattern(sizeof(Pattern)); std::memcpy(vecPattern.data(), &Pattern, sizeof(Pattern)); - MemoryManager::fill_usm(Mem, Queue, Length, Pattern, DepEvents, OutEvent, + MemoryManager::fill_usm(Mem, Queue, Length, vecPattern, DepEvents, OutEvent, OutEventImpl); } @@ -1025,7 +1025,7 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, sycl::detail::pi::PiEvent *OutEvent) { std::vector vecPattern(sizeof(Pattern)); std::memcpy(vecPattern.data(), &Pattern, sizeof(Pattern)); - MemoryManager::fill_usm(Mem, Queue, Length, Pattern, DepEvents, OutEvent, + MemoryManager::fill_usm(Mem, Queue, Length, vecPattern, DepEvents, OutEvent, nullptr); // OutEventImpl); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 83de1c17bb888..b66fae23a1249 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -952,7 +952,7 @@ void handler::memset(void *Dest, int Value, size_t Count) { MDstPtr = Dest; MPattern.push_back(static_cast(Value)); MLength = Count; - setUserFacingNodeType(ext::oneapi::experimental::node_type::memset); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); setType(detail::CG::FillUSM); } @@ -978,6 +978,7 @@ void handler::fill_impl(void *Dest, const void *Value, size_t ValueSize, MPattern.resize(ValueSize); std::memcpy(MPattern.data(), Value, ValueSize); MLength = Count * ValueSize; + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); setType(detail::CG::FillUSM); } diff --git 
a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 57cf0f03051f1..115a8d55711e6 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3954,8 +3954,9 @@ _ZN4sycl3_V16detail13MemoryManager7releaseESt10shared_ptrINS1_12context_implEEPN _ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_RKS5_INS1_10event_implEE -_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmRKSt6vectorIcSaIcEES7_IP9_pi_eventSaISD_EEPSD_ _ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmRKSt6vectorIcSaIcEES7_IP9_pi_eventSaISD_EEPSD_RKS4_INS1_10event_implEE +_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_ +_ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_RKS4_INS1_10event_implEE _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc _ZN4sycl3_V16detail13make_platformEmNS0_7backendE _ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp new file mode 100644 index 0000000000000..c583d64e77927 --- /dev/null +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -0,0 +1,2547 @@ +//==--------------------- CommandGraph.cpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sycl/ext/oneapi/experimental/graph.hpp" +#include + +#include "../thread_safety/ThreadUtils.h" +#include "detail/graph_impl.hpp" + +#include +#include +#include +#include + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +// anonymous namespace used to avoid code redundancy by defining functions +// used by multiple times by unitests. +// Defining anonymous namespace prevents from function naming conflits +namespace { +/// Submits four kernels with diamond dependency to the queue Q +/// @param Q Queue to submit nodes to. +void runKernels(queue Q) { + auto NodeA = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on(NodeA); + cgh.single_task>([]() {}); + }); + auto NodeC = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on(NodeA); + cgh.single_task>([]() {}); + }); + auto NodeD = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on({NodeB, NodeC}); + cgh.single_task>([]() {}); + }); +} + +/// Submits four kernels without any additional dependencies the queue Q +/// @param Q Queue to submit nodes to. +void runKernelsInOrder(queue Q) { + auto NodeA = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeC = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeD = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); +} + +/// Adds four kernels with diamond dependency to the Graph G +/// @param G Modifiable graph to add commands to. 
+void addKernels( + experimental::command_graph G) { + auto NodeA = G.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeC = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeD = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeB, NodeC)}); +} + +bool checkExecGraphSchedule( + std::shared_ptr + GraphA, + std::shared_ptr + GraphB) { + auto ScheduleA = GraphA->getSchedule(); + auto ScheduleB = GraphB->getSchedule(); + if (ScheduleA.size() != ScheduleB.size()) + return false; + + std::vector< + std::shared_ptr> + VScheduleA{std::begin(ScheduleA), std::end(ScheduleA)}; + std::vector< + std::shared_ptr> + VScheduleB{std::begin(ScheduleB), std::end(ScheduleB)}; + + for (size_t i = 0; i < VScheduleA.size(); i++) { + if (!VScheduleA[i]->isSimilar(VScheduleB[i])) + return false; + } + return true; +} + +/// Define the three possible path to add node to a SYCL Graph. +/// Shortcut is a sub-type of Record&Replay using Queue shortcut +/// instead of standard kernel submitions. +enum OperationPath { Explicit, RecordReplay, Shortcut }; + +/// Tries to add a memcpy2D node to the graph G +/// It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_memcpy2d extension can not be used +/// along with SYCL Graph. +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. 
+/// @param Dest Pointer to the memory destination +/// @param DestPitch pitch at the destination +/// @param Src Pointer to the memory source +/// @param SrcPitch pitch at the source +/// @param Witdh width of the data to copy +/// @param Height height of the data to copy +template +void addMemcpy2D(experimental::detail::modifiable_command_graph &G, queue &Q, + void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, + size_t Width, size_t Height) { + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +/// Tries to add nodes including images bindless copy instructions +/// to the graph G. It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_bindless_images extension can not be used +/// along with SYCL Graph. +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. 
+/// @param Img Image memory +/// @param HostData Host Pointer to the memory +/// @param ImgUSM USM Pointer to Image memory +/// @param Pitch image pitch +/// @param Desc Image descriptor +template +void addImagesCopies(experimental::detail::modifiable_command_graph &G, + queue &Q, sycl::ext::oneapi::experimental::image_mem Img, + std::vector HostData, void *ImgUSM, + size_t Pitch, + sycl::ext::oneapi::experimental::image_descriptor Desc) { + // simple copy Host to Device + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), Img.get_handle(), Desc); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(HostData.data(), Img.get_handle(), Desc); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), Img.get_handle(), Desc); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + // simple copy Device to Host + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_copy(Img.get_handle(), HostData.data(), Desc); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(Img.get_handle(), HostData.data(), Desc); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(Img.get_handle(), HostData.data(), Desc); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + // simple copy Host to Device USM + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + 
CGH.ext_oneapi_copy(HostData.data(), ImgUSM, Desc, Pitch); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(HostData.data(), ImgUSM, Desc, Pitch); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), ImgUSM, Desc, Pitch); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + // subregion copy Host to Device + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), {0, 0, 0}, {0, 0, 0}, + Img.get_handle(), {0, 0, 0}, Desc, {0, 0, 0}); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(HostData.data(), {0, 0, 0}, {0, 0, 0}, Img.get_handle(), + {0, 0, 0}, Desc, {0, 0, 0}); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), {0, 0, 0}, {0, 0, 0}, + Img.get_handle(), {0, 0, 0}, Desc, {0, 0, 0}); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + // subregion copy Device to Host + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_copy(Img.get_handle(), {0, 0, 0}, Desc, HostData.data(), + {0, 0, 0}, {0, 0, 0}, {0, 0, 0}); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(Img.get_handle(), {0, 0, 0}, Desc, HostData.data(), + {0, 0, 0}, {0, 0, 0}, {0, 0, 0}); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(Img.get_handle(), {0, 0, 0}, Desc, HostData.data(), + {0, 0, 0}, {0, 0, 0}, {0, 0, 0}); + }); + } + } catch (exception &Exception) { + 
ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + // subregion copy Host to Device USM + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), {0, 0, 0}, ImgUSM, {0, 0, 0}, Desc, + Pitch, {0, 0, 0}, {0, 0, 0}); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_copy(HostData.data(), {0, 0, 0}, ImgUSM, {0, 0, 0}, Desc, + Pitch, {0, 0, 0}, {0, 0, 0}); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_copy(HostData.data(), {0, 0, 0}, ImgUSM, {0, 0, 0}, Desc, + Pitch, {0, 0, 0}, {0, 0, 0}); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} +} // anonymous namespace + +class CommandGraphTest : public ::testing::Test { +public: + CommandGraphTest() + : Mock{}, Plat{Mock.getPlatform()}, Dev{Plat.get_devices()[0]}, + Queue{Dev}, + Graph{Queue.get_context(), + Dev, + {experimental::property::graph::assume_buffer_outlives_graph{}}} { + } + +protected: + void SetUp() override {} + +protected: + unittest::PiMock Mock; + sycl::platform Plat; + sycl::device Dev; + sycl::queue Queue; + experimental::command_graph Graph; +}; + +TEST_F(CommandGraphTest, QueueState) { + experimental::queue_state State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::executing); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + Graph.begin_recording(Queue); + State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::recording); + + Graph.end_recording(); + State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::executing); +} + +TEST_F(CommandGraphTest, AddNode) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + 
ASSERT_TRUE(GraphImpl->MRoots.empty()); + + auto Node1 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node1), nullptr); + ASSERT_FALSE(sycl::detail::getSyclObjImpl(Node1)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ((*GraphImpl->MRoots.begin()).lock(), + sycl::detail::getSyclObjImpl(Node1)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + + // Add a node which depends on the first + auto Node2Deps = experimental::property::node::depends_on(Node1); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2Deps.get_dependencies().front()), + sycl::detail::getSyclObjImpl(Node1)); + auto Node2 = Graph.add([&](sycl::handler &cgh) {}, {Node2Deps}); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node2), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front().lock(), + sycl::detail::getSyclObjImpl(Node2)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); + + // Add a third node which depends on both + auto Node3 = + Graph.add([&](sycl::handler &cgh) {}, + {experimental::property::node::depends_on(Node1, Node2)}); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node3), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size(), 1lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); + 
ASSERT_EQ(sycl::detail::getSyclObjImpl(Node3)->MPredecessors.size(), 2lu); + + // Add a fourth node without any dependencies on the others + auto Node4 = Graph.add([&](sycl::handler &cgh) {}); + ASSERT_NE(sycl::detail::getSyclObjImpl(Node4), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node4)->isEmpty()); + ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 2lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size(), 1lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node3)->MPredecessors.size(), 2lu); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node4)->MPredecessors.empty()); +} + +TEST_F(CommandGraphTest, Finalize) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + sycl::buffer Buf(1); + auto Node1 = Graph.add([&](sycl::handler &cgh) { + sycl::accessor A(Buf, cgh, sycl::write_only, sycl::no_init); + cgh.single_task>([]() {}); + }); + + // Add independent node + auto Node2 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + // Add a node that depends on Node1 due to the accessor + auto Node3 = Graph.add([&](sycl::handler &cgh) { + sycl::accessor A(Buf, cgh, sycl::write_only, sycl::no_init); + cgh.single_task>([]() {}); + }); + + // Guarantee order of independent nodes 1 and 2 + Graph.make_edge(Node2, Node1); + + auto GraphExec = Graph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + + // The final schedule should contain three nodes in order: 2->1->3 + auto Schedule = GraphExecImpl->getSchedule(); + ASSERT_EQ(Schedule.size(), 3ul); + auto ScheduleIt = Schedule.begin(); + ASSERT_TRUE((*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node2))); + ScheduleIt++; + 
ASSERT_TRUE((*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node1))); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node3))); + ASSERT_EQ(Queue.get_context(), GraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, MakeEdge) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Add two independent nodes + auto Node1 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2 = Graph.add([&](sycl::handler &cgh) {}); + ASSERT_EQ(GraphImpl->MRoots.size(), 2ul); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.empty()); + + // Connect nodes and verify order + Graph.make_edge(Node1, Node2); + ASSERT_EQ(GraphImpl->MRoots.size(), 1ul); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front().lock(), + sycl::detail::getSyclObjImpl(Node2)); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MPredecessors.empty()); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.empty()); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size(), 1lu); +} + +TEST_F(CommandGraphTest, BeginEndRecording) { + sycl::queue Queue2{Queue.get_context(), Dev}; + + // Test throwing behaviour + // Check we can repeatedly begin recording on the same queues + ASSERT_NO_THROW(Graph.begin_recording(Queue)); + ASSERT_NO_THROW(Graph.begin_recording(Queue)); + ASSERT_NO_THROW(Graph.begin_recording(Queue2)); + ASSERT_NO_THROW(Graph.begin_recording(Queue2)); + // Check we can repeatedly end recording on the same queues + ASSERT_NO_THROW(Graph.end_recording(Queue)); + ASSERT_NO_THROW(Graph.end_recording(Queue)); + ASSERT_NO_THROW(Graph.end_recording(Queue2)); + 
ASSERT_NO_THROW(Graph.end_recording(Queue2)); + // Vector versions + ASSERT_NO_THROW(Graph.begin_recording({Queue, Queue2})); + ASSERT_NO_THROW(Graph.begin_recording({Queue, Queue2})); + ASSERT_NO_THROW(Graph.end_recording({Queue, Queue2})); + ASSERT_NO_THROW(Graph.end_recording({Queue, Queue2})); + + experimental::command_graph Graph2(Queue.get_context(), Dev); + + Graph.begin_recording(Queue); + // Trying to record to a second Graph should throw + ASSERT_ANY_THROW(Graph2.begin_recording(Queue)); + // Trying to end when it is recording to a different graph should throw + ASSERT_ANY_THROW(Graph2.end_recording(Queue)); + Graph.end_recording(Queue); + + // Testing return values of begin and end recording + // Queue should change state so should return true here + ASSERT_TRUE(Graph.begin_recording(Queue)); + // But not changed state here + ASSERT_FALSE(Graph.begin_recording(Queue)); + + // Queue2 should change state so should return true here + ASSERT_TRUE(Graph.begin_recording(Queue2)); + // But not changed state here + ASSERT_FALSE(Graph.begin_recording(Queue2)); + + // Queue should have changed state so should return true + ASSERT_TRUE(Graph.end_recording(Queue)); + // But not changed state here + ASSERT_FALSE(Graph.end_recording(Queue)); + + // Should end recording on Queue2 + ASSERT_TRUE(Graph.end_recording()); + // State should not change on Queue2 now + ASSERT_FALSE(Graph.end_recording(Queue2)); + + // Testing vector begin and end + ASSERT_TRUE(Graph.begin_recording({Queue, Queue2})); + // Both shoudl now not have state changed + ASSERT_FALSE(Graph.begin_recording(Queue)); + ASSERT_FALSE(Graph.begin_recording(Queue2)); + + // End recording on both + ASSERT_TRUE(Graph.end_recording({Queue, Queue2})); + // Both shoudl now not have state changed + ASSERT_FALSE(Graph.end_recording(Queue)); + ASSERT_FALSE(Graph.end_recording(Queue2)); + + // First add one single queue + ASSERT_TRUE(Graph.begin_recording(Queue)); + // Vector begin should still return true as Queue2 
has state changed + ASSERT_TRUE(Graph.begin_recording({Queue, Queue2})); + // End recording on Queue2 + ASSERT_TRUE(Graph.end_recording(Queue2)); + // Vector end should still return true as Queue will have state changed + ASSERT_TRUE(Graph.end_recording({Queue, Queue2})); +} + +TEST_F(CommandGraphTest, GetCGCopy) { + auto Node1 = Graph.add([&](sycl::handler &cgh) {}); + auto Node2 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node1)}); + + // Get copy of CG of Node2 and check equality + auto Node2Imp = sycl::detail::getSyclObjImpl(Node2); + auto Node2CGCopy = Node2Imp->getCGCopy(); + ASSERT_EQ(Node2CGCopy->getType(), Node2Imp->MCGType); + ASSERT_EQ(Node2CGCopy->getType(), sycl::detail::CG::Kernel); + ASSERT_EQ(Node2CGCopy->getType(), Node2Imp->MCommandGroup->getType()); + ASSERT_EQ(Node2CGCopy->getAccStorage(), + Node2Imp->MCommandGroup->getAccStorage()); + ASSERT_EQ(Node2CGCopy->getArgsStorage(), + Node2Imp->MCommandGroup->getArgsStorage()); + ASSERT_EQ(Node2CGCopy->getEvents(), Node2Imp->MCommandGroup->getEvents()); + ASSERT_EQ(Node2CGCopy->getRequirements(), + Node2Imp->MCommandGroup->getRequirements()); + ASSERT_EQ(Node2CGCopy->getSharedPtrStorage(), + Node2Imp->MCommandGroup->getSharedPtrStorage()); +} +TEST_F(CommandGraphTest, SubGraph) { + // Add sub-graph with two nodes + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node1Graph)}); + auto GraphExec = Graph.finalize(); + + // Add node to main graph followed by sub-graph and another node + experimental::command_graph MainGraph(Queue.get_context(), Dev); + auto Node1MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2MainGraph = + MainGraph.add([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }, + 
{experimental::property::node::depends_on(Node1MainGraph)}); + auto Node3MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node2MainGraph)}); + + // Assert order of the added sub-graph + ASSERT_NE(sycl::detail::getSyclObjImpl(Node2MainGraph), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2MainGraph)->MNodeType == + experimental::node_type::subgraph); + ASSERT_EQ(sycl::detail::getSyclObjImpl(MainGraph)->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(), + 1lu); + // Subgraph nodes are duplicated when inserted to parent graph on + // finalization. we thus check the node content only. + const bool CompareContentOnly = true; + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1MainGraph) + ->MSuccessors.front() + .lock() + ->MNodeType == experimental::node_type::subgraph); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MSuccessors.size(), + 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MPredecessors.size(), + 0lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MPredecessors.size(), + 1lu); + + // Finalize main graph and check schedule + auto MainGraphExec = MainGraph.finalize(); + auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec); + auto Schedule = MainGraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // The schedule list must contain 4 nodes: the two nodes from the subgraph are + // merged into the main graph in place of the subgraph node. 
+ ASSERT_EQ(Schedule.size(), 4ul); + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node1MainGraph))); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node1Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node2Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node3MainGraph))); + ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, SubGraphWithEmptyNode) { + // Add sub-graph with two nodes + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Empty1Graph = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on(Node1Graph)}); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Empty1Graph)}); + + auto GraphExec = Graph.finalize(); + + // Add node to main graph followed by sub-graph and another node + experimental::command_graph MainGraph(Queue.get_context(), Dev); + auto Node1MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2MainGraph = + MainGraph.add([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }, + {experimental::property::node::depends_on(Node1MainGraph)}); + auto Node3MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node2MainGraph)}); + + // Assert order of the added sub-graph + ASSERT_NE(sycl::detail::getSyclObjImpl(Node2MainGraph), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2MainGraph)->MNodeType == + experimental::node_type::subgraph); + // Check the structure of the main graph. 
+ // 1 root connected to 1 successor (the single root of the subgraph) + ASSERT_EQ(sycl::detail::getSyclObjImpl(MainGraph)->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(), + 1lu); + // Subgraph nodes are duplicated when inserted to parent graph. + // we thus check the node content only. + const bool CompareContentOnly = true; + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1MainGraph) + ->MSuccessors.front() + .lock() + ->MNodeType == experimental::node_type::subgraph); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(), + 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MSuccessors.size(), + 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MPredecessors.size(), + 0lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MPredecessors.size(), + 1lu); + + // Finalize main graph and check schedule + auto MainGraphExec = MainGraph.finalize(); + auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec); + auto Schedule = MainGraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // The schedule list must contain 5 nodes: 2 main graph nodes and 3 subgraph + // nodes which have been merged. 
+ ASSERT_EQ(Schedule.size(), 5ul); + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node1MainGraph))); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node1Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isEmpty()); // empty node inside the subgraph + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node2Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node3MainGraph))); + ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, SubGraphWithEmptyNodeLast) { + // Add sub-graph with two nodes + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node1Graph)}); + auto EmptyGraph = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on(Node2Graph)}); + + auto GraphExec = Graph.finalize(); + + // Add node to main graph followed by sub-graph and another node + experimental::command_graph MainGraph(Queue.get_context(), Dev); + auto Node1MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2MainGraph = + MainGraph.add([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }, + {experimental::property::node::depends_on(Node1MainGraph)}); + auto Node3MainGraph = MainGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node2MainGraph)}); + + // Assert order of the added sub-graph + ASSERT_NE(sycl::detail::getSyclObjImpl(Node2MainGraph), nullptr); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2MainGraph)->MNodeType == + experimental::node_type::subgraph); + // Check the structure of the main graph. 
+ // 1 root connected to 1 successor (the single root of the subgraph) + ASSERT_EQ(sycl::detail::getSyclObjImpl(MainGraph)->MRoots.size(), 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(), + 1lu); + // Subgraph nodes are duplicated when inserted to parent graph. + // we thus check the node content only. + const bool CompareContentOnly = true; + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1MainGraph) + ->MSuccessors.front() + .lock() + ->MNodeType == experimental::node_type::subgraph); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MSuccessors.size(), + 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MSuccessors.size(), + 1lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node1MainGraph)->MPredecessors.size(), + 0lu); + ASSERT_EQ(sycl::detail::getSyclObjImpl(Node2MainGraph)->MPredecessors.size(), + 1lu); + + // Finalize main graph and check schedule + auto MainGraphExec = MainGraph.finalize(); + auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec); + auto Schedule = MainGraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // The schedule list must contain 5 nodes: 2 main graph nodes and 3 subgraph + // nodes which have been merged. 
+ ASSERT_EQ(Schedule.size(), 5ul); + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node1MainGraph))); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node1Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt) + ->isSimilar(sycl::detail::getSyclObjImpl(Node2Graph), + CompareContentOnly)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isEmpty()); // empty node inside the subgraph + ScheduleIt++; + ASSERT_TRUE( + (*ScheduleIt)->isSimilar(sycl::detail::getSyclObjImpl(Node3MainGraph))); + ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, RecordSubGraph) { + // Record sub-graph with two nodes + Graph.begin_recording(Queue); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(Node1Graph); + cgh.single_task>([]() {}); + }); + Graph.end_recording(Queue); + auto GraphExec = Graph.finalize(); + + // Add node to main graph followed by sub-graph and another node + experimental::command_graph MainGraph(Queue.get_context(), Dev); + MainGraph.begin_recording(Queue); + auto Node1MainGraph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2MainGraph = Queue.submit([&](handler &cgh) { + cgh.depends_on(Node1MainGraph); + cgh.ext_oneapi_graph(GraphExec); + }); + auto Node3MainGraph = Queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(Node2MainGraph); + cgh.single_task>([]() {}); + }); + MainGraph.end_recording(Queue); + + // Finalize main graph and check schedule + auto MainGraphExec = MainGraph.finalize(); + auto MainGraphExecImpl = sycl::detail::getSyclObjImpl(MainGraphExec); + auto Schedule = MainGraphExecImpl->getSchedule(); + + // The schedule list must contain 4 nodes: 2 main graph nodes and 2 subgraph + // nodes which have been merged in to the main graph. 
+ ASSERT_EQ(Schedule.size(), 4ul); + + ASSERT_EQ(Queue.get_context(), MainGraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, InOrderQueue) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with three nodes + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + auto GraphExec = InOrderGraph.finalize(); + auto GraphExecImpl = 
sycl::detail::getSyclObjImpl(GraphExec); + auto Schedule = GraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + ASSERT_EQ(Schedule.size(), 3ul); + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode1)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode2)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode3)); + ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, InOrderQueueWithEmpty) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with a regular node then empty node then a regular + // node + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit([&](sycl::handler &cgh) {}); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + 
ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + // Note that empty nodes are not scheduled + auto GraphExec = InOrderGraph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + auto Schedule = GraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // the schedule list contains all types of nodes (even empty nodes) + ASSERT_EQ(Schedule.size(), 3ul); + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode1)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isEmpty()); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode3)); + ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, InOrderQueueWithEmptyFirst) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with an empty node then two regular nodes + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit([&](sycl::handler &cgh) {}); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + 
ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + // Note that empty nodes are not scheduled + auto GraphExec = InOrderGraph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + auto Schedule = GraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // the schedule list contains all types of nodes (even empty nodes) + ASSERT_EQ(Schedule.size(), 3ul); + ASSERT_TRUE((*ScheduleIt)->isEmpty()); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode2)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode3)); + ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with two regular nodes then an empty node + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = 
InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit([&](sycl::handler &cgh) {}); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + // Note that empty nodes are not scheduled + auto GraphExec = InOrderGraph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + auto Schedule = GraphExecImpl->getSchedule(); + auto ScheduleIt = Schedule.begin(); + // the schedule list contains all types of nodes (even empty nodes) + ASSERT_EQ(Schedule.size(), 3ul); + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode1)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isSimilar(PtrNode2)); + ScheduleIt++; + ASSERT_TRUE((*ScheduleIt)->isEmpty()); + ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext()); +} + +TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + auto EventInitial = + 
InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); }); + auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial); + + // Record in-order queue with three nodes. + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + auto EventLast = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast); + auto WaitList = EventLastImpl->getWaitList(); + // Previous task is a host task. Explicit dependency is needed to enforce the + // execution order. 
+ ASSERT_EQ(WaitList.size(), 1lu); + ASSERT_EQ(WaitList[0], EventInitialImpl); +} + +TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + auto EventInitial = + InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); }); + auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial); + + // Record in-order queue with three nodes. + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + 
InOrderGraph.end_recording(InOrderQueue); + + auto InOrderGraphExec = InOrderGraph.finalize(); + auto EventGraph = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); }); + + auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph); + auto EventGraphWaitList = EventGraphImpl->getWaitList(); + // Previous task is a host task. Explicit dependency is needed to enforce the + // execution order. + ASSERT_EQ(EventGraphWaitList.size(), 1lu); + ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl); + + auto EventLast = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast); + auto EventLastWaitList = EventLastImpl->getWaitList(); + // Previous task is not a host task. Explicit dependency is still needed + // to properly handle blocked tasks (the event will be filtered out before + // submission to the backend). + ASSERT_EQ(EventLastWaitList.size(), 1lu); + ASSERT_EQ(EventLastWaitList[0], EventGraphImpl); +} + +TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Check if device has usm shared allocation. + if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations)) + return; + size_t Size = 128; + std::vector TestDataHost(Size); + int *TestData = sycl::malloc_shared(Size, InOrderQueue); + + auto EventInitial = InOrderQueue.memset(TestData, 1, Size * sizeof(int)); + auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial); + + // Record in-order queue with three nodes. 
+ InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + auto InOrderGraphExec = InOrderGraph.finalize(); + auto EventGraph = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); }); + + auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph); + auto EventGraphWaitList = EventGraphImpl->getWaitList(); + // Previous task is a memset. Explicit dependency is needed to enforce the + // execution order. 
+ ASSERT_EQ(EventGraphWaitList.size(), 1lu); + ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl); + + auto EventLast = + InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int)); + auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast); + auto EventLastWaitList = EventLastImpl->getWaitList(); + // Previous task is not a host task. In Order queue dependency is managed by + // the backend for non-host kernels. + ASSERT_EQ(EventLastWaitList.size(), 0lu); +} + +TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Check if device has usm shared allocation. + if (!InOrderQueue.get_device().has(sycl::aspect::usm_shared_allocations)) + return; + size_t Size = 128; + std::vector TestDataHost(Size); + int *TestData = sycl::malloc_shared(Size, InOrderQueue); + + auto EventInitial = + InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int)); + auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial); + + // Record in-order queue with three nodes. 
+ InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode1 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode1, nullptr); + ASSERT_TRUE(PtrNode1->MPredecessors.empty()); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode2 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode2, nullptr); + ASSERT_NE(PtrNode2, PtrNode1); + ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2); + ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto PtrNode3 = + sycl::detail::getSyclObjImpl(InOrderGraph) + ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue)); + ASSERT_NE(PtrNode3, nullptr); + ASSERT_NE(PtrNode3, PtrNode2); + ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu); + ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3); + ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu); + ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2); + + InOrderGraph.end_recording(InOrderQueue); + + auto InOrderGraphExec = InOrderGraph.finalize(); + auto EventGraph = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); }); + + auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph); + auto EventGraphWaitList = EventGraphImpl->getWaitList(); + // Previous task is a memcpy. 
Explicit dependency is needed to enforce the + // execution order + ASSERT_EQ(EventGraphWaitList.size(), 1lu); + ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl); + + auto EventLast = + InOrderQueue.memcpy(TestData, TestDataHost.data(), Size * sizeof(int)); + auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast); + auto EventLastWaitList = EventLastImpl->getWaitList(); + // Previous task is not a host task. In Order queue dependency is managed by + // the backend for non-host kernels. + ASSERT_EQ(EventLastWaitList.size(), 0lu); +} + +TEST_F(CommandGraphTest, ExplicitBarrierException) { + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + auto Barrier = + Graph.add([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +TEST_F(CommandGraphTest, EnqueueBarrier) { + Graph.begin_recording(Queue); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Barrier = + Queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + + auto Node4Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node5Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(Queue); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | / + // \ | / + // (B) + // / \ + // (4) (5) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + for (auto Root : GraphImpl->MRoots) { + auto Node = Root.lock(); + ASSERT_EQ(Node->MSuccessors.size(), 1lu); + auto BarrierNode = Node->MSuccessors.front().lock(); + ASSERT_EQ(BarrierNode->MCGType, sycl::detail::CG::Barrier); + 
ASSERT_EQ(GraphImpl->getEventForNode(BarrierNode), + sycl::detail::getSyclObjImpl(Barrier)); + ASSERT_EQ(BarrierNode->MPredecessors.size(), 3lu); + ASSERT_EQ(BarrierNode->MSuccessors.size(), 2lu); + } +} + +TEST_F(CommandGraphTest, EnqueueBarrierMultipleQueues) { + sycl::queue Queue2{Queue.get_context(), Dev}; + Graph.begin_recording({Queue, Queue2}); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Barrier = + Queue2.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + + auto Node4Graph = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node5Graph = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | / + // \ | / + // (B) + // / \ + // (4) (5) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + for (auto Root : GraphImpl->MRoots) { + auto Node = Root.lock(); + ASSERT_EQ(Node->MSuccessors.size(), 1lu); + auto BarrierNode = Node->MSuccessors.front().lock(); + ASSERT_EQ(BarrierNode->MCGType, sycl::detail::CG::Barrier); + ASSERT_EQ(GraphImpl->getEventForNode(BarrierNode), + sycl::detail::getSyclObjImpl(Barrier)); + ASSERT_EQ(BarrierNode->MPredecessors.size(), 3lu); + ASSERT_EQ(BarrierNode->MSuccessors.size(), 2lu); + } +} + +TEST_F(CommandGraphTest, EnqueueBarrierWaitList) { + Graph.begin_recording(Queue); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Barrier = Queue.submit([&](sycl::handler &cgh) 
{ + cgh.ext_oneapi_barrier({Node1Graph, Node2Graph}); + }); + + auto Node4Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node5Graph = Queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(Node3Graph); + cgh.single_task>([]() {}); + }); + + Graph.end_recording(Queue); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | | + // \ | | + // (B) | + // / \ / + // (4) (5) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + for (auto Root : GraphImpl->MRoots) { + auto Node = Root.lock(); + ASSERT_EQ(Node->MSuccessors.size(), 1lu); + auto SuccNode = Node->MSuccessors.front().lock(); + if (SuccNode->MCGType == sycl::detail::CG::Barrier) { + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier)); + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 2lu); + } else { + // Node 5 + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + } + } +} + +TEST_F(CommandGraphTest, EnqueueBarrierWaitListMultipleQueues) { + sycl::queue Queue2{Queue.get_context(), Dev}; + Graph.begin_recording({Queue, Queue2}); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + // Node1Graph comes from Queue, and Node2Graph comes from Queue2 + auto Barrier = Queue.submit([&](sycl::handler &cgh) { + cgh.ext_oneapi_barrier({Node1Graph, Node2Graph}); + }); + + auto Node4Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node5Graph = Queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(Node3Graph); + cgh.single_task>([]() {}); + }); + + auto Barrier2 = Queue2.submit([&](sycl::handler &cgh) { + cgh.ext_oneapi_barrier({Barrier, Node4Graph, Node5Graph}); + }); + + 
Graph.end_recording(); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | | + // \ | | + // (B) | + // /|\ / + // (4)|(5) + // \ | / + // \|/ + // (B2) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + for (auto Root : GraphImpl->MRoots) { + auto Node = Root.lock(); + ASSERT_EQ(Node->MSuccessors.size(), 1lu); + auto SuccNode = Node->MSuccessors.front().lock(); + if (SuccNode->MCGType == sycl::detail::CG::Barrier) { + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier)); + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 3lu); + } else { + // Node 5 + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + } + } +} + +TEST_F(CommandGraphTest, EnqueueMultipleBarrier) { + Graph.begin_recording(Queue); + auto Node1Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Barrier1 = Queue.submit([&](sycl::handler &cgh) { + cgh.ext_oneapi_barrier({Node1Graph, Node2Graph}); + }); + + auto Node4Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node5Graph = Queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(Node3Graph); + cgh.single_task>([]() {}); + }); + + auto Barrier2 = + Queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + + auto Node6Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node7Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node8Graph = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + Graph.end_recording(Queue); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | | + // \ | | + // (B1) 
| + // /|\ / + // (4)|(5) + // \|/ + // (B2) + // /|\ + // / | \ + // (6) (7) (8) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + for (auto Root : GraphImpl->MRoots) { + auto Node = Root.lock(); + ASSERT_EQ(Node->MSuccessors.size(), 1lu); + auto SuccNode = Node->MSuccessors.front().lock(); + if (SuccNode->MCGType == sycl::detail::CG::Barrier) { + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier1)); + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 3lu); + for (auto Succ1 : SuccNode->MSuccessors) { + auto SuccBarrier1 = Succ1.lock(); + if (SuccBarrier1->MCGType == sycl::detail::CG::Barrier) { + ASSERT_EQ(GraphImpl->getEventForNode(SuccBarrier1), + sycl::detail::getSyclObjImpl(Barrier2)); + ASSERT_EQ(SuccBarrier1->MPredecessors.size(), 3lu); + ASSERT_EQ(SuccBarrier1->MSuccessors.size(), 3lu); + for (auto Succ2 : SuccBarrier1->MSuccessors) { + auto SuccBarrier2 = Succ2.lock(); + // Nodes 6, 7, 8 + ASSERT_EQ(SuccBarrier2->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccBarrier2->MSuccessors.size(), 0lu); + } + } else { + // Node 4 or Node 5 + if (GraphImpl->getEventForNode(SuccBarrier1) == + sycl::detail::getSyclObjImpl(Node4Graph)) { + // Node 4 + ASSERT_EQ(SuccBarrier1->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccBarrier1->MSuccessors.size(), 1lu); + } + } + } + } else { + // Node 5 + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 1lu); + } + } +} + +TEST_F(CommandGraphTest, DependencyLeavesKeyword1) { + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto EmptyNode = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on_all_leaves()}); + + auto GraphImpl = 
sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | / + // \ | / + // (E) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + auto EmptyImpl = sycl::detail::getSyclObjImpl(EmptyNode); + ASSERT_EQ(EmptyImpl->MPredecessors.size(), 3lu); + ASSERT_EQ(EmptyImpl->MSuccessors.size(), 0lu); + + auto Node1Impl = sycl::detail::getSyclObjImpl(Node1Graph); + ASSERT_EQ(Node1Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node1Impl->MSuccessors[0].lock(), EmptyImpl); + auto Node2Impl = sycl::detail::getSyclObjImpl(Node2Graph); + ASSERT_EQ(Node2Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node2Impl->MSuccessors[0].lock(), EmptyImpl); + auto Node3Impl = sycl::detail::getSyclObjImpl(Node3Graph); + ASSERT_EQ(Node3Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node3Impl->MSuccessors[0].lock(), EmptyImpl); +} + +TEST_F(CommandGraphTest, DependencyLeavesKeyword2) { + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node3Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node4Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node3Graph)}); + + auto EmptyNode = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on_all_leaves()}); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1) (2) (3) + // \ | / + // \ | (4) + // \| / + // (E) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + auto EmptyImpl = sycl::detail::getSyclObjImpl(EmptyNode); + ASSERT_EQ(EmptyImpl->MPredecessors.size(), 3lu); + ASSERT_EQ(EmptyImpl->MSuccessors.size(), 0lu); + + auto Node1Impl = sycl::detail::getSyclObjImpl(Node1Graph); + ASSERT_EQ(Node1Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node1Impl->MSuccessors[0].lock(), EmptyImpl); + auto Node2Impl = 
sycl::detail::getSyclObjImpl(Node2Graph); + ASSERT_EQ(Node2Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node2Impl->MSuccessors[0].lock(), EmptyImpl); + auto Node3Impl = sycl::detail::getSyclObjImpl(Node3Graph); + ASSERT_EQ(Node3Impl->MSuccessors.size(), 1lu); + + auto Node4Impl = sycl::detail::getSyclObjImpl(Node4Graph); + ASSERT_EQ(Node4Impl->MPredecessors.size(), 1lu); + ASSERT_EQ(Node4Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node4Impl->MSuccessors[0].lock(), EmptyImpl); +} + +TEST_F(CommandGraphTest, DependencyLeavesKeyword3) { + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto EmptyNode = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on_all_leaves()}); + auto Node3Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node1Graph)}); + auto Node4Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(EmptyNode)}); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1)(2) + // |\ | + // | (E) + // (3) | + // (4) + ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); + auto EmptyImpl = sycl::detail::getSyclObjImpl(EmptyNode); + ASSERT_EQ(EmptyImpl->MPredecessors.size(), 2lu); + ASSERT_EQ(EmptyImpl->MSuccessors.size(), 1lu); + + auto Node1Impl = sycl::detail::getSyclObjImpl(Node1Graph); + auto Node2Impl = sycl::detail::getSyclObjImpl(Node2Graph); + ASSERT_EQ(Node1Impl->MSuccessors.size(), 2lu); + ASSERT_EQ(Node2Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node2Impl->MSuccessors[0].lock(), EmptyImpl); + + auto Node3Impl = sycl::detail::getSyclObjImpl(Node3Graph); + ASSERT_EQ(Node3Impl->MPredecessors.size(), 1lu); + ASSERT_EQ(Node3Impl->MPredecessors[0].lock(), Node1Impl); + + auto Node4Impl = 
sycl::detail::getSyclObjImpl(Node4Graph); + ASSERT_EQ(Node4Impl->MPredecessors.size(), 1lu); + ASSERT_EQ(Node4Impl->MPredecessors[0].lock(), EmptyImpl); +} + +TEST_F(CommandGraphTest, DependencyLeavesKeyword4) { + auto Node1Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto EmptyNode = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on_all_leaves()}); + auto Node3Graph = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto EmptyNode2 = + Graph.add([&](sycl::handler &cgh) { /*empty node */ }, + {experimental::property::node::depends_on_all_leaves()}); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + // Check the graph structure + // (1)(2) + // \/ + // (E1) (3) + // \ / + // (E2) + ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); + auto EmptyImpl = sycl::detail::getSyclObjImpl(EmptyNode); + ASSERT_EQ(EmptyImpl->MPredecessors.size(), 2lu); + ASSERT_EQ(EmptyImpl->MSuccessors.size(), 1lu); + + auto Node1Impl = sycl::detail::getSyclObjImpl(Node1Graph); + ASSERT_EQ(Node1Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node1Impl->MSuccessors[0].lock(), EmptyImpl); + auto Node2Impl = sycl::detail::getSyclObjImpl(Node2Graph); + ASSERT_EQ(Node2Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node2Impl->MSuccessors[0].lock(), EmptyImpl); + + auto EmptyImpl2 = sycl::detail::getSyclObjImpl(EmptyNode2); + auto Node3Impl = sycl::detail::getSyclObjImpl(Node3Graph); + ASSERT_EQ(Node3Impl->MPredecessors.size(), 0lu); + ASSERT_EQ(Node3Impl->MSuccessors.size(), 1lu); + ASSERT_EQ(Node3Impl->MSuccessors[0].lock(), EmptyImpl2); + + ASSERT_EQ(EmptyImpl2->MPredecessors.size(), 2lu); +} + +TEST_F(CommandGraphTest, FusionExtensionExceptionCheck) { + device D; + if (!D.get_info< + ext::codeplay::experimental::info::device::supports_fusion>()) { + // Skip this test if the device does not support fusion. 
Otherwise, the + // queue construction in the next step would fail. + GTEST_SKIP(); + } + + queue Q{D, ext::codeplay::experimental::property::queue::enable_fusion{}}; + + experimental::command_graph Graph{ + Q.get_context(), Q.get_device()}; + + ext::codeplay::experimental::fusion_wrapper fw{Q}; + + // Test: Start fusion on a queue that is in recording mode + Graph.begin_recording(Q); + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + fw.start_fusion(); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + Graph.end_recording(Q); + + // Test: begin recording a queue in fusion mode + + fw.start_fusion(); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.begin_recording(Q); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +TEST_F(CommandGraphTest, Memcpy2DExceptionCheck) { + constexpr size_t RECT_WIDTH = 30; + constexpr size_t RECT_HEIGHT = 21; + constexpr size_t SRC_ELEMS = RECT_WIDTH * RECT_HEIGHT; + constexpr size_t DST_ELEMS = SRC_ELEMS; + + using T = int; + + Graph.begin_recording(Queue); + + T *USMMemSrc = malloc_device(SRC_ELEMS, Queue); + T *USMMemDst = malloc_device(DST_ELEMS, Queue); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + Graph.end_recording(); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + sycl::free(USMMemSrc, Queue); + sycl::free(USMMemDst, Queue); +} + +// Tests that using reductions in a graph will throw. 
+TEST_F(CommandGraphTest, Reductions) { + int ReduVar = 0; + ASSERT_THROW( + { + try { + Graph.add([&](handler &CGH) { + CGH.parallel_for( + range<1>{1}, reduction(&ReduVar, int{0}, sycl::plus<>()), + [=](item<1> idx, auto &Sum) {}); + }); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); +} + +TEST_F(CommandGraphTest, BindlessExceptionCheck) { + auto Ctxt = Queue.get_context(); + + // declare image data + size_t Height = 13; + size_t Width = 7; + size_t Depth = 11; + size_t N = Height * Width * Depth; + std::vector DataIn(N); + + // Extension: image descriptor - can use the same for both images + sycl::ext::oneapi::experimental::image_descriptor Desc( + {Width, Height, Depth}, sycl::image_channel_order::rgba, + sycl::image_channel_type::fp32); + + // Extension: allocate memory on device and create the handle + // Input images memory + sycl::ext::oneapi::experimental::image_mem ImgMem(Desc, Dev, Ctxt); + // Extension: returns the device pointer to USM allocated pitched memory + size_t Pitch = 0; + auto ImgMemUSM = sycl::ext::oneapi::experimental::pitched_alloc_device( + &Pitch, Desc, Queue); + + Graph.begin_recording(Queue); + + addImagesCopies(Graph, Queue, ImgMem, DataIn, + ImgMemUSM, Pitch, Desc); + + addImagesCopies(Graph, Queue, ImgMem, DataIn, + ImgMemUSM, Pitch, Desc); + + Graph.end_recording(); + + addImagesCopies(Graph, Queue, ImgMem, DataIn, + ImgMemUSM, Pitch, Desc); + + sycl::free(ImgMemUSM, Ctxt); +} + +TEST_F(CommandGraphTest, MakeEdgeErrors) { + // Set up some nodes in the graph + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + // Test error on calling make_edge when a queue is recording to the graph + Graph.begin_recording(Queue); + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeB); + } catch (const sycl::exception &e) { + 
ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + Graph.end_recording(Queue); + + // Test error on Src and Dest being the same + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeA); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Test Src or Dest not being found in the graph + experimental::command_graph GraphOther{ + Queue.get_context(), Queue.get_device()}; + auto NodeOther = GraphOther.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeOther); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + ASSERT_THROW( + { + try { + Graph.make_edge(NodeOther, NodeB); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Test that adding a cycle with cycle checks leaves the graph in the correct + // state. 
+ + auto CheckGraphStructure = [&]() { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto NodeAImpl = sycl::detail::getSyclObjImpl(NodeA); + auto NodeBImpl = sycl::detail::getSyclObjImpl(NodeB); + + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ((*GraphImpl->MRoots.begin()).lock(), NodeAImpl); + + ASSERT_EQ(NodeAImpl->MSuccessors.size(), 1lu); + ASSERT_EQ(NodeAImpl->MPredecessors.size(), 0lu); + ASSERT_EQ(NodeAImpl->MSuccessors.front().lock(), NodeBImpl); + + ASSERT_EQ(NodeBImpl->MSuccessors.size(), 0lu); + ASSERT_EQ(NodeBImpl->MPredecessors.size(), 1lu); + ASSERT_EQ(NodeBImpl->MPredecessors.front().lock(), NodeAImpl); + }; + // Make a normal edge + ASSERT_NO_THROW(Graph.make_edge(NodeA, NodeB)); + + // Check the expected structure of the graph + CheckGraphStructure(); + + // Introduce a cycle, make sure it throws + ASSERT_THROW( + { + try { + Graph.make_edge(NodeB, NodeA); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Re-check graph structure to make sure the graph state has not been modified + CheckGraphStructure(); +} + +TEST_F(CommandGraphTest, InvalidBuffer) { + // Check that using a buffer with write_back enabled in a graph will throw. + int Data; + // Create a buffer which does not have write-back disabled. + buffer Buffer{&Data, range<1>{1}}; + + // Use this buffer in the graph, this should throw. + ASSERT_THROW( + { + try { + Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + }); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); +} + +TEST_F(CommandGraphTest, InvalidHostAccessor) { + // Check that creating a host_accessor on a buffer which is in use by a graph + // will throw. + + // Create a buffer which does not have write-back disabled. 
+ buffer Buffer{range<1>{1}}; + + { + // Create a graph in local scope so we can destroy it + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + // Add the buffer to the graph. + Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + }); + + // Attempt to create a host_accessor, which should throw. + ASSERT_THROW( + { + try { + host_accessor HostAcc{Buffer}; + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + } + // Graph is now out of scope so we should be able to create a host_accessor + ASSERT_NO_THROW({ host_accessor HostAcc{Buffer}; }); +} + +TEST_F(CommandGraphTest, GraphPartitionsMerging) { + // Tests that the parition merging algo works as expected in case of backward + // dependencies + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeHT1 = Graph.add([&](sycl::handler &cgh) { cgh.host_task([=]() {}); }, + {experimental::property::node::depends_on(NodeB)}); + auto NodeC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeHT1)}); + auto NodeD = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeB)}); + auto NodeHT2 = Graph.add([&](sycl::handler &cgh) { cgh.host_task([=]() {}); }, + {experimental::property::node::depends_on(NodeD)}); + auto NodeE = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeHT2)}); + auto NodeF = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeHT2)}); + + // Backward dependency + 
Graph.make_edge(NodeE, NodeHT1); + + auto GraphExec = Graph.finalize(); + auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); + auto PartitionsList = GraphExecImpl->getPartitions(); + ASSERT_EQ(PartitionsList.size(), 5ul); + ASSERT_FALSE(PartitionsList[0]->isHostTask()); + ASSERT_TRUE(PartitionsList[1]->isHostTask()); + ASSERT_FALSE(PartitionsList[2]->isHostTask()); + ASSERT_TRUE(PartitionsList[3]->isHostTask()); + ASSERT_FALSE(PartitionsList[4]->isHostTask()); +} + +TEST_F(CommandGraphTest, GetNodeQueries) { + // Tests graph and node queries for correctness + + // Add some nodes to the graph for testing and test after each addition. + auto RootA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 1lu); + ASSERT_EQ(GraphNodes.size(), 1lu); + } + auto RootB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 2lu); + } + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootA, RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 3lu); + } + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 4lu); + } + auto RootC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 3lu); + 
ASSERT_EQ(GraphNodes.size(), 5lu); + } + + ASSERT_EQ(RootA.get_predecessors().size(), 0lu); + ASSERT_EQ(RootA.get_successors().size(), 1lu); + ASSERT_EQ(RootB.get_predecessors().size(), 0lu); + ASSERT_EQ(RootB.get_successors().size(), 2lu); + ASSERT_EQ(RootC.get_predecessors().size(), 0lu); + ASSERT_EQ(RootC.get_successors().size(), 0lu); + ASSERT_EQ(NodeA.get_predecessors().size(), 2lu); + ASSERT_EQ(NodeA.get_successors().size(), 0lu); + ASSERT_EQ(NodeB.get_predecessors().size(), 1lu); + ASSERT_EQ(NodeB.get_successors().size(), 0lu); + + // List of nodes that we've added in the order they were added. + std::vector NodeList{RootA, RootB, NodeA, NodeB, RootC}; + auto GraphNodes = Graph.get_nodes(); + + // Check ordering of all nodes is correct + for (size_t i = 0; i < GraphNodes.size(); i++) { + ASSERT_EQ(sycl::detail::getSyclObjImpl(GraphNodes[i]), + sycl::detail::getSyclObjImpl(NodeList[i])); + } +} + +TEST_F(CommandGraphTest, NodeTypeQueries) { + + // Allocate some pointers for testing memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + auto NodeKernel = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_EQ(NodeKernel.get_type(), experimental::node_type::kernel); + + auto NodeMemcpy = Graph.add( + [&](sycl::handler &cgh) { cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemcpy.get_type(), experimental::node_type::memcpy); + + auto NodeMemset = Graph.add( + [&](sycl::handler &cgh) { cgh.memset(PtrB, 7, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memfill); + + auto NodeMemfill = + Graph.add([&](sycl::handler &cgh) { cgh.fill(PtrB, 7, 16); }); + ASSERT_EQ(NodeMemfill.get_type(), experimental::node_type::memfill); + + auto NodePrefetch = Graph.add( + [&](sycl::handler &cgh) { cgh.prefetch(PtrA, 16 * sizeof(int)); }); + ASSERT_EQ(NodePrefetch.get_type(), experimental::node_type::prefetch); + + auto NodeMemadvise = Graph.add( + [&](sycl::handler 
&cgh) { cgh.mem_advise(PtrA, 16 * sizeof(int), 1); }); + ASSERT_EQ(NodeMemadvise.get_type(), experimental::node_type::memadvise); + + // Use queue recording for barrier since it is not supported in explicit API + Graph.begin_recording(Queue); + auto EventBarrier = + Queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + Graph.end_recording(); + + auto NodeBarrier = experimental::node::get_node_from_event(EventBarrier); + ASSERT_EQ(NodeBarrier.get_type(), + experimental::node_type::ext_oneapi_barrier); + + auto NodeHostTask = + Graph.add([&](sycl::handler &cgh) { cgh.host_task([]() {}); }); + ASSERT_EQ(NodeHostTask.get_type(), experimental::node_type::host_task); + + auto NodeEmpty = Graph.add(); + ASSERT_EQ(NodeEmpty.get_type(), experimental::node_type::empty); + + experimental::command_graph Subgraph(Queue.get_context(), Dev); + // Add an empty node to the subgraph + Subgraph.add(); + + auto SubgraphExec = Subgraph.finalize(); + auto NodeSubgraph = Graph.add( + [&](sycl::handler &cgh) { cgh.ext_oneapi_graph(SubgraphExec); }); + ASSERT_EQ(NodeSubgraph.get_type(), experimental::node_type::subgraph); +} + +TEST_F(CommandGraphTest, GetNodeFromEvent) { + // Test getting a node from a recorded event and using that as a dependency + // for an explicit node + Graph.begin_recording(Queue); + auto EventKernel = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + experimental::node NodeKernelA = + experimental::node::get_node_from_event(EventKernel); + + // Add node as a dependency with the property + auto NodeKernelB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + experimental::property::node::depends_on(NodeKernelA)); + + // Test adding a dependency through make_edge + auto NodeKernelC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_NO_THROW(Graph.make_edge(NodeKernelA, NodeKernelC)); + + auto GraphExec = Graph.finalize(); +} + 
+TEST_F(CommandGraphTest, ProfilingException) { + Graph.begin_recording(Queue); + auto Event1 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Event2 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(Queue); + + try { + Event1.get_profiling_info(); + } catch (exception &Exception) { + ASSERT_FALSE( + std::string(Exception.what()) + .find("Profiling information is unavailable for events returned " + "from a submission to a queue in the recording state.") == + std::string::npos); + } +} + +class MultiThreadGraphTest : public CommandGraphTest { +public: + MultiThreadGraphTest() + : CommandGraphTest(), NumThreads(std::thread::hardware_concurrency()), + SyncPoint(NumThreads) { + Threads.reserve(NumThreads); + } + +protected: + const unsigned NumThreads; + Barrier SyncPoint; + std::vector Threads; +}; + +TEST_F(MultiThreadGraphTest, BeginEndRecording) { + auto RecordGraph = [&]() { + queue MyQueue{Queue.get_context(), Queue.get_device()}; + + SyncPoint.wait(); + + Graph.begin_recording(MyQueue); + runKernels(MyQueue); + Graph.end_recording(MyQueue); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef{Queue.get_context(), Queue.get_device()}; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + for (unsigned i = 0; i < NumThreads; ++i) { + queue MyQueue{Queue.get_context(), Queue.get_device()}; + GraphRef.begin_recording(MyQueue); + runKernels(MyQueue); + GraphRef.end_recording(MyQueue); + } + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, ExplicitAddNodes) { + auto RecordGraph = [&]() { + queue MyQueue{Queue.get_context(), 
Queue.get_device()}; + + SyncPoint.wait(); + addKernels(Graph); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + for (unsigned i = 0; i < NumThreads; ++i) { + addKernels(GraphRef); + } + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, RecordAddNodes) { + Graph.begin_recording(Queue); + auto RecordGraph = [&]() { + SyncPoint.wait(); + runKernels(Queue); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // We stop recording the Queue when all threads have finished their processing + Graph.end_recording(Queue); + + // Reference computation + queue QueueRef{Queue.get_context(), Queue.get_device()}; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + GraphRef.begin_recording(QueueRef); + for (unsigned i = 0; i < NumThreads; ++i) { + runKernels(QueueRef); + } + GraphRef.end_recording(QueueRef); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, RecordAddNodesInOrderQueue) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + queue InOrderQueue{Dev, Properties}; + + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + InOrderGraph.begin_recording(InOrderQueue); + auto RecordGraph = [&]() { + SyncPoint.wait(); + runKernelsInOrder(InOrderQueue); + }; + + for (unsigned i = 
0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // We stop recording the Queue when all threads have finished their processing + InOrderGraph.end_recording(InOrderQueue); + + // Reference computation + queue InOrderQueueRef{Dev, Properties}; + experimental::command_graph + InOrderGraphRef{InOrderQueueRef.get_context(), + InOrderQueueRef.get_device()}; + + InOrderGraphRef.begin_recording(InOrderQueueRef); + for (unsigned i = 0; i < NumThreads; ++i) { + runKernelsInOrder(InOrderQueueRef); + } + InOrderGraphRef.end_recording(InOrderQueueRef); + + auto GraphImpl = sycl::detail::getSyclObjImpl(InOrderGraph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(InOrderGraphRef); + ASSERT_EQ(GraphImpl->getNumberOfNodes(), GraphRefImpl->getNumberOfNodes()); + + // In-order graph must have only a single root + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + + // Check structure graph + auto CurrentNode = (*GraphImpl->MRoots.begin()).lock(); + for (size_t i = 1; i <= GraphImpl->getNumberOfNodes(); i++) { + EXPECT_LE(CurrentNode->MSuccessors.size(), 1lu); + + // Checking the last node has no successors + if (i == GraphImpl->getNumberOfNodes()) { + EXPECT_EQ(CurrentNode->MSuccessors.size(), 0lu); + } else { + // Check other nodes have 1 successor + EXPECT_EQ(CurrentNode->MSuccessors.size(), 1lu); + CurrentNode = CurrentNode->MSuccessors[0].lock(); + } + } +} + +TEST_F(MultiThreadGraphTest, Finalize) { + addKernels(Graph); + + std::mutex MutexMap; + + std::map> + GraphsExecMap; + auto FinalizeGraph = [&](int ThreadNum) { + SyncPoint.wait(); + auto GraphExec = Graph.finalize(); + Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + + std::lock_guard Guard(MutexMap); + GraphsExecMap.insert( + std::map>:: + value_type(ThreadNum, GraphExec)); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(FinalizeGraph, i); + } + + for (unsigned i = 0; i < 
NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + addKernels(GraphRef); + + for (unsigned i = 0; i < NumThreads; ++i) { + auto GraphExecRef = GraphRef.finalize(); + QueueRef.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExecRef); }); + auto GraphExecImpl = + sycl::detail::getSyclObjImpl(GraphsExecMap.find(i)->second); + auto GraphExecRefImpl = sycl::detail::getSyclObjImpl(GraphExecRef); + ASSERT_EQ(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl), true); + } +} + +// Test adding fill and memset nodes to a graph +TEST_F(CommandGraphTest, FillMemsetNodes) { + const int Value = 7; + // Buffer fill + buffer Buffer{range<1>{1}}; + Buffer.set_write_back(false); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto NodeA = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + auto NodeB = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + + auto NodeAImpl = sycl::detail::getSyclObjImpl(NodeA); + auto NodeBImpl = sycl::detail::getSyclObjImpl(NodeB); + + // Check Operator== + EXPECT_EQ(NodeAImpl, NodeAImpl); + EXPECT_NE(NodeAImpl, NodeBImpl); + } + + // USM + { + int *USMPtr = malloc_device(1, Queue); + + // We need to create some differences between nodes because unlike buffer + // fills they are not differentiated on accessor ptr value. 
+ auto FillNodeA = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value, 1); }); + auto FillNodeB = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value + 1, 1); }); + auto MemsetNodeA = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 1); }); + auto MemsetNodeB = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 2); }); + + auto FillNodeAImpl = sycl::detail::getSyclObjImpl(FillNodeA); + auto FillNodeBImpl = sycl::detail::getSyclObjImpl(FillNodeB); + auto MemsetNodeAImpl = sycl::detail::getSyclObjImpl(MemsetNodeA); + auto MemsetNodeBImpl = sycl::detail::getSyclObjImpl(MemsetNodeB); + + // Check Operator== + EXPECT_EQ(FillNodeAImpl, FillNodeAImpl); + EXPECT_EQ(FillNodeBImpl, FillNodeBImpl); + EXPECT_NE(FillNodeAImpl, FillNodeBImpl); + + EXPECT_EQ(MemsetNodeAImpl, MemsetNodeAImpl); + EXPECT_EQ(MemsetNodeBImpl, MemsetNodeBImpl); + EXPECT_NE(MemsetNodeAImpl, MemsetNodeBImpl); + sycl::free(USMPtr, Queue); + } +} From 221e852d3f5f6a5bfffa62b3f08613f941b4d853 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Fri, 1 Mar 2024 13:38:27 +0000 Subject: [PATCH 4/7] Adjusted graph test to treat memset as memfill --- sycl/unittests/Extensions/CommandGraph/Queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/unittests/Extensions/CommandGraph/Queries.cpp b/sycl/unittests/Extensions/CommandGraph/Queries.cpp index 35057a19b86b7..0b05f57482d64 100644 --- a/sycl/unittests/Extensions/CommandGraph/Queries.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Queries.cpp @@ -110,7 +110,7 @@ TEST_F(CommandGraphTest, NodeTypeQueries) { auto NodeMemset = Graph.add( [&](sycl::handler &cgh) { cgh.memset(PtrB, 7, 16 * sizeof(int)); }); - ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memset); + ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memfill); auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { cgh.fill(PtrB, 7, 16); }); From c696e69d26e90edc66b06215aa5b0b13363e386f Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: 
Fri, 1 Mar 2024 13:51:26 +0000 Subject: [PATCH 5/7] Updated windows symbols --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/test/abi/sycl_symbols_windows.dump | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 387458e7ee8ba..6678d0f16829b 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -82,7 +82,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/konradkusiak97/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG e400d8f37e6ccef65ff62491b417fc43b8c659ce) + set(UNIFIED_RUNTIME_TAG URChangesForFillCombined) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index a3035337d2fcf..9d64d4e5fa3cd 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1110,8 +1110,9 @@ ?fill_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22AEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z ?fill_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22AEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z ?fill_impl@handler@_V1@sycl@@AEAAXPEAXPEBX_K2@Z -?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KAEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z 
?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KAEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z +?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KHV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z +?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KHV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@AEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@6@@Z ?finalize@handler@_V1@sycl@@AEAA?AVevent@23@XZ ?finalize@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$command_graph@$00@34567@AEBVproperty_list@67@@Z ?finalizeImpl@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXXZ From 0fc3f06e1e9f2f70402a1630f3da80748c1408ed Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Fri, 29 Mar 2024 18:43:11 +0000 Subject: [PATCH 6/7] Changed to correctly setting memset node and modified native_cpu symbols --- sycl/source/handler.cpp | 2 +- sycl/test/abi/pi_nativecpu_symbol_check.dump | 2 +- sycl/unittests/Extensions/CommandGraph/Queries.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 4e2351d7cbea1..53c5738e6b66b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -978,7 +978,7 @@ void handler::memset(void *Dest, int Value, size_t Count) { MDstPtr = Dest; MPattern.push_back(static_cast(Value)); MLength = Count; - setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memset); setType(detail::CG::FillUSM); } diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 
6198c8aeb5832..c5cb7c2932119 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -154,11 +154,11 @@ piextQueueCreateWithNativeHandle piextQueueGetNativeHandle piextSignalExternalSemaphore piextUSMDeviceAlloc +piextUSMEnqueueFill piextUSMEnqueueFill2D piextUSMEnqueueMemAdvise piextUSMEnqueueMemcpy piextUSMEnqueueMemcpy2D -piextUSMEnqueueMemset piextUSMEnqueueMemset2D piextUSMEnqueuePrefetch piextUSMFree diff --git a/sycl/unittests/Extensions/CommandGraph/Queries.cpp b/sycl/unittests/Extensions/CommandGraph/Queries.cpp index 0b05f57482d64..35057a19b86b7 100644 --- a/sycl/unittests/Extensions/CommandGraph/Queries.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Queries.cpp @@ -110,7 +110,7 @@ TEST_F(CommandGraphTest, NodeTypeQueries) { auto NodeMemset = Graph.add( [&](sycl::handler &cgh) { cgh.memset(PtrB, 7, 16 * sizeof(int)); }); - ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memfill); + ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memset); auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { cgh.fill(PtrB, 7, 16); }); From d98d82eb009aff84f3e072c7dbd78b3c4f1f7cd3 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 3 Apr 2024 10:26:06 +0100 Subject: [PATCH 7/7] Updated graph usm fill tests and Command Graph docs --- sycl/doc/design/CommandGraph.md | 2 ++ sycl/test-e2e/Graph/Explicit/usm_fill.cpp | 2 ++ sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp | 3 +++ sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp | 3 +++ sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp | 2 ++ sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp | 3 +++ sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp | 3 +++ sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp | 2 ++ 8 files changed, 20 insertions(+) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index c1e570d13ab21..acd159cb435ca 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ 
-486,6 +486,8 @@ The types of commands which are unsupported, and lead to this exception are: This corresponds to a memory buffer write command. * `handler::copy(src, dest)` or `handler::memcpy(dest, src)` - Where both `src` and `dest` are USM pointers. This corresponds to a USM copy command. +* `handler::fill(ptr, pattern, count)` - This corresponds to a USM memory + fill command. * `handler::memset(ptr, value, numBytes)` - This corresponds to a USM memory fill command. * `handler::prefetch()`. diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp index d2aff0dee2c20..26d3b88e64ff9 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp @@ -5,6 +5,8 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp index 7acd5143f6f54..e005170a5feff 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp @@ -7,6 +7,9 @@ // REQUIRES: aspect-usm_host_allocations +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl + #define GRAPH_E2E_EXPLICIT #include "../Inputs/usm_fill_host.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp index 8b8c623f48107..bb40a5683843c 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp @@ -7,6 +7,9 @@ // REQUIRES: aspect-usm_shared_allocations +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl + #define GRAPH_E2E_EXPLICIT #include "../Inputs/usm_fill_shared.cpp" 
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp index 16bf82ac572ae..d2925683837c2 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp @@ -5,6 +5,8 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp index 0e9b3dbdb39e8..c08b70c695e4f 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp @@ -7,6 +7,9 @@ // REQUIRES: aspect-usm_host_allocations +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl + #define GRAPH_E2E_RECORD_REPLAY #include "../Inputs/usm_fill_host.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp index 950afd3da8b97..f40f8d3aa4930 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp @@ -7,6 +7,9 @@ // REQUIRES: aspect-usm_shared_allocations +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl + #define GRAPH_E2E_RECORD_REPLAY #include "../Inputs/usm_fill_shared.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp index 9506e99cd73ad..84a5d42f0da04 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp @@ -2,6 +2,8 @@ // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{ 
%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// USM fill command not supported for OpenCL +// UNSUPPORTED: opencl // // Tests adding a USM memset queue shortcut operation as a graph node.