From b1954095fa19fffc0ea8ce1f34a4e6d8893b668b Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 12 Feb 2024 18:33:10 +0000 Subject: [PATCH 01/26] [SYCL][Graph] Prototype of explicit update with indices - Experimental implementation of explicit update with indices - New scheduler command for updating a command buffer command - PI equivalents for new UR APIs - E2E and Unit Tests --- sycl/doc/design/CommandGraph.md | 42 ++ sycl/include/sycl/detail/pi.def | 1 + sycl/include/sycl/detail/pi.h | 56 ++- sycl/include/sycl/detail/pi.hpp | 1 + sycl/include/sycl/detail/property_helper.hpp | 3 +- .../sycl/ext/oneapi/experimental/graph.hpp | 102 ++++- sycl/include/sycl/handler.hpp | 15 + sycl/plugins/cuda/pi_cuda.cpp | 12 +- sycl/plugins/hip/pi_hip.cpp | 12 +- sycl/plugins/level_zero/pi_level_zero.cpp | 12 +- sycl/plugins/native_cpu/pi_native_cpu.cpp | 12 +- sycl/plugins/opencl/pi_opencl.cpp | 12 +- sycl/plugins/unified_runtime/pi2ur.hpp | 79 +++- .../unified_runtime/pi_unified_runtime.cpp | 12 +- sycl/source/detail/event_impl.hpp | 18 +- sycl/source/detail/graph_impl.cpp | 381 +++++++++++++++++- sycl/source/detail/graph_impl.hpp | 275 ++++++++++++- sycl/source/detail/handler_impl.hpp | 12 + sycl/source/detail/scheduler/commands.cpp | 70 +++- sycl/source/detail/scheduler/commands.hpp | 27 ++ .../source/detail/scheduler/graph_builder.cpp | 91 +++++ sycl/source/detail/scheduler/scheduler.cpp | 46 +++ sycl/source/detail/scheduler/scheduler.hpp | 32 ++ sycl/source/handler.cpp | 4 + .../Graph/Explicit/update_before_finalize.cpp | 63 +++ .../Graph/Explicit/update_nd_range.cpp | 60 +++ sycl/test-e2e/Graph/Explicit/update_range.cpp | 60 +++ .../Explicit/update_with_indices_accessor.cpp | 72 ++++ ...date_with_indices_multiple_exec_graphs.cpp | 79 ++++ .../Explicit/update_with_indices_ordering.cpp | 80 ++++ .../Explicit/update_with_indices_ptr.cpp | 72 ++++ .../update_with_indices_ptr_double_update.cpp | 82 ++++ ...update_with_indices_ptr_multiple_nodes.cpp | 87 ++++ 
...pdate_with_indices_ptr_multiple_params.cpp | 86 ++++ .../update_with_indices_ptr_subgraph.cpp | 82 ++++ .../Explicit/update_with_indices_scalar.cpp | 67 +++ sycl/test/abi/pi_cuda_symbol_check.dump | 2 +- sycl/test/abi/pi_level_zero_symbol_check.dump | 1 + sycl/test/abi/pi_opencl_symbol_check.dump | 3 +- sycl/test/abi/sycl_symbols_linux.dump | 18 +- sycl/test/abi/sycl_symbols_windows.dump | 14 +- sycl/unittests/Extensions/CommandGraph.cpp | 140 +++++++ sycl/unittests/helpers/PiMockPlugin.hpp | 11 +- 43 files changed, 2345 insertions(+), 61 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_nd_range.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_range.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 9cc66626a2e78..9cbf7b129b3ec 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -46,6 +46,7 @@ with the following entry-points: | `urCommandBufferAppendMemBufferReadRectExp` | Append a rectangular memory read command to a command-buffer object. 
| | `urCommandBufferAppendMemBufferFillExp` | Append a memory fill command to a command-buffer object. | | `urCommandBufferEnqueueExp` | Submit command-buffer to a command-queue for execution. | +| `urCommandBufferUpdateKernelLaunchExp` | Updates the parameters of a previous kernel launch command. | See the [UR EXP-COMMAND-BUFFER](https://oneapi-src.github.io/unified-runtime/core/EXP-COMMAND-BUFFER.html) specification for more details. @@ -230,6 +231,47 @@ on buffer usage in a graph so that their lifetime semantics are compatible with a lazy work execution model. However these changes to storage lifetimes have not yet been implemented. +## Graph Update + +### Design Challenges + +Graph update faces significant design challenges in SYCL: + +* Lambda capture order is explicitly undefined in C++, so the user cannot reason + about the indices of arguments captured by kernel lambdas. +* Once arguments have been captured the actual type information is lost in the + transition through the integration header and extracting arguments in the SYCL + runtime, therefore we cannot automatically match new argument values by + querying the captured arguments without significant possibility for + collisions. For example, if a kernel captures two USM pointers and the user + wishes to update one, we cannot reason about which pointer they actually want + to update when we only know that: they are pointer args of a certain size. + +The current approach is to limit graph update to the explicit APIs and where the +user is using `handler::set_arg()` or some equivalent to manually set kernel +arguments using indices. Therefore when updating we can use indices to avoid +collisions. In practice there are only a few current scenarios where `set_arg()` +can be used: + +* The proposed ["Free Function Kernel" + extension](../extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc) +* OpenCL interop kernels created from SPIR-V source at runtime. 
+ +A possible future workaround for lambda capture issues could be "Whole-Graph Update" +where if we can guarantee that lambda capture order is the same across two +different recordings we can then match parameter order when updating. + +### Scheduler Integration + +Graph updates in the runtime are synchronous calls however they can optionally be +done through the scheduler using a new command, +`sycl::detail::UpdateCommandBufferCommand`. This is needed when dealing with +accessor updates. Since a new buffer which the user creates for updating may not +yet have been lazily initialized on device we schedule a new command which has +requirements for these new accessors to correctly trigger allocations before +updating. This is similar to how individual graph commands are enqueued when +accessors are used in a graph node. + ## Backend Implementation Implementation of UR command-buffers diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index c6b962b8b0f48..b53ec285f04b9 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -183,6 +183,7 @@ _PI_API(piextCommandBufferFillUSM) _PI_API(piextCommandBufferPrefetchUSM) _PI_API(piextCommandBufferAdviseUSM) _PI_API(piextEnqueueCommandBuffer) +_PI_API(piextCommandBufferUpdateKernelLaunch) _PI_API(piextUSMPitchedAlloc) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 56fdeb7a1051b..ffd140825d2fd 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -154,9 +154,10 @@ // 15.44 Add coarse-grain memory advice flag for HIP. // 15.45 Added piextKernelSuggestMaxCooperativeGroupCount and // piextEnqueueCooperativeKernelLaunch. 
+// 15.46 Add CommandBuffer update definitions #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 45 +#define _PI_H_VERSION_MINOR 46 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -443,6 +444,9 @@ typedef enum { // Composite device PI_EXT_ONEAPI_DEVICE_INFO_COMPONENT_DEVICES = 0x20111, PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE = 0x20112, + + // Command Buffers + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT = 0x20113, } _pi_device_info; typedef enum { @@ -2314,7 +2318,10 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, /// Command buffer extension struct _pi_ext_command_buffer; struct _pi_ext_sync_point; +struct _pi_ext_command_buffer_command; + using pi_ext_command_buffer = _pi_ext_command_buffer *; +using pi_ext_command_buffer_command = _pi_ext_command_buffer_command *; using pi_ext_sync_point = pi_uint32; typedef enum { @@ -2324,7 +2331,40 @@ typedef enum { struct pi_ext_command_buffer_desc final { pi_ext_structure_type stype; const void *pNext; - pi_queue_properties *properties; + pi_bool is_updateable; +}; + +// Command Buffer Update types +struct pi_ext_command_buffer_update_memobj_arg_desc_t final { + uint32_t arg_index; + const pi_mem_obj_property *properties; + pi_mem new_mem_obj; +}; + +struct pi_ext_command_buffer_update_pointer_arg_desc_t final { + uint32_t arg_index; + void *new_ptr; +}; + +struct pi_ext_command_buffer_update_value_arg_desc_t final { + uint32_t arg_index; + uint32_t arg_size; + void *new_value; +}; + +struct pi_ext_command_buffer_update_kernel_launch_desc final { + uint32_t num_mem_obj_args; + uint32_t num_ptr_args; + uint32_t num_value_args; + uint32_t num_work_dim; + + pi_ext_command_buffer_update_memobj_arg_desc_t *mem_obj_arg_list; + pi_ext_command_buffer_update_pointer_arg_desc_t *ptr_arg_list; + pi_ext_command_buffer_update_value_arg_desc_t *value_arg_list; + + size_t *global_work_offset; + size_t *global_work_size; + size_t *local_work_size; }; /// API 
to create a command-buffer. @@ -2368,12 +2408,14 @@ piextCommandBufferFinalize(pi_ext_command_buffer command_buffer); /// \param sync_point_wait_list A list of sync points that this command must /// wait on. /// \param sync_point The sync_point associated with this kernel execution. +/// \param command Return pointer to the command representing this kernel +/// execution. __SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, - pi_ext_sync_point *sync_point); + pi_ext_sync_point *sync_point, pi_ext_command_buffer_command *command); /// API to append a USM memcpy command to the command-buffer. /// \param command_buffer The command-buffer to append onto. @@ -2601,6 +2643,14 @@ piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +/// API to update a kernel launch command inside of a command-buffer. +/// @param command The command to be updated. +/// @param desc Descriptor which describes the updated parameters of the kernel +/// launch. +__SYCL_EXPORT pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command command, + pi_ext_command_buffer_update_kernel_launch_desc *desc); + /// API to destroy bindless unsampled image handles. 
/// /// \param context is the pi_context diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 84f9272d22bf5..6ac57370102ce 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -156,6 +156,7 @@ using PiKernelCacheConfig = ::pi_kernel_cache_config; using PiExtSyncPoint = ::pi_ext_sync_point; using PiExtCommandBuffer = ::pi_ext_command_buffer; using PiExtCommandBufferDesc = ::pi_ext_command_buffer_desc; +using PiExtCommandBufferCommand = ::pi_ext_command_buffer_command; using PiPeerAttr = ::pi_peer_attr; using PiImageHandle = ::pi_image_handle; using PiImageMemHandle = ::pi_image_mem_handle; diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 3009af8ee2890..1e1c91590d231 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -47,8 +47,9 @@ enum DataLessPropKind { GraphAssumeDataOutlivesBuffer = 22, GraphAssumeBufferOutlivesGraph = 23, GraphDependOnAllLeaves = 24, + GraphUpdateable = 25, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 24, + LastKnownDataLessPropKind = 25, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 209a0ed25f72f..d718021f0b90e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -8,10 +8,13 @@ #pragma once -#include // for context -#include // for __SYCL_EXPORT +#include +#include // for context +#include // for __SYCL_EXPORT +#include #include // for DataLessPropKind, PropWith... 
#include // for device +#include #include // for is_property, is_property_of #include // for property_list @@ -30,6 +33,15 @@ namespace ext { namespace oneapi { namespace experimental { +/// State to template the command_graph class on. +enum class graph_state { + modifiable, ///< In modifiable state, commands can be added to graph. + executable, ///< In executable state, the graph is ready to execute. +}; + +// Forward declare Graph class +template class command_graph; + namespace detail { // List of sycl features and extensions which are not supported by graphs. Used // for throwing errors when these features are used with graphs. @@ -73,15 +85,9 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { class node_impl; class graph_impl; class exec_graph_impl; - +class dynamic_parameter_impl; } // namespace detail -/// State to template the command_graph class on. -enum class graph_state { - modifiable, ///< In modifiable state, commands can be added to graph. - executable, ///< In executable state, the graph is ready to execute. -}; - enum class node_type { empty = 0, subgraph = 1, @@ -113,6 +119,13 @@ class __SYCL_EXPORT node { /// submission. static node get_node_from_event(event nodeEvent); + /// Update the ND-Range of this node if it is a kernel execution node + template + void update_nd_range(nd_range executionRange); + + /// Update the Range of this node if it is a kernel execution node + template void update_range(range executionRange); + private: node(const std::shared_ptr &Impl) : impl(Impl) {} @@ -146,6 +159,14 @@ class assume_buffer_outlives_graph public: assume_buffer_outlives_graph() = default; }; + +/// Property passed to command_graph::finalize() to +/// mark the resulting executable command_graph as able to be updated. 
+class updateable + : public ::sycl::detail::DataLessProperty<::sycl::detail::GraphUpdateable> { +public: + updateable() = default; +}; } // namespace graph namespace node { @@ -336,12 +357,24 @@ class __SYCL_EXPORT executable_command_graph { /// @param Graph Graph to use the inputs and outputs of. void update(const command_graph &Graph); + /// Updates a single node in this graph based on the contents of the provided + /// node. + /// @param Node The node to use for updating the graph. + void update(const node &Node); + + /// Updates a number of nodes in this graph based on the contents of the + /// provided nodes. + /// @param Nodes The nodes to use for updating the graph. + void update(const std::vector &Nodes); + protected: /// Constructor used by internal runtime. /// @param Graph Detail implementation class to construct with. /// @param Ctx Context to use for graph. + /// @param PropList Optional list of properties to pass. executable_command_graph(const std::shared_ptr &Graph, - const sycl::context &Ctx); + const sycl::context &Ctx, + const property_list &PropList = {}); template friend decltype(Obj::impl) @@ -385,13 +418,60 @@ class command_graph : public detail::modifiable_command_graph { template <> class command_graph : public detail::executable_command_graph { - protected: friend command_graph detail::modifiable_command_graph::finalize(const sycl::property_list &) const; using detail::executable_command_graph::executable_command_graph; }; +namespace detail { +class __SYCL_EXPORT dynamic_parameter_base { +public: + dynamic_parameter_base( + sycl::ext::oneapi::experimental::command_graph + Graph); + + void register_with_node(handler &CGH, int ArgIndex); + +protected: + void updateValue(void *NewValue, size_t Size); + + void updateAccessor(sycl::detail::AccessorBaseHost *Acc); + std::shared_ptr impl; +}; +} // namespace detail + +template +class dynamic_parameter : public detail::dynamic_parameter_base { + static constexpr bool IsAccessor = + 
std::is_base_of_v; + static constexpr sycl::detail::kernel_param_kind_t ParamType = + IsAccessor ? sycl::detail::kernel_param_kind_t::kind_accessor + : std::is_pointer_v + ? sycl::detail::kernel_param_kind_t::kind_pointer + : sycl::detail::kernel_param_kind_t::kind_std_layout; + +public: + dynamic_parameter(experimental::command_graph Graph) + : detail::dynamic_parameter_base(Graph), MValue() {} + + dynamic_parameter(ValueT InitialValue, + experimental::command_graph Graph) + : detail::dynamic_parameter_base(Graph), MValue(InitialValue) {} + + void update(const ValueT &NewValue) { + MValue = NewValue; + if constexpr (IsAccessor) { + detail::dynamic_parameter_base::updateAccessor(&MValue); + } else { + detail::dynamic_parameter_base::updateValue(&MValue, sizeof(ValueT)); + } + } + +private: + ValueT MValue; +}; + /// Additional CTAD deduction guide. template command_graph(const context &SyclContext, const device &SyclDevice, diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 7a6371624b3ef..784c40e849504 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1307,6 +1307,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(Wrapper)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif } else #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && @@ -1324,6 +1325,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif } } @@ -1378,6 +1380,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(true); #endif } @@ -1395,6 +1398,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); setType(detail::CG::Kernel); + setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -1433,6 +1437,7 @@ class __SYCL_EXPORT handler { MNDRDesc.setNumWorkGroups(NumWorkGroups); 
StoreLambda(std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif // __SYCL_DEVICE_ONLY__ } @@ -1986,6 +1991,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif } @@ -2078,6 +2084,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); setType(detail::CG::Kernel); + setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -2096,6 +2103,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); setType(detail::CG::Kernel); + setNDRangeUsed(true); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -2158,6 +2166,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(false); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2197,6 +2206,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(false); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2235,6 +2245,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(true); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -3421,6 +3432,7 @@ class __SYCL_EXPORT handler { void ext_intel_write_host_pipe(const std::string &Name, void *Ptr, size_t Size, bool Block = false); friend class ext::oneapi::experimental::detail::graph_impl; + friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; 
bool DisableRangeRounding(); @@ -3644,6 +3656,9 @@ class __SYCL_EXPORT handler { "for use with the SYCL Graph extension."); } } + + // Set that an ND Range was used during a call to parallel_for + void setNDRangeUsed(bool Value); }; } // namespace _V1 } // namespace sycl diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 02fe3af901cb8..3649e76ce1915 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1079,10 +1079,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1208,6 +1210,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index b895727c9d0fa..de3bef26264bb 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1082,10 +1082,12 @@ 
pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1211,6 +1213,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c923c802f1d3f..b7f354f560daa 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1240,10 +1240,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command 
*Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1369,6 +1371,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { // missing diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 48ce104a94e90..1dcae848af9f5 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -1087,10 +1087,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1177,6 +1179,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, 
EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 45fb66575ec42..cd303ef4e220c 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1018,10 +1018,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1147,6 +1149,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git 
a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 87ee60f41e2da..ac77041ca10f9 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1278,6 +1278,9 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, UR_DEVICE_INFO_COMPONENT_DEVICES) PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE, UR_DEVICE_INFO_COMPOSITE_DEVICE) + PI_TO_UR_MAP_DEVICE_INFO( + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) #undef PI_TO_UR_MAP_DEVICE_INFO default: return PI_ERROR_UNKNOWN; @@ -4462,13 +4465,14 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_device_handle_t UrDevice = reinterpret_cast(Device); - const ur_exp_command_buffer_desc_t *UrDesc = - reinterpret_cast(Desc); + ur_exp_command_buffer_desc_t UrDesc; + UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + UrDesc.isUpdatable = Desc->is_updateable; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer); HANDLE_ERRORS( - urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer)); + urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer)); return PI_SUCCESS; } @@ -4506,16 +4510,18 @@ inline pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { ur_exp_command_buffer_handle_t UrCommandBuffer = reinterpret_cast(CommandBuffer); ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); - + ur_exp_command_buffer_command_handle_t 
*UrCommandHandle = + reinterpret_cast(Command); HANDLE_ERRORS(urCommandBufferAppendKernelLaunchExp( UrCommandBuffer, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, - nullptr)); + UrCommandHandle)); return PI_SUCCESS; } @@ -4792,6 +4798,67 @@ inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, return PI_SUCCESS; } +inline pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command command, + pi_ext_command_buffer_update_kernel_launch_desc *desc) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast(command); + ur_exp_command_buffer_update_kernel_launch_desc_t UrDesc; + + UrDesc.stype = ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_EXEC_INFO_DESC; + UrDesc.numNewMemObjArgs = desc->num_mem_obj_args; + UrDesc.numNewPointerArgs = desc->num_ptr_args; + UrDesc.numNewValueArgs = desc->num_value_args; + UrDesc.newWorkDim = desc->num_work_dim; + + // Exec info updates are unused and will be removed from UR in future + UrDesc.numNewExecInfos = 0; + UrDesc.pNewExecInfoList = nullptr; + + // Convert arg descs + std::vector UrMemObjDescs; + std::vector UrPointerDescs; + std::vector UrValueDescs; + + for (size_t i = 0; i < UrDesc.numNewMemObjArgs; i++) { + auto &PiDesc = desc->mem_obj_arg_list[i]; + UrMemObjDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC, + nullptr, PiDesc.arg_index, nullptr, + reinterpret_cast(PiDesc.new_mem_obj)}); + } + UrDesc.pNewMemObjArgList = UrMemObjDescs.data(); + + for (size_t i = 0; i < UrDesc.numNewPointerArgs; i++) { + auto &PiDesc = desc->ptr_arg_list[i]; + UrPointerDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, + nullptr, PiDesc.arg_index, nullptr, PiDesc.new_ptr}); + } + UrDesc.pNewPointerArgList = UrPointerDescs.data(); + + for (size_t i = 0; i < UrDesc.numNewValueArgs; 
i++) { + auto &PiDesc = desc->value_arg_list[i]; + UrValueDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, + nullptr, PiDesc.arg_index, PiDesc.arg_size, nullptr, + PiDesc.new_value}); + } + UrDesc.pNewValueArgList = UrValueDescs.data(); + + UrDesc.pNewGlobalWorkSize = desc->global_work_size; + UrDesc.pNewGlobalWorkOffset = desc->global_work_offset; + UrDesc.pNewLocalWorkSize = desc->local_work_size; + + HANDLE_ERRORS(urCommandBufferUpdateKernelLaunchExp(UrCommand, &UrDesc)); + + return PI_SUCCESS; +} + // Command-buffer extension /////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 51d7041d03e48..38736139feba4 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1021,10 +1021,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1150,6 +1152,12 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + 
pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index feb65cfc88946..8e26628e52311 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -264,7 +264,7 @@ class event_impl { } // Sets a sync point which is used when this event represents an enqueue to a - // Command Bufferr. + // Command Buffer. void setSyncPoint(sycl::detail::pi::PiExtSyncPoint SyncPoint) { MSyncPoint = SyncPoint; } @@ -290,6 +290,17 @@ class event_impl { return MEventFromSubmittedExecCommandBuffer; } + // Sets a command-buffer command when this event represents an enqueue to a + // Command Buffer. + void + setCommandBufferCommand(sycl::detail::pi::PiExtCommandBufferCommand Command) { + MCommandBufferCommand = Command; + } + + sycl::detail::pi::PiExtCommandBufferCommand getCommandBufferCommand() const { + return MCommandBufferCommand; + } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -348,6 +359,11 @@ class event_impl { // stored here. sycl::detail::pi::PiExtSyncPoint MSyncPoint; + // If this event represents a submission to a + // sycl::detail::pi::PiExtCommandBuffer the command-buffer command + // (if any) associated with that submission is stored here. 
+ sycl::detail::pi::PiExtCommandBufferCommand MCommandBufferCommand; + friend std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index bdfc90537b520..913499bceb763 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -392,8 +392,24 @@ graph_impl::add(const std::shared_ptr &Impl, Handler.MCGType); auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep); + NodeImpl->MNDRangeUsed = Handler.MImpl->MNDRangeUsed; // Add an event associated with this explicit node for mixed usage addEventForNode(Impl, std::make_shared(), NodeImpl); + + // Retrieve any dynamic parameters which have been registered in the CGF and + // register the actual nodes with them. + auto &DynamicParams = Handler.MImpl->MDynamicParameters; + + if (NodeType != node_type::kernel && DynamicParams.size() > 0) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "dynamic_parameters cannot be registered with graph " + "nodes which do not represent kernel executions"); + } + + for (auto &DynamicParam : DynamicParams) { + DynamicParam->registerNode(NodeImpl); + } + return NodeImpl; } @@ -643,10 +659,13 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNodeDirect( findRealDeps(Deps, N.lock(), MPartitionNodes[Node]); } sycl::detail::pi::PiExtSyncPoint NewSyncPoint; + sycl::detail::pi::PiExtCommandBufferCommand NewCommand; pi_int32 Res = sycl::detail::enqueueImpCommandBufferKernel( Ctx, DeviceImpl, CommandBuffer, *static_cast((Node->MCommandGroup.get())), - Deps, &NewSyncPoint, nullptr); + Deps, &NewSyncPoint, &NewCommand, nullptr); + + MCommandMap[Node] = NewCommand; if (Res != pi_result::PI_SUCCESS) { throw sycl::exception(errc::invalid, @@ -675,12 +694,15 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode( sycl::detail::Scheduler::getInstance().addCG( Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps); + 
MCommandMap[Node] = Event->getCommandBufferCommand(); return Event->getSyncPoint(); } void exec_graph_impl::createCommandBuffers( sycl::device Device, std::shared_ptr &Partition) { sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; - sycl::detail::pi::PiExtCommandBufferDesc Desc{}; + sycl::detail::pi::PiExtCommandBufferDesc Desc{ + pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, + MIsUpdateable}; auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); @@ -736,6 +758,35 @@ void exec_graph_impl::createCommandBuffers( } } +exec_graph_impl::exec_graph_impl(sycl::context Context, + const std::shared_ptr &GraphImpl, + const property_list &PropList) + : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), + MRequirements(), MExecutionEvents(), + MIsUpdateable(PropList.has_property()) { + + // If the graph has been marked as updateable then check if the backend + // actually supports that. + if (MIsUpdateable) { + pi_bool SupportsUpdate = PI_FALSE; + bool CallSuccessful = + sycl::detail::getSyclObjImpl(MContext) + ->getPlugin() + ->call_nocheck( + sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()) + ->getHandleRef(), + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + sizeof(pi_bool), &SupportsUpdate, nullptr) == PI_SUCCESS; + + if (!CallSuccessful || !SupportsUpdate) { + throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), + "Device does not support Command Graph update"); + } + } + // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. 
+ duplicateNodes(); +} + exec_graph_impl::~exec_graph_impl() { const sycl::detail::PluginPtr &Plugin = sycl::detail::getSyclObjImpl(MContext)->getPlugin(); @@ -960,6 +1011,10 @@ void exec_graph_impl::duplicateNodes() { std::shared_ptr NodeCopy = std::make_shared(*OriginalNode); + // Associate the ID of the original node with the node copy for later quick + // access + MIDCache.insert(std::make_pair(OriginalNode->MID, NodeCopy)); + // Clear edges between nodes so that we can replace with new ones NodeCopy->MSuccessors.clear(); NodeCopy->MPredecessors.clear(); @@ -1003,6 +1058,10 @@ void exec_graph_impl::duplicateNodes() { for (size_t i = 0; i < SubgraphNodes.size(); i++) { auto SubgraphNode = SubgraphNodes[i]; auto NodeCopy = std::make_shared(*SubgraphNode); + // Associate the ID of the original subgraph node with all extracted node + // copies for future quick access. + MIDCache.insert(std::make_pair(SubgraphNode->MID, NodeCopy)); + NewSubgraphNodes.push_back(NodeCopy); SubgraphNodesMap.insert({SubgraphNode, NodeCopy}); NodeCopy->MSuccessors.clear(); @@ -1091,6 +1150,246 @@ void exec_graph_impl::duplicateNodes() { MNodeStorage.insert(MNodeStorage.begin(), NewNodes.begin(), NewNodes.end()); } +void exec_graph_impl::update(std::shared_ptr Node) { + this->update(std::vector>{Node}); +} +void exec_graph_impl::update( + const std::vector> Nodes) { + + if (!MIsUpdateable) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "update() cannot be called on a executable graph " + "which was not created with property::updatable"); + } + + // If there are any accessor requirements, we have to update through the + // scheduler to ensure that any allocations have taken place before trying to + // update. 
+ bool NeedScheduledUpdate = false; + for (auto &Node : Nodes) { + // Check if node(s) derived from this modifiable node exists in this graph + if (MIDCache.count(Node->getID()) == 0) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Node passed to update() is not part of the graph."); + } + if (Node->MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception(errc::invalid, "Cannot update non-kernel nodes"); + } + + if (Node->MCommandGroup->getRequirements().size() == 0) { + continue; + } + NeedScheduledUpdate = true; + // Update cached requirements for this graph with updated node ones + auto UpdatedReqs = Node->MUpdatedAccessorsCache; + for (auto &CachedReq : MRequirements) { + for (auto &UpdatedReq : UpdatedReqs) { + if (CachedReq == UpdatedReq.first) { + CachedReq = UpdatedReq.second; + } + } + } + } + + // Clean up any execution events which have finished so we don't pass them to + // the scheduler. + for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) { + if ((*It)->isCompleted()) { + It = MExecutionEvents.erase(It); + continue; + } + ++It; + } + + // If we have previous execution events do the update through the scheduler to + // ensure it is ordered correctly. 
+ NeedScheduledUpdate |= MExecutionEvents.size() > 0; + + if (NeedScheduledUpdate) { + auto AllocaQueue = std::make_shared( + sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()), + sycl::detail::getSyclObjImpl(MGraphImpl->getContext()), + sycl::async_handler{}, sycl::property_list{}); + // Don't need to care about the return event here because it is synchronous + sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( + this, Nodes, AllocaQueue, MRequirements, MExecutionEvents); + } else { + for (auto &Node : Nodes) { + updateImpl(Node); + } + } +} + +void exec_graph_impl::updateImpl(std::shared_ptr Node) { + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()); + + // Gather arg information from Node + auto &ExecCG = + *(static_cast(Node->MCommandGroup.get())); + // Copy args because we may modify them + std::vector NodeArgs = ExecCG.getArguments(); + // Copy NDR desc since we need to modify it + auto NDRDesc = ExecCG.MNDRDesc; + + pi_kernel PiKernel = nullptr; + auto Kernel = ExecCG.MSyclKernel; + auto KernelBundleImplPtr = ExecCG.MKernelBundle; + std::shared_ptr SyclKernelImpl = nullptr; + const sycl::detail::KernelArgMask *EliminatedArgMask = nullptr; + + // Use kernel_bundle if available unless it is interop. + // Interop bundles can't be used in the first branch, because the kernels + // in interop kernel bundles (if any) do not have kernel_id + // and can therefore not be looked up, but since they are self-contained + // they can simply be launched directly. 
+ if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + auto KernelName = ExecCG.MKernelName; + kernel_id KernelID = + sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + SyclKernelImpl = sycl::detail::getSyclObjImpl(SyclKernel); + PiKernel = SyclKernelImpl->getHandleRef(); + EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); + } else if (Kernel != nullptr) { + PiKernel = Kernel->getHandleRef(); + auto SyclProg = Kernel->getProgramImpl(); + EliminatedArgMask = Kernel->getKernelArgMask(); + } else { + std::tie(PiKernel, std::ignore, EliminatedArgMask, std::ignore) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, ExecCG.MKernelName); + } + + // Remove eliminated args + std::vector MaskedArgs; + MaskedArgs.reserve(NodeArgs.size()); + + sycl::detail::applyFuncOnFilteredArgs( + EliminatedArgMask, NodeArgs, + [&MaskedArgs](sycl::detail::ArgDesc &Arg, int NextTrueIndex) { + MaskedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex); + }); + + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + + // Reverse kernel dims + sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc); + + size_t RequiredWGSize[3] = {0, 0, 0}; + size_t *LocalSize = nullptr; + + if (HasLocalSize) + LocalSize = &NDRDesc.LocalSize[0]; + else { + Plugin->call( + PiKernel, DeviceImpl->getHandleRef(), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize), + RequiredWGSize, + /* param_value_size_ret = */ nullptr); + + const bool EnforcedLocalSize = + (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 || + RequiredWGSize[2] != 0); + if (EnforcedLocalSize) + LocalSize = RequiredWGSize; + } + // Create update descriptor + + // Storage for individual arg descriptors + std::vector MemobjDescs; + std::vector PtrDescs; + std::vector 
ValueDescs; + MemobjDescs.reserve(MaskedArgs.size()); + PtrDescs.reserve(MaskedArgs.size()); + ValueDescs.reserve(MaskedArgs.size()); + + pi_ext_command_buffer_update_kernel_launch_desc UpdateDesc; + + // Collect arg descriptors and fill kernel launch descriptor + using sycl::detail::kernel_param_kind_t; + for (size_t i = 0; i < MaskedArgs.size(); i++) { + auto &NodeArg = MaskedArgs[i]; + switch (NodeArg.MType) { + case kernel_param_kind_t::kind_pointer: { + PtrDescs.push_back({static_cast(NodeArg.MIndex), NodeArg.MPtr}); + } break; + case kernel_param_kind_t::kind_std_layout: { + ValueDescs.push_back({static_cast(NodeArg.MIndex), + static_cast(NodeArg.MSize), + NodeArg.MPtr}); + } break; + case kernel_param_kind_t::kind_accessor: { + sycl::detail::Requirement *Req = + static_cast(NodeArg.MPtr); + + pi_mem_obj_property MemObjData{}; + + switch (Req->MAccessMode) { + case access::mode::read: { + MemObjData.mem_access = PI_ACCESS_READ_ONLY; + break; + } + case access::mode::write: + case access::mode::discard_write: { + MemObjData.mem_access = PI_ACCESS_WRITE_ONLY; + break; + } + default: { + MemObjData.mem_access = PI_ACCESS_READ_WRITE; + break; + } + } + MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; + MemobjDescs.push_back(pi_ext_command_buffer_update_memobj_arg_desc_t{ + static_cast(NodeArg.MIndex), &MemObjData, + static_cast(Req->MData)}); + + } break; + + default: + break; + } + } + + UpdateDesc.num_mem_obj_args = MemobjDescs.size(); + UpdateDesc.mem_obj_arg_list = MemobjDescs.data(); + UpdateDesc.num_ptr_args = PtrDescs.size(); + UpdateDesc.ptr_arg_list = PtrDescs.data(); + UpdateDesc.num_value_args = ValueDescs.size(); + UpdateDesc.value_arg_list = ValueDescs.data(); + + UpdateDesc.global_work_offset = &NDRDesc.GlobalOffset[0]; + UpdateDesc.global_work_size = &NDRDesc.GlobalSize[0]; + UpdateDesc.local_work_size = LocalSize; + UpdateDesc.num_work_dim = NDRDesc.Dims; + + // Query the ID cache to find the equivalent exec node for the node passed to + // this 
function. + // TODO: Handle subgraphs or any other cases where multiple nodes may be + // associated with a single key, once those node types are supported for + // update. + auto ExecNode = MIDCache.find(Node->MID); + assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache"); + + // Update ExecNode with new values from Node, in case we ever need to + // rebuild the command buffers + (*ExecNode).second->updateFromOtherNode(Node); + + auto Command = MCommandMap[(*ExecNode).second]; + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferUpdateKernelLaunch>( + Command, &UpdateDesc); + + if (Res != PI_SUCCESS) { + throw sycl::exception(errc::invalid, "Error updating command_graph"); + } +} + modifiable_command_graph::modifiable_command_graph( const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) @@ -1152,12 +1451,12 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) { } command_graph -modifiable_command_graph::finalize(const sycl::property_list &) const { +modifiable_command_graph::finalize(const sycl::property_list &PropList) const { // Graph is read and written in this scope so we lock // this graph with full priviledges. 
graph_impl::WriteLock Lock(impl->MMutex); - return command_graph{this->impl, - this->impl->getContext()}; + return command_graph{ + this->impl, this->impl->getContext(), PropList}; } bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { @@ -1271,8 +1570,9 @@ std::vector modifiable_command_graph::get_root_nodes() const { } executable_command_graph::executable_command_graph( - const std::shared_ptr &Graph, const sycl::context &Ctx) - : impl(std::make_shared(Ctx, Graph)) { + const std::shared_ptr &Graph, const sycl::context &Ctx, + const property_list &PropList) + : impl(std::make_shared(Ctx, Graph, PropList)) { finalizeImpl(); // Create backend representation for executable graph } @@ -1305,8 +1605,56 @@ void executable_command_graph::update( throw sycl::exception(sycl::make_error_code(errc::invalid), "Method not yet implemented"); } + + +void executable_command_graph::update(const node &Node) { + impl->update(sycl::detail::getSyclObjImpl(Node)); +} + +void executable_command_graph::update(const std::vector &Nodes) { + std::vector> NodeImpls{}; + NodeImpls.reserve(Nodes.size()); + for (auto &Node : Nodes) { + NodeImpls.push_back(sycl::detail::getSyclObjImpl(Node)); + } + + impl->update(NodeImpls); +} + +void dynamic_parameter_impl::registerWithNode(int ArgIndex, + sycl::handler &CGH) { + if (CGH.MGraph != MGraph) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Dynamic parameters cannot be registered with nodes " + "associated with graphs other than the one used to " + "construct the dynamic parameter object."); + } + + CGH.MImpl->MDynamicParameters.push_back(this); + MIndex = ArgIndex; +} + +dynamic_parameter_base::dynamic_parameter_base( + command_graph Graph) + : impl(std::make_shared( + sycl::detail::getSyclObjImpl(Graph))) {} + +void dynamic_parameter_base::register_with_node(handler &CGH, int ArgIndex) { + impl->registerWithNode(ArgIndex, CGH); +} + +void dynamic_parameter_base::updateValue(void *NewValue, size_t Size) { + 
impl->updateValue(NewValue, Size); +} + +void dynamic_parameter_base::updateAccessor( + sycl::detail::AccessorBaseHost *Acc) { + impl->updateAccessor(Acc); +} + } // namespace detail + node_type node::get_type() const { return impl->MNodeType; } std::vector node::get_predecessors() const { @@ -1325,6 +1673,25 @@ node node::get_node_from_event(event nodeEvent) { GraphImpl->getNodeForEvent(EventImpl)); } + +template <> void node::update_nd_range<1>(nd_range<1> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_nd_range<2>(nd_range<2> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_nd_range<3>(nd_range<3> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_range<1>(range<1> Range) { + impl->updateRange(Range); +} +template <> void node::update_range<2>(range<2> Range) { + impl->updateRange(Range); +} +template <> void node::update_range<3>(range<3> Range) { + impl->updateRange(Range); +} } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index eafb66b1dca9b..f2eededf0c1e1 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -75,6 +76,10 @@ inline node_type getNodeTypeFromCG(sycl::detail::CG::CGTYPE CGType) { /// Implementation of node class from SYCL_EXT_ONEAPI_GRAPH. class node_impl { public: + using id_type = uint64_t; + + /// Unique identifier for this node. + id_type MID = getNextNodeID(); /// List of successors to this node. std::vector> MSuccessors; /// List of predecessors to this node. @@ -99,6 +104,14 @@ class node_impl { /// cannot be used to find out the partion of a node outside of this process. 
int MPartitionNum = -1; + /// Cache of accessors which have been updated on this node + std::vector> + MUpdatedAccessorsCache; + + /// Track whether an ND-Range was used for kernel nodes + bool MNDRangeUsed = false; + /// Add successor to the node. /// @param Node Node to add as a successor. /// @param Prev Predecessor to \p node being added as successor. @@ -180,8 +193,12 @@ class node_impl { /// @return A unique ptr to the new command group object. std::unique_ptr getCGCopy() const { switch (MCGType) { - case sycl::detail::CG::Kernel: - return createCGCopy(); + case sycl::detail::CG::Kernel: { + auto CGCopy = createCGCopy(); + rebuildArgStorage(CGCopy->MArgs, MCommandGroup->getArgsStorage(), + CGCopy->getArgsStorage()); + return std::move(CGCopy); + } case sycl::detail::CG::CopyAccToPtr: case sycl::detail::CG::CopyPtrToAcc: case sycl::detail::CG::CopyAccToAcc: @@ -218,6 +235,11 @@ class node_impl { CommandGroupPtr->getSharedPtrStorage(), CommandGroupPtr->getRequirements(), CommandGroupPtr->getEvents()); + std::vector NewArgs = CommandGroupPtr->MArgs; + + rebuildArgStorage(NewArgs, CommandGroupPtr->getArgsStorage(), + Data.MArgsStorage); + sycl::detail::code_location Loc(CommandGroupPtr->MFileName.data(), CommandGroupPtr->MFunctionName.data(), CommandGroupPtr->MLine, @@ -226,7 +248,7 @@ class node_impl { return std::make_unique( sycl::detail::CGHostTask( std::move(HostTaskUPtr), CommandGroupPtr->MQueue, - CommandGroupPtr->MContext, CommandGroupPtr->MArgs, Data, + CommandGroupPtr->MContext, std::move(NewArgs), std::move(Data), CommandGroupPtr->getType(), Loc)); } case sycl::detail::CG::Barrier: @@ -331,11 +353,194 @@ class node_impl { } } + /// Update the value of an accessor inside this node. Accessors must be + /// handled specifically compared to other argument values. 
+ /// @param ArgIndex The index of the accessor arg to be updated + /// @param Acc Pointer to the new accessor value + void updateAccessor(int ArgIndex, sycl::detail::AccessorBaseHost *Acc) { + auto &Args = + static_cast(MCommandGroup.get())->MArgs; + auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); + + // Find old accessor in accessor storage and replace with new one + if (static_cast(NewAccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } + + // All accessors passed to this function will be placeholders, so we must + // perform steps similar to what happens when handler::require() is + // called here. + sycl::detail::Requirement *NewReq = NewAccImpl.get(); + if (NewReq->MAccessMode != sycl::access_mode::read) { + auto SYCLMemObj = + static_cast(NewReq->MSYCLMemObj); + SYCLMemObj->handleWriteAccessorCreation(); + } + + for (auto &Acc : MCommandGroup->getAccStorage()) { + if (auto OldAcc = + static_cast(Arg.MPtr); + Acc.get() == OldAcc) { + Acc = NewAccImpl; + } + } + + for (auto &Req : MCommandGroup->getRequirements()) { + if (auto OldReq = + static_cast(Arg.MPtr); + Req == OldReq) { + Req = NewReq; + } + } + // Cache the old and new values so the graph can access it when updating + MUpdatedAccessorsCache.push_back(std::make_pair( + static_cast(Arg.MPtr), + NewAccImpl.get())); + Arg.MPtr = NewAccImpl.get(); + break; + } + } + + void updateArgValue(int ArgIndex, void *NewValue, size_t Size) { + + auto &Args = + static_cast(MCommandGroup.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MSize == static_cast(Size)); + // MPtr may be a pointer into arg storage so we memcpy the contents of + // NewValue 
rather than assign it directly + std::memcpy(Arg.MPtr, NewValue, Size); + break; + } + } + + template + void updateNDRange(nd_range ExecutionRange) { + if (MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception( + sycl::errc::invalid, + "Cannot update execution range of nodes which are not kernel nodes"); + } + if (!MNDRangeUsed) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update node which was created with a " + "sycl::range with a sycl::nd_range"); + } + + auto &NDRDesc = + static_cast(MCommandGroup.get()) + ->MNDRDesc; + + if (NDRDesc.Dims != Dimensions) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update execution range of a node with an " + "execution range of different dimensions than what " + "the node was originall created with."); + } + + NDRDesc.set(ExecutionRange); + } + + template void updateRange(range ExecutionRange) { + if (MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception( + sycl::errc::invalid, + "Cannot update execution range of nodes which are not kernel nodes"); + } + if (MNDRangeUsed) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update node which was created with a " + "sycl::nd_range with a sycl::range"); + } + + auto &NDRDesc = + static_cast(MCommandGroup.get()) + ->MNDRDesc; + + if (NDRDesc.Dims != Dimensions) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update execution range of a node with an " + "execution range of different dimensions than what " + "the node was originall created with."); + } + + NDRDesc.set(ExecutionRange); + } + + void updateFromOtherNode(const std::shared_ptr &Other) { + auto ExecCG = + static_cast(MCommandGroup.get()); + auto OtherExecCG = + static_cast(Other->MCommandGroup.get()); + + ExecCG->MArgs = OtherExecCG->MArgs; + ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; + ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); + ExecCG->getRequirements() = OtherExecCG->getRequirements(); + + auto &OldArgStorage = OtherExecCG->getArgsStorage(); + auto 
&NewArgStorage = ExecCG->getArgsStorage(); + // Rebuild the arg storage and update the args + rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); + + MUpdatedAccessorsCache = Other->MUpdatedAccessorsCache; + } + + id_type getID() const { return MID; } + private: + void rebuildArgStorage(std::vector &Args, + const std::vector> &OldArgStorage, + std::vector> &NewArgStorage) const { + // Clear the arg storage so we can rebuild it + NewArgStorage.clear(); + + // Loop over all the args, any std_layout ones need their pointers updated + // to point to the new arg storage. + for (auto &Arg : Args) { + if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_std_layout) { + continue; + } + // Find which ArgStorage Arg.MPtr is pointing to + for (auto &ArgStorage : OldArgStorage) { + if (ArgStorage.data() != Arg.MPtr) { + continue; + } + NewArgStorage.emplace_back(Arg.MSize); + // Memcpy contents from old storage to new storage + std::memcpy(NewArgStorage.back().data(), ArgStorage.data(), Arg.MSize); + // Update MPtr to point to the new storage instead of the old + Arg.MPtr = NewArgStorage.back().data(); + + break; + } + } + } + // Gets the next unique identifier for a node, should only be used when + // constructing nodes. + static id_type getNextNodeID() { + static id_type nextID = 0; + + // Return the value then increment the next ID + return nextID++; + } + /// Prints Node information to Stream. /// @param Stream Where to print the Node information - /// @param Verbose If true, print additional information about the nodes such - /// as kernel args or memory access where applicable. + /// @param Verbose If true, print additional information about the nodes + /// such as kernel args or memory access where applicable. void printDotCG(std::ostream &Stream, bool Verbose) { Stream << "\"" << this << "\" [style=bold, label=\""; @@ -1021,13 +1226,10 @@ class exec_graph_impl { /// nodes). /// @param Context Context to create graph with. 
/// @param GraphImpl Modifiable graph implementation to create with. + /// @param PropList List of properties for constructing this object exec_graph_impl(sycl::context Context, - const std::shared_ptr &GraphImpl) - : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), - MRequirements(), MExecutionEvents() { - // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. - duplicateNodes(); - } + const std::shared_ptr &GraphImpl, + const property_list &PropList); /// Destructor. /// @@ -1095,6 +1297,11 @@ class exec_graph_impl { return MRequirements; } + void update(std::shared_ptr Node); + void update(const std::vector> Nodes); + + void updateImpl(std::shared_ptr NodeImpl); + private: /// Create a command-group for the node and add it to command-buffer by going /// through the scheduler. @@ -1195,6 +1402,52 @@ class exec_graph_impl { MPartitionsExecutionEvents; /// Storage for copies of nodes from the original modifiable graph. std::vector> MNodeStorage; + /// Map of nodes to their associated PI command handles. + std::unordered_map, + sycl::detail::pi::PiExtCommandBufferCommand> + MCommandMap; + + bool MIsUpdateable; + + // Stores a cache of node ids from modifiable graph nodes to the companion + // node(s) in this graph. Used for quick access when updating this graph. 
+ std::multimap> MIDCache; +}; + +class dynamic_parameter_impl { +public: + dynamic_parameter_impl(std::shared_ptr GraphImpl) + : MGraph(GraphImpl) {} + void registerWithNode(int ArgIndex, sycl::handler &CGH); + + void registerNode(std::shared_ptr NodeImpl) { + MNodes.push_back(NodeImpl); + } + + void updateValue(void *NewValue, size_t Size) { + for (auto &NodeWeak : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + NodeShared->updateArgValue(MIndex, NewValue, Size); + } + } + } + + void updateAccessor(sycl::detail::AccessorBaseHost *Acc) { + for (auto &NodeWeak : MNodes) { + auto NodeShared = NodeWeak.lock(); + // Should we fail here if the node isn't alive anymore? + if (NodeShared) { + NodeShared->updateAccessor(MIndex, Acc); + } + } + } + + // Weak ptrs to node_impls which will be updated + std::vector> MNodes; + + int MIndex; + std::shared_ptr MGraph; }; } // namespace detail diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index c96d60bd85ecd..ff4a601447a7f 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -15,6 +15,9 @@ namespace sycl { inline namespace _V1 { +namespace ext::oneapi::experimental::detail { +class dynamic_parameter_impl; +} namespace detail { using KernelBundleImplPtr = std::shared_ptr; @@ -128,6 +131,15 @@ class handler_impl { // created for later query by users. 
sycl::ext::oneapi::experimental::node_type MUserFacingNodeType = sycl::ext::oneapi::experimental::node_type::empty; + + // Storage for any SYCL Graph dynamic parameters which have been flagged for + // registration in the CG + std::vector + MDynamicParameters; + + // Track whether an NDRange was used when submitting a kernel (as opposed to a + // range), needed for graph update + bool MNDRangeUsed = false; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b113da757bd0c..bc8d4c4b4cbaf 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2431,6 +2431,7 @@ pi_int32 enqueueImpCommandBufferKernel( const CGExecKernel &CommandGroup, std::vector &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function &getMemAllocationFunc) { auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); @@ -2512,7 +2513,8 @@ pi_int32 enqueueImpCommandBufferKernel( sycl::detail::PiApiKind::piextCommandBufferNDRangeKernel>( CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(), - SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint); + SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint, + OutCommand); if (!SyclKernelImpl && !Kernel) { Plugin->call(PiKernel); @@ -2722,6 +2724,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { ? 
nullptr : &MEvent->getHandleRef(); sycl::detail::pi::PiExtSyncPoint OutSyncPoint; + sycl::detail::pi::PiExtCommandBufferCommand OutCommand; switch (MCommandGroup->getType()) { case CG::CGTYPE::Kernel: { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); @@ -2743,8 +2746,10 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { } auto result = enqueueImpCommandBufferKernel( MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer, - *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc); + *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand, + getMemAllocationFunc); MEvent->setSyncPoint(OutSyncPoint); + MEvent->setCommandBufferCommand(OutCommand); return result; } case CG::CGTYPE::CopyUSM: { @@ -3395,6 +3400,67 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { } } +UpdateCommandBufferCommand::UpdateCommandBufferCommand( + QueueImplPtr Queue, + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes) + : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph), + MNodes(Nodes) {} + +pi_int32 UpdateCommandBufferCommand::enqueueImp() { + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; + auto RawEvents = getPiEvents(EventImpls); + flushCrossQueueDeps(EventImpls, getWorkerQueue()); + + for (auto &Node : MNodes) { + auto CG = static_cast(Node->MCommandGroup.get()); + for (auto &Arg : CG->MArgs) { + if (Arg.MType != kernel_param_kind_t::kind_accessor) { + continue; + } + // Search through deps to get actual allocation for accessor args. 
+ for (const DepDesc &Dep : MDeps) { + Requirement *Req = static_cast(Arg.MPtr); + if (Dep.MDepRequirement == Req) { + if (Dep.MAllocaCmd) { + Req->MData = Dep.MAllocaCmd->getMemAllocation(); + } else { + throw sycl::exception(make_error_code(errc::invalid), + "No allocation available for accessor when " + "updating command buffer!"); + } + } + } + } + MGraph->updateImpl(Node); + } + + return PI_SUCCESS; +} + +void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const { + Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; + + Stream << "ID = " << this << "\\n"; + Stream << "CommandBuffer Command Update" + << "\\n"; + + Stream << "\"];" << std::endl; + + for (const auto &Dep : MDeps) { + Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\"" + << " [ label = \"Access mode: " + << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n" + << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]" + << std::endl; + } +} + +void UpdateCommandBufferCommand::emitInstrumentationData() {} +bool UpdateCommandBufferCommand::producesPiEvent() const { return false; } + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8dc12120bdd9a..8ba0cceee9e6a 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -25,6 +25,11 @@ namespace sycl { inline namespace _V1 { + +namespace ext::oneapi::experimental::detail { +class exec_graph_impl; +class node_impl; +} // namespace ext::oneapi::experimental::detail namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -114,6 +119,7 @@ class Command { HOST_TASK, FUSION, EXEC_CMD_BUFFER, + UPDATE_CMD_BUFFER }; Command(CommandType Type, QueueImplPtr Queue, @@ -755,6 +761,26 @@ class KernelFusionCommand : public Command { FusionStatus MStatus; }; +class UpdateCommandBufferCommand : public Command { +public: + explicit 
UpdateCommandBufferCommand( + QueueImplPtr Queue, + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes); + + void printDot(std::ostream &Stream) const final; + void emitInstrumentationData() final; + bool producesPiEvent() const final; + +private: + pi_int32 enqueueImp() final; + + ext::oneapi::experimental::detail::exec_graph_impl *MGraph; + std::vector> + MNodes; +}; + // Enqueues a given kernel to a PiExtCommandBuffer pi_int32 enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, @@ -762,6 +788,7 @@ pi_int32 enqueueImpCommandBufferKernel( const CGExecKernel &CommandGroup, std::vector &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function &getMemAllocationFunc); // Sets arguments for a given kernel and device based on the argument type. diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 7b50192cf3b43..f0c5dc670aa05 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -14,6 +14,7 @@ #if SYCL_EXT_CODEPLAY_KERNEL_FUSION #include #endif +#include #include #include #include @@ -1679,6 +1680,96 @@ bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) { return FusionList->second->isActive(); } +Command *Scheduler::GraphBuilder::addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events, + std::vector &ToEnqueue) { + auto NewCmd = + std::make_unique(Queue, Graph, Nodes); + // If there are multiple requirements for the same memory object, its + // AllocaCommand creation will be dependent on the access mode of the first + // requirement. Combine these access modes to take all of them into account. 
+ combineAccessModesOfReqs(Requirements); + std::vector ToCleanUp; + for (Requirement *Req : Requirements) { + MemObjRecord *Record = nullptr; + AllocaCommandBase *AllocaCmd = nullptr; + + bool isSameCtx = false; + + { + + Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue); + markModifiedIfWrite(Record, Req); + + AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); + + isSameCtx = sameCtx(Queue->getContextImplPtr(), Record->MCurContext); + } + + if (!isSameCtx) { + // Cannot directly copy memory from OpenCL device to OpenCL device - + // create two copies: device->host and host->device. + bool NeedMemMoveToHost = false; + auto MemMoveTargetQueue = Queue; + + if (!Queue->is_host() && !Record->MCurContext->is_host()) + NeedMemMoveToHost = true; + + if (NeedMemMoveToHost) + insertMemoryMove(Record, Req, + Scheduler::getInstance().getDefaultHostQueue(), + ToEnqueue); + insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); + } + std::set Deps = + findDepsForReq(Record, Req, Queue->getContextImplPtr()); + + for (Command *Dep : Deps) { + if (Dep != NewCmd.get()) { + Command *ConnCmd = + NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp); + if (ConnCmd) + ToEnqueue.push_back(ConnCmd); + } + } + } + + // Set new command as user for dependencies and update leaves. + // Node dependencies can be modified further when adding the node to leaves, + // iterate over their copy. 
+ // FIXME employ a reference here to eliminate copying of a vector + std::vector Deps = NewCmd->MDeps; + for (DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp); + addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue); + } + + // Register all the events as dependencies + for (detail::EventImplPtr e : Events) { + if (e->getCommand() && + e->getCommand() == static_cast(NewCmd.get())) { + continue; + } + if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp)) + ToEnqueue.push_back(ConnCmd); + } + + if (MPrintOptionsArray[AfterAddCG]) + printGraphAsDot("after_addCG"); + + for (Command *Cmd : ToCleanUp) { + cleanupCommand(Cmd); + } + + return NewCmd.release(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index a83298a628539..8bfaf07c31c22 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -8,6 +8,7 @@ #include "detail/sycl_mem_obj_i.hpp" #include +#include #include #include #include @@ -670,6 +671,51 @@ KernelFusionCommand *Scheduler::isPartOfActiveFusion(Command *Cmd) { } } +EventImplPtr Scheduler::addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events) { + std::vector AuxiliaryCmds; + EventImplPtr NewCmdEvent = nullptr; + + { + WriteLockT Lock = acquireWriteLock(); + + Command *NewCmd = MGraphBuilder.addCommandGraphUpdate( + Graph, Nodes, Queue, Requirements, Events, AuxiliaryCmds); + if (!NewCmd) + return nullptr; + NewCmdEvent = NewCmd->getEvent(); + } + + std::vector ToCleanUp; + { + ReadLockT Lock = acquireReadLock(); + EnqueueResultT Res; + bool Enqueued; + + for (Command *Cmd : AuxiliaryCmds) { + 
Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } + + if (Command *NewCmd = static_cast(NewCmdEvent->getCommand())) { + Enqueued = + GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } + } + + cleanupCommands(ToCleanUp); + return NewCmdEvent; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 53ce295626045..d9068292ed800 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -173,6 +173,10 @@ class MockScheduler; namespace sycl { inline namespace _V1 { +namespace ext::oneapi::experimental::detail { +class exec_graph_impl; +class node_impl; +} // namespace ext::oneapi::experimental::detail namespace detail { class queue_impl; class event_impl; @@ -468,6 +472,19 @@ class Scheduler { std::vector &AuxilaryCmds, BlockingT Blocking = NON_BLOCKING); + /// Adds a command buffer update operation to the execution graph. This is + /// required when buffers/accessors are updated to ensure that the memory has + /// been allocated when updating. + /// \param Graph The executable graph to be updated. + /// \param Nodes The list of Nodes which are to be updated in the graph. + /// \param Requirements List of accessor requirements for this update. 
+ EventImplPtr addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events); + protected: using RWLockT = std::shared_timed_mutex; using ReadLockT = std::shared_lock; @@ -668,6 +685,21 @@ class Scheduler { bool isInFusionMode(QueueIdT queue); + /// Adds a command buffer update operation to the execution graph. This is + /// required when buffers/accessors are updated to ensure that the memory + /// has been allocated when updating. + /// \param Graph The executable graph to be updated. + /// \param Nodes The list of Nodes which are to be updated in the graph. + /// \param Requirements List of accessor requirements for this update. + Command *addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector< + std::shared_ptr> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events, + std::vector &ToEnqueue); + std::vector MMemObjs; private: diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 94baf075b8c23..30cecd7abe374 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -564,6 +564,8 @@ event handler::finalize() { // Associate an event with this new node and return the event. 
GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); + NodeImpl->MNDRangeUsed = MImpl->MNDRangeUsed; + return detail::createSyclObjFromImpl(EventImpl); } @@ -1524,5 +1526,7 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { return {std::array{0, 0, 0}, false}; } +void handler::setNDRangeUsed(bool Value) { MImpl->MNDRangeUsed = Value; } + } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp new file mode 100644 index 0000000000000..ab1a3230d4987 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp @@ -0,0 +1,63 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node before finalization + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + 
InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + // Swap PtrB to be the input + InputParam.update(PtrB); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // Only PtrB should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == 0); + assert(HostDataB[i] == i); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp new file mode 100644 index 0000000000000..ee9818a7b129c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + nd_range<1> NDRange{range{1024}, range{32}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + PtrA[GlobalID] 
+= GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // All of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + KernelNode.update_nd_range(nd_range<1>{range{512}, range{32}}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_range.cpp b/sycl/test-e2e/Graph/Explicit/update_range.cpp new file mode 100644 index 0000000000000..25468020fd291 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_range.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + range<1> Range{1024}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(Range, [=](item<1> Item) {
+ size_t GlobalID = Item.get_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // All of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update range to target first half only + KernelNode.update_range(range<1>{512}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp new file mode 100644 index 0000000000000..84f526c5db2e2 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp @@ -0,0 +1,72 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node accessor argument using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer
BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + exp_ext::dynamic_parameter InputParam(BufA.get_access(), Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, Acc) + auto Acc = BufA.get_access(cgh); + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + Acc[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap BufB to be the input + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..c58804d226106 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp @@ -0,0 +1,79 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK 
%} +// +// Unsupported: opencl, level_zero + +// Tests creating multiple executable graphs from the same modifiable graph and +// only updating one of them. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updateable{}); + + // PtrA values should be modified twice + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Only update ExecGraph, which should now modify PtrB while ExecGraph2 + // modifies PtrA still + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + // A should have been modified 3 times by now, B only once + assert(HostDataA[i] == i * 3); + 
assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp new file mode 100644 index 0000000000000..ccb74168f5ac1 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests that updating a graph is ordered with respect to previous executions of +// the graph which may be in flight. 
+ +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + // Use a large N to try and make the kernel slow + const size_t N = 1 << 16; + // Loop inside kernel to make even slower (too large N runs out of memory) + const size_t NumKernelLoops = 4; + const size_t NumSubmitLoops = 8; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t j = 0; j < NumKernelLoops; j++) { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // Submit a bunch of graphs without waiting + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + + // Submit another set of graphs then wait on all submissions + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops); + assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp new file mode 100644 
index 0000000000000..26030709914f9 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp @@ -0,0 +1,72 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + 
ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp new file mode 100644 index 0000000000000..d78efb98d0fc4 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp @@ -0,0 +1,82 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrUnused = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataUnused(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + Queue.memset(PtrUnused, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // 
currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + assert(HostDataUnused[i] == 0); + } + + // Swap PtrUnused to be the input, then swap to PtrB without executing + InputParam.update(PtrUnused); + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + // Check that PtrUnused was never actually used in a kernel + assert(HostDataUnused[i] == 0); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp new file mode 100644 index 0000000000000..74fddf4b447eb --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp @@ -0,0 +1,87 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 
2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), 
N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp new file mode 100644 index 0000000000000..5e5af9afd13d6 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp @@ -0,0 +1,86 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating multiple parameters to a single graph node using index-based +// explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + std::vector OutData(N); + + std::iota(HostDataA.begin(), HostDataA.end(), 10); + std::iota(HostDataB.begin(), HostDataB.end(), 100); + + Queue.memcpy(PtrA, HostDataA.data(), N * sizeof(int)).wait(); + Queue.memcpy(PtrB, HostDataB.data(), N * sizeof(int)).wait(); + Queue.memset(PtrC, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter ParamA(PtrA, Graph); + exp_ext::dynamic_parameter ParamB(PtrB, Graph); + exp_ext::dynamic_parameter ParamOut(PtrC,
Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input pointers, we should be using set_arg but can't + // currently test that with CUDA e.g. + // cgh.set_arg(0, PtrA) + ParamOut.register_with_node(cgh, 0); + ParamA.register_with_node(cgh, 1); + ParamB.register_with_node(cgh, 2); + cgh.parallel_for(range<1>{Size}, [=](item<1> Item) { + size_t ID = Item.get_id(); + PtrC[ID] += PtrA[ID] * PtrB[ID]; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrC, OutData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(OutData[i] == HostDataC[i] + (HostDataA[i] * HostDataB[i])); + } + + // Update C's host data + HostDataC = OutData; + + // Swap PtrB to be the output and PtrC to be an input + ParamOut.update(PtrB); + ParamB.update(PtrC); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrB, OutData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(OutData[i] == HostDataB[i] + (HostDataA[i] * HostDataC[i])); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp new file mode 100644 index 0000000000000..6b313760e55e7 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp @@ -0,0 +1,82 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env
SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node in an executable graph that was used as a +// subgraph node in another executable graph is not reflected in the graph +// containing the subgraph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(SubGraph); + + auto SubKernelNode = SubGraph.add([&](handler &cgh) { + // Register the input pointer, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(0, PtrA) + InputParam.register_with_node(cgh, 0); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }); + + auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updateable{}); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + Graph.add([&](handler &cgh) { cgh.ext_oneapi_graph(SubExecGraph); }, + exp_ext::property::node::depends_on{KernelNode}); + + // Finalize the parent graph with the original values + auto ExecGraph = Graph.finalize(); + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Update the executable graph that was used as a subgraph with the new value, + // this should not affect ExecGraph + SubExecGraph.update(SubKernelNode); + // Only PtrA should be filled with values + 
Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp new file mode 100644 index 0000000000000..f5363a2bae880 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp @@ -0,0 +1,67 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Unsupported: opencl, level_zero + +// Tests updating a graph node scalar argument using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *DeviceData = malloc_device(N, Queue); + + int ScalarValue = 17; + + std::vector HostData(N); + + Queue.memset(DeviceData, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(ScalarValue, Graph); + + auto KernelNode = Graph.add([&](handler &cgh) { + // Register the input scalar, we should be using set_arg but can't + // currently test that with CUDA + // cgh.set_arg(1, ScalarValue) + InputParam.register_with_node(cgh, 1); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + DeviceData[i] = ScalarValue; + } + }); + }); + + auto ExecGraph = 
Graph.finalize(exp_ext::property::graph::updateable{}); + + // DeviceData should be filled with current ScalarValue (17) + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == 17); + } + + // Update ScalarValue to be 99 instead + InputParam.update(99); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == 99); + } + return 0; +} diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index 13b555bec8880..eed3998d171d7 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -5,7 +5,6 @@ # RUN: env LLVM_BIN_PATH=%llvm_build_bin_dir %python %sycl_tools_src_dir/abi_check.py --mode check_symbols --reference %s %sycl_libs_dir/libpi_cuda.so # REQUIRES: linux -# REQUIRES: cuda_be # UNSUPPORTED: libcxx piContextCreate @@ -99,6 +98,7 @@ piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease piextCommandBufferRetain +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 7a90e461a30f6..6fbf2b4cfc6fd 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -98,6 +98,7 @@ piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease piextCommandBufferRetain +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 159e427835651..362cc26e18298 100644 --- 
a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -81,8 +81,8 @@ piSamplerGetInfo piSamplerRelease piSamplerRetain piTearDown -piextCommandBufferAdviseUSM piextBindlessImageSamplerCreate +piextCommandBufferAdviseUSM piextCommandBufferCreate piextCommandBufferFillUSM piextCommandBufferFinalize @@ -98,6 +98,7 @@ piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease piextCommandBufferRetain +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 4fe3042f807ab..ff1d46525f5ad 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3717,16 +3717,29 @@ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_ _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKSt6vectorISt4byteSaIS9_EE _ZN4sycl3_V13ext6oneapi12experimental33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi1EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi2EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi3EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi1EEEvNS0_8nd_rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi2EEEvNS0_8nd_rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi3EEEvNS0_8nd_rangeIXT_EEE _ZN4sycl3_V13ext6oneapi12experimental4node19get_node_from_eventENS0_5eventE 
_ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev _ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_ +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPvm +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPNS0_6detail16AccessorBaseHostE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base18register_with_nodeERNS0_7handlerEi +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEE _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph12finalizeImplEv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_13command_graphILNS3_11graph_stateE0EEE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_4nodeE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE 
+_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERKSt6vectorINS0_5queueESaIS7_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingEv @@ -4111,6 +4124,7 @@ _ZN4sycl3_V17handler10mem_adviseEPKvmi _ZN4sycl3_V17handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE _ZN4sycl3_V17handler13getKernelNameB5cxx11Ev +_ZN4sycl3_V17handler14setNDRangeUsedEb _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_ _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleEPvRKNS4_16image_descriptorE _ZN4sycl3_V17handler15ext_oneapi_copyEPvNS0_3ext6oneapi12experimental16image_mem_handleERKNS5_16image_descriptorE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 435176097e7bd..8b7a5380eb29b 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -506,6 +506,9 @@ ??0device_image_plain@detail@_V1@sycl@@QEAA@AEBV?$shared_ptr@Vdevice_image_impl@detail@_V1@sycl@@@std@@@Z ??0device_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0device_selector@_V1@sycl@@QEAA@XZ +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z @@ -532,7 +535,7 @@ 
??0exception_list@_V1@sycl@@QEAA@$$QEAV012@@Z ??0exception_list@_V1@sycl@@QEAA@AEBV012@@Z ??0exception_list@_V1@sycl@@QEAA@XZ -??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@@Z +??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@AEBVproperty_list@56@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0filter_selector@ONEAPI@_V1@sycl@@QEAA@$$QEAV0123@@Z @@ -664,6 +667,7 @@ ??1device@_V1@sycl@@QEAA@XZ ??1device_image_plain@detail@_V1@sycl@@QEAA@XZ ??1device_selector@_V1@sycl@@UEAA@XZ +??1dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1event@_V1@sycl@@QEAA@XZ ??1exception@_V1@sycl@@UEAA@XZ ??1exception_list@_V1@sycl@@QEAA@XZ @@ -740,6 +744,8 @@ ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4device_selector@_V1@sycl@@QEAAAEAV012@AEBV012@@Z +??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@$$QEAV0123456@@Z +??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@AEBV0123456@@Z ??4event@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4event@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4exception@_V1@sycl@@QEAAAEAV012@AEBV012@@Z @@ -1464,6 +1470,7 @@ ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z +?register_with_node@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEAVhandler@67@H@Z 
?release@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAVSYCLMemObjI@234@PEAXV?$vector@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@V?$allocator@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@2@@6@AEAPEAU_pi_event@@@Z ?releaseHostMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXPEAX@Z ?releaseMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAX@Z @@ -1496,6 +1503,7 @@ ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4_pi_kernel_cache_config@@@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z +?setNDRangeUsed@handler@_V1@sycl@@AEAAX_N@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ ?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ @@ -5126,8 +5134,12 @@ ?unsampledImageDestructorNotification@image_plain@detail@_V1@sycl@@IEAAXPEAX@Z ?unset_flag@stream@_V1@sycl@@AEBAXI@Z ?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$command_graph@$0A@@34567@@Z +?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBVnode@34567@@Z +?updateAccessor@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEAVAccessorBaseHost@267@@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXQEAX@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXXZ +?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEAX_K@Z ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z ?usesPinnedHostMemory@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ diff --git a/sycl/unittests/Extensions/CommandGraph.cpp 
b/sycl/unittests/Extensions/CommandGraph.cpp index 5ece9b1391baf..b375d564674b0 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -2545,3 +2545,143 @@ TEST_F(CommandGraphTest, FillMemsetNodes) { sycl::free(USMPtr, Queue); } } + +TEST_F(CommandGraphTest, UpdatableException) { + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraphUpdatable = + Graph.finalize(experimental::property::graph::updateable{}); + + EXPECT_NO_THROW(ExecGraphUpdatable.update(Node)); + + auto ExecGraphNoUpdatable = Graph.finalize(); + + // Graph without the property should throw + EXPECT_ANY_THROW(ExecGraphNoUpdatable.update(Node)); +} + +TEST_F(CommandGraphTest, DynamicParamRegister) { + // Check that registering a dynamic param with a node from a graph that was + // not passed to its constructor throws. + experimental::dynamic_parameter DynamicParam(Graph); + + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto Node = OtherGraph.add([&](sycl::handler &cgh) { + // This should throw since OtherGraph is not associated with DynamicParam + EXPECT_ANY_THROW(DynamicParam.register_with_node(cgh, 0)); + cgh.single_task>([]() {}); + }); +} + +TEST_F(CommandGraphTest, UpdateNodeNotInGraph) { + // Check that updating a graph with a node which is not part of that graph is + // an error. 
+ + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto OtherNode = OtherGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updateable{}); + EXPECT_ANY_THROW(ExecGraph.update(OtherNode)); +} + +TEST_F(CommandGraphTest, UpdateWithUnchangedNode) { + // Tests that updating a graph with a node with unchanged + // parameters is not an error + + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updateable{}); + EXPECT_NO_THROW(ExecGraph.update(Node)); +} + +TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { + // Check that registering a dynamic parameter with various node types either + // throws or does not throw as appropriate + + // Allocate some pointers for memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + experimental::dynamic_parameter DynamicParam{Graph}; + + ASSERT_NO_THROW(auto NodeKernel = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.single_task>([]() {}); + })); + + ASSERT_ANY_THROW(auto NodeMemcpy = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemset = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.memset(PtrB, 7, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.fill(PtrB, 7, 16); + })); + + ASSERT_ANY_THROW(auto NodePrefetch = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.prefetch(PtrA, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemadvise = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.mem_advise(PtrA, 16 
* sizeof(int), 1); + })); + + ASSERT_ANY_THROW(auto NodeHostTask = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.host_task([]() {}); + })); + + auto NodeEmpty = Graph.add(); + + experimental::command_graph Subgraph(Queue.get_context(), Dev); + // Add an empty node to the subgraph + Subgraph.add(); + + auto SubgraphExec = Subgraph.finalize(); + ASSERT_ANY_THROW(auto NodeSubgraph = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.ext_oneapi_graph(SubgraphExec); + })); +} + +TEST_F(CommandGraphTest, UpdateRangeErrors) { + // Test that the correct errors are thrown when trying to update node ranges + + nd_range<1> NDRange{range{128}, range{32}}; + range<1> Range{128}; + auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(NDRange, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); + // Can't update an nd_range node with a range + EXPECT_ANY_THROW(NodeNDRange.update_range(Range)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeNDRange.update_nd_range( + nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); + + auto NodeRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(range<1>{128}, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeRange.update_range(Range)); + // Can't update a range node with an nd_range + EXPECT_ANY_THROW(NodeRange.update_nd_range(NDRange)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 5ab408d2eed01..6c992c95c44ce 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -207,7 +207,8 @@ inline pi_result mock_piDeviceGetInfo(pi_device device, case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: case PI_DEVICE_INFO_AVAILABLE: case PI_DEVICE_INFO_LINKER_AVAILABLE:
- case PI_DEVICE_INFO_COMPILER_AVAILABLE: { + case PI_DEVICE_INFO_COMPILER_AVAILABLE: + case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT: { if (param_value) *static_cast(param_value) = PI_TRUE; if (param_value_size_ret) @@ -1316,7 +1317,7 @@ inline pi_result mock_piextCommandBufferNDRangeKernel( const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, - pi_ext_sync_point *sync_point) { + pi_ext_sync_point *sync_point, pi_ext_command_buffer_command *command) { return PI_SUCCESS; } @@ -1373,6 +1374,12 @@ inline pi_result mock_piextEnqueueCommandBuffer( return PI_SUCCESS; } +inline pi_result mock_piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return PI_SUCCESS; +} + inline pi_result mock_piextCommandBufferMemBufferCopy( pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, From 384b9aa1e3b22b1345a7b515b17f809a8ad1a009 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 6 Mar 2024 14:14:56 +0000 Subject: [PATCH 02/26] Spell updatable consistently, add missing comments --- sycl/doc/design/CommandGraph.md | 4 ++-- sycl/include/sycl/detail/pi.h | 2 +- sycl/include/sycl/detail/property_helper.hpp | 2 +- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 6 +++--- sycl/plugins/unified_runtime/pi2ur.hpp | 2 +- sycl/source/detail/graph_impl.cpp | 10 +++++----- sycl/source/detail/graph_impl.hpp | 4 ++-- sycl/source/detail/scheduler/scheduler.hpp | 3 +++ .../test-e2e/Graph/Explicit/update_before_finalize.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_nd_range.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_range.cpp | 2 +- .../Graph/Explicit/update_with_indices_accessor.cpp | 2 +- .../update_with_indices_multiple_exec_graphs.cpp | 4 ++-- 
.../Graph/Explicit/update_with_indices_ordering.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr.cpp | 2 +- .../Explicit/update_with_indices_ptr_double_update.cpp | 2 +- .../update_with_indices_ptr_multiple_nodes.cpp | 2 +- .../update_with_indices_ptr_multiple_params.cpp | 2 +- .../Explicit/update_with_indices_ptr_subgraph.cpp | 2 +- .../Graph/Explicit/update_with_indices_scalar.cpp | 2 +- sycl/unittests/Extensions/CommandGraph.cpp | 6 +++--- 21 files changed, 34 insertions(+), 31 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 9cbf7b129b3ec..68bcbfd8fd34b 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -263,8 +263,8 @@ different recordings we can then match parameter order when updating. ### Scheduler Integration -Graph updates in the runtime are sychronous calls however they can optionally be -done through the scheduler using a new command, +Graph updates in the runtime are synchronous calls however they can optionally +be done through the scheduler using a new command, `sycl::detail::UpdateCommandBufferCommand`. This is needed when dealing with accessor updates. 
Since a new buffer which the user creates for updating may not yet have been lazily initialized on device we schedule a new command which has diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index ffd140825d2fd..749a91a8f525d 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2331,7 +2331,7 @@ typedef enum { struct pi_ext_command_buffer_desc final { pi_ext_structure_type stype; const void *pNext; - pi_bool is_updateable; + pi_bool is_updatable; }; // Command Buffer Update types diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 1e1c91590d231..f438b5098065e 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -47,7 +47,7 @@ enum DataLessPropKind { GraphAssumeDataOutlivesBuffer = 22, GraphAssumeBufferOutlivesGraph = 23, GraphDependOnAllLeaves = 24, - GraphUpdateable = 25, + GraphUpdatable = 25, // Indicates the last known dataless property. LastKnownDataLessPropKind = 25, // Exceeding 32 may cause ABI breaking change on some of OSes. diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index d718021f0b90e..057bb12db2be0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -162,10 +162,10 @@ class assume_buffer_outlives_graph /// Property passed to command_graph::finalize() to /// mark the resulting executable command_graph as able to be updated. 
-class updateable - : public ::sycl::detail::DataLessProperty<::sycl::detail::GraphUpdateable> { +class updatable + : public ::sycl::detail::DataLessProperty<::sycl::detail::GraphUpdatable> { public: - updateable() = default; + updatable() = default; }; } // namespace graph diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index ac77041ca10f9..244f7ddcb3253 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4467,7 +4467,7 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_device_handle_t UrDevice = reinterpret_cast(Device); ur_exp_command_buffer_desc_t UrDesc; UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; - UrDesc.isUpdatable = Desc->is_updateable; + UrDesc.isUpdatable = Desc->is_updatable; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 913499bceb763..eb5f12cb3fbe5 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -702,7 +702,7 @@ void exec_graph_impl::createCommandBuffers( sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; sycl::detail::pi::PiExtCommandBufferDesc Desc{ pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, - MIsUpdateable}; + MIsUpdatable}; auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); @@ -763,11 +763,11 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, const property_list &PropList) : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), MRequirements(), MExecutionEvents(), - MIsUpdateable(PropList.has_property()) { + MIsUpdatable(PropList.has_property()) { - // If the graph has been marked as updateable then check if the backend + // If the graph has been marked as updatable 
then check if the backend // actually supports that. - if (MIsUpdateable) { + if (MIsUpdatable) { pi_bool SupportsUpdate = PI_FALSE; bool CallSuccessful = sycl::detail::getSyclObjImpl(MContext) @@ -1156,7 +1156,7 @@ void exec_graph_impl::update(std::shared_ptr Node) { void exec_graph_impl::update( const std::vector> Nodes) { - if (!MIsUpdateable) { + if (!MIsUpdatable) { throw sycl::exception(sycl::make_error_code(errc::invalid), "update() cannot be called on a executable graph " "which was not created with property::updatable"); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index f2eededf0c1e1..b6affd8df5eaf 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1406,8 +1406,8 @@ class exec_graph_impl { std::unordered_map, sycl::detail::pi::PiExtCommandBufferCommand> MCommandMap; - - bool MIsUpdateable; + /// True if this graph can be updated (set with property::updatable) + bool MIsUpdatable; // Stores a cache of node ids from modifiable graph nodes to the companion // node(s) in this graph. Used for quick access when updating this graph. diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d9068292ed800..ecad8ede1e9ff 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -478,6 +478,7 @@ class Scheduler { /// \param Graph The executable graph to be updated. /// \param Nodes The list of Nodes which are to be updated in the graph. /// \param Requirements List of accessor requirements for this update. + /// \param Events List of events that this update operation depends on EventImplPtr addCommandGraphUpdate( ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector> @@ -691,6 +692,8 @@ class Scheduler { /// \param Graph The executable graph to be updated. /// \param Nodes The list of Nodes which are to be updated in the graph. 
/// \param Requirements List of accessor requirements for this update. + /// \param Events List of events that this operation depends on. + /// \param ToEnqueue List of commands which need to be enqueued. Command *addCommandGraphUpdate( ext::oneapi::experimental::detail::exec_graph_impl *Graph, std::vector< diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp index ab1a3230d4987..a1827e56de3fa 100644 --- a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp @@ -47,7 +47,7 @@ int main() { // Swap PtrB to be the input InputParam.update(PtrB); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // Only PtrB should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp index ee9818a7b129c..f0a8928c5b633 100644 --- a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp @@ -37,7 +37,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // first half of PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_range.cpp b/sycl/test-e2e/Graph/Explicit/update_range.cpp index 25468020fd291..e04adf8cb453e 100644 --- a/sycl/test-e2e/Graph/Explicit/update_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_range.cpp @@ -37,7 +37,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // first half of PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); 
diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp index 84f526c5db2e2..ed317c97a5085 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp @@ -47,7 +47,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // BufA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp index c58804d226106..0764327407da1 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp @@ -46,8 +46,8 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); - auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{}); // PtrA values should be modified twice Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp index ccb74168f5ac1..9f976f8bbb56f 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp @@ -52,7 +52,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // Submit a bunch of graphs without waiting for (size_t i = 0; i < NumSubmitLoops; i++) { diff --git 
a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp index 26030709914f9..838c578358999 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp @@ -45,7 +45,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp index d78efb98d0fc4..b1443f62a33b3 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp @@ -48,7 +48,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp index 74fddf4b447eb..b725bd137efa1 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp @@ -60,7 +60,7 @@ int main() { }, exp_ext::property::node::depends_on{KernelNodeA}); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp 
b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp index 5e5af9afd13d6..7e62d8be9b3f8 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp @@ -56,7 +56,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp index 6b313760e55e7..6c068445193c4 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp @@ -48,7 +48,7 @@ int main() { }); }); - auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updateable{}); + auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updatable{}); auto KernelNode = Graph.add([&](handler &cgh) { cgh.single_task([=]() { diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp index f5363a2bae880..3ffa2e3b0b9a2 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp @@ -44,7 +44,7 @@ int main() { }); }); - auto ExecGraph = Graph.finalize(exp_ext::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); // DeviceData should be filled with current ScalarValue (17) Queue.ext_oneapi_graph(ExecGraph).wait(); diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index b375d564674b0..db9f33335d718 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -2551,7 
+2551,7 @@ TEST_F(CommandGraphTest, UpdatableException) { [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto ExecGraphUpdatable = - Graph.finalize(experimental::property::graph::updateable{}); + Graph.finalize(experimental::property::graph::updatable{}); EXPECT_NO_THROW(ExecGraphUpdatable.update(Node)); @@ -2584,7 +2584,7 @@ TEST_F(CommandGraphTest, UpdateNodeNotInGraph) { auto OtherNode = OtherGraph.add( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); - auto ExecGraph = Graph.finalize(experimental::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); EXPECT_ANY_THROW(ExecGraph.update(OtherNode)); } @@ -2595,7 +2595,7 @@ TEST_F(CommandGraphTest, UpdateWithUnchangedNode) { auto Node = Graph.add( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); - auto ExecGraph = Graph.finalize(experimental::property::graph::updateable{}); + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); EXPECT_NO_THROW(ExecGraph.update(Node)); } From dfef9b3a3a10d0ff560d9b479bce6b2d935b3f37 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 6 Mar 2024 15:26:58 +0000 Subject: [PATCH 03/26] Re-add update tests --- .../Extensions/CommandGraph/CMakeLists.txt | 1 + .../Extensions/CommandGraph/Update.cpp | 152 ++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 sycl/unittests/Extensions/CommandGraph/Update.cpp diff --git a/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt b/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt index 712d7345fd895..7fb1849268236 100644 --- a/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt +++ b/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt @@ -8,4 +8,5 @@ add_sycl_unittest(CommandGraphExtensionTests OBJECT MultiThreaded.cpp Queries.cpp Subgraph.cpp + Update.cpp ) diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp new file mode 100644 index 
0000000000000..7d1e1b8cfa8f7 --- /dev/null +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -0,0 +1,152 @@ +//==----------------------------- Update.cpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Common.hpp" + +using namespace sycl; +using namespace sycl::ext::oneapi; + +TEST_F(CommandGraphTest, UpdatableException) { + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraphUpdatable = + Graph.finalize(experimental::property::graph::updatable{}); + + EXPECT_NO_THROW(ExecGraphUpdatable.update(Node)); + + auto ExecGraphNoUpdatable = Graph.finalize(); + + // Graph without the property should throw + EXPECT_ANY_THROW(ExecGraphNoUpdatable.update(Node)); +} + +TEST_F(CommandGraphTest, DynamicParamRegister) { + // Check that registering a dynamic param with a node from a graph that was + // not passed to its constructor throws. + experimental::dynamic_parameter DynamicParam(Graph); + + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto Node = OtherGraph.add([&](sycl::handler &cgh) { + // This should throw since OtherGraph is not associated with DynamicParam + EXPECT_ANY_THROW(DynamicParam.register_with_node(cgh, 0)); + cgh.single_task>([]() {}); + }); +} + +TEST_F(CommandGraphTest, UpdateNodeNotInGraph) { + // Check that updating a graph with a node which is not part of that graph is + // an error. 
+ + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto OtherNode = OtherGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); + EXPECT_ANY_THROW(ExecGraph.update(OtherNode)); +} + +TEST_F(CommandGraphTest, UpdateWithUnchangedNode) { + // Tests that updating a graph with a node with unchanged + // parameters is not an error + + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); + EXPECT_NO_THROW(ExecGraph.update(Node)); +} + +TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { + // Check that registering a dynamic parameter with various node types either + // throws or does not throw as appropriate + + // Allocate some pointers for memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + experimental::dynamic_parameter DynamicParam{Graph}; + + ASSERT_NO_THROW(auto NodeKernel = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.single_task>([]() {}); + })); + + ASSERT_ANY_THROW(auto NodeMemcpy = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemset = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.memset(PtrB, 7, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.fill(PtrB, 7, 16); + })); + + ASSERT_ANY_THROW(auto NodePrefetch = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.prefetch(PtrA, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemadvise = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.mem_advise(PtrA, 16 * 
sizeof(int), 1); + })); + + ASSERT_ANY_THROW(auto NodeHostTask = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.host_task([]() {}); + })); + + auto NodeEmpty = Graph.add(); + + experimental::command_graph Subgraph(Queue.get_context(), Dev); + // Add an empty node to the subgraph + Subgraph.add(); + + auto SubgraphExec = Subgraph.finalize(); + ASSERT_ANY_THROW(auto NodeSubgraph = Graph.add([&](sycl::handler &cgh) { + DynamicParam.register_with_node(cgh, 0); + cgh.ext_oneapi_graph(SubgraphExec); + })); +} + +TEST_F(CommandGraphTest, UpdateRangeErrors) { + // Test that the correct errors are thrown when trying to update node ranges + + nd_range<1> NDRange{range{128}, range{32}}; + range<1> Range{128}; + auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(NDRange, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); + // Can't update an nd_range node with a range + EXPECT_ANY_THROW(NodeNDRange.update_range(Range)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeNDRange.update_nd_range( + nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); + + auto NodeRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(range<1>{128}, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeRange.update_range(Range)); + // Can't update a range node with an nd_range + EXPECT_ANY_THROW(NodeRange.update_nd_range(NDRange)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); +} From 149611d03128ed6def3d33f434ab65ea463a916b Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 6 Mar 2024 16:40:36 +0000 Subject: [PATCH 04/26] Update PI native_cpu dump file --- sycl/test/abi/pi_nativecpu_symbol_check.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 1929f3871cfe0..3849a40f7fd24 100644 --- 
a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -99,6 +99,7 @@ piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease piextCommandBufferRetain +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter From 5b8952777ce0bebc4943f69bcfbc56e703dde421 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 6 Mar 2024 18:14:16 +0000 Subject: [PATCH 05/26] Fix inconsistencies between spec and implementation --- .../sycl/ext/oneapi/experimental/graph.hpp | 17 +++++++++++------ .../Graph/Explicit/update_before_finalize.cpp | 2 +- .../Explicit/update_with_indices_accessor.cpp | 2 +- ...update_with_indices_multiple_exec_graphs.cpp | 2 +- .../Explicit/update_with_indices_ordering.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr.cpp | 2 +- .../update_with_indices_ptr_double_update.cpp | 2 +- .../update_with_indices_ptr_multiple_nodes.cpp | 2 +- .../update_with_indices_ptr_multiple_params.cpp | 6 +++--- .../update_with_indices_ptr_subgraph.cpp | 2 +- .../Explicit/update_with_indices_scalar.cpp | 2 +- .../Extensions/CommandGraph/Update.cpp | 4 ++-- 12 files changed, 25 insertions(+), 20 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 057bb12db2be0..a4610f8591f02 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -452,13 +452,15 @@ class dynamic_parameter : public detail::dynamic_parameter_base { : sycl::detail::kernel_param_kind_t::kind_std_layout; public: - dynamic_parameter(experimental::command_graph Graph) + /// Constructs a new dynamic parameter. + /// @param Graph The graph associated with this parameter. + /// @param Param A reference value for this parameter used for CTAD. 
+ dynamic_parameter(experimental::command_graph Graph, + const ValueT &Param) : detail::dynamic_parameter_base(Graph), MValue() {} - dynamic_parameter(ValueT InitialValue, - experimental::command_graph Graph) - : detail::dynamic_parameter_base(Graph), MValue(InitialValue) {} - + /// Updates this dynamic parameter and all registered nodes with a new value. + /// @param NewValue The new value for the parameter. void update(const ValueT &NewValue) { MValue = NewValue; if constexpr (IsAccessor) { @@ -472,7 +474,10 @@ class dynamic_parameter : public detail::dynamic_parameter_base { ValueT MValue; }; -/// Additional CTAD deduction guide. +/// Additional CTAD deduction guides. +template +dynamic_parameter(experimental::command_graph Graph, + const ValueT &Param) -> dynamic_parameter; template command_graph(const context &SyclContext, const device &SyclDevice, const property_list &PropList) -> command_graph; diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp index a1827e56de3fa..c2d5af7735c14 100644 --- a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp @@ -31,7 +31,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp index ed317c97a5085..6ea48246ee1fa 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp @@ -32,7 +32,7 @@ int main() { BufA.set_write_back(false); BufB.set_write_back(false); - exp_ext::dynamic_parameter 
InputParam(BufA.get_access(), Graph); + exp_ext::dynamic_parameter InputParam(Graph, BufA.get_access()); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp index 0764327407da1..c3f730d8d1831 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp @@ -32,7 +32,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp index 9f976f8bbb56f..63da9510d264d 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp @@ -36,7 +36,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp index 838c578358999..127543cdcb400 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp @@ -31,7 +31,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * 
sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp index b1443f62a33b3..fd7ec939f90c0 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp @@ -34,7 +34,7 @@ int main() { Queue.memset(PtrB, 0, N * sizeof(int)).wait(); Queue.memset(PtrUnused, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp index b725bd137efa1..42b92d4ecd02b 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp @@ -32,7 +32,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(Graph); + exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNodeA = Graph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp index 7e62d8be9b3f8..4eb411e1cb6e5 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp +++ 
b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp @@ -39,9 +39,9 @@ int main() { Queue.memcpy(PtrB, HostDataB.data(), N * sizeof(int)).wait(); Queue.memset(PtrC, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter ParamA(PtrA, Graph); - exp_ext::dynamic_parameter ParamB(PtrB, Graph); - exp_ext::dynamic_parameter ParamOut(PtrC, Graph); + exp_ext::dynamic_parameter ParamA(Graph, PtrA); + exp_ext::dynamic_parameter ParamB(Graph, PtrB); + exp_ext::dynamic_parameter ParamOut(Graph, PtrC); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input pointers, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp index 6c068445193c4..386d503e4b0b5 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp @@ -34,7 +34,7 @@ int main() { Queue.memset(PtrA, 0, N * sizeof(int)).wait(); Queue.memset(PtrB, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(SubGraph); + exp_ext::dynamic_parameter InputParam(SubGraph, PtrA); auto SubKernelNode = SubGraph.add([&](handler &cgh) { // Register the input pointer, we should be using set_arg but can't diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp index 3ffa2e3b0b9a2..0ee594b005293 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp @@ -30,7 +30,7 @@ int main() { Queue.memset(DeviceData, 0, N * sizeof(int)).wait(); - exp_ext::dynamic_parameter InputParam(ScalarValue, Graph); + exp_ext::dynamic_parameter InputParam(Graph, ScalarValue); auto KernelNode = Graph.add([&](handler &cgh) { // Register the input scalar, we should be using set_arg but can't diff --git 
a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp index 7d1e1b8cfa8f7..72ea40f2a8e97 100644 --- a/sycl/unittests/Extensions/CommandGraph/Update.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -29,7 +29,7 @@ TEST_F(CommandGraphTest, UpdatableException) { TEST_F(CommandGraphTest, DynamicParamRegister) { // Check that registering a dynamic param with a node from a graph that was // not passed to its constructor throws. - experimental::dynamic_parameter DynamicParam(Graph); + experimental::dynamic_parameter DynamicParam(Graph, int{}); auto OtherGraph = experimental::command_graph(Queue.get_context(), Queue.get_device()); @@ -72,7 +72,7 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { int *PtrA = malloc_device(16, Queue); int *PtrB = malloc_device(16, Queue); - experimental::dynamic_parameter DynamicParam{Graph}; + experimental::dynamic_parameter DynamicParam{Graph, int{}}; ASSERT_NO_THROW(auto NodeKernel = Graph.add([&](sycl::handler &cgh) { DynamicParam.register_with_node(cgh, 0); From f5a55189490cb92e1330490aa267376a14779d8f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 7 Mar 2024 18:52:01 +0000 Subject: [PATCH 06/26] Fix update tests not handling unsupported backends correctly --- sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_nd_range.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_range.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp | 2 +- .../Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp | 2 +- sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr_double_update.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr_multiple_params.cpp | 2 +- .../Graph/Explicit/update_with_indices_ptr_subgraph.cpp | 2 +- 
sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp index c2d5af7735c14..8b1d8379aeba2 100644 --- a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node before finalization diff --git a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp index f0a8928c5b633..e8352c391a1ca 100644 --- a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node using index-based explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_range.cpp b/sycl/test-e2e/Graph/Explicit/update_range.cpp index e04adf8cb453e..b0a907526e42d 100644 --- a/sycl/test-e2e/Graph/Explicit/update_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_range.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, 
level_zero // Tests updating a graph node using index-based explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp index 6ea48246ee1fa..1f0ee89bd5536 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node scalar argument using index-based explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp index c3f730d8d1831..fb4f8cc7dab15 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests creating multiple executable graphs from the same modifiable graph and // only updating one of them. 
diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp index 63da9510d264d..3f081ab4633be 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests that updating a graph is ordered with respect to previous executions of // the graph which may be in flight. diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp index 127543cdcb400..843da6a4b4ac6 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node using index-based explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp index fd7ec939f90c0..f933a754c0e2e 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | 
FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node using index-based explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp index 42b92d4ecd02b..d33fa99b97af2 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a single dynamic parameter which is registered with multiple // graph nodes diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp index 4eb411e1cb6e5..eda375af0b733 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating multiple parameters to a singlegraph node using index-based // explicit update diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp index 386d503e4b0b5..0478a80ca938a 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp +++ 
b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node in an executable graph that was used as a // subgraph node in another executable graph is not reflected in the graph diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp index 0ee594b005293..85d2389436851 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp @@ -5,7 +5,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// Unsupported: opencl, level_zero +// UNSUPPORTED: opencl, level_zero // Tests updating a graph node scalar argument using index-based explicit update From 7cb9d3f6986886e543322cb85de17c0cae83242a Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 11 Mar 2024 12:52:16 +0000 Subject: [PATCH 07/26] [SYCL][Graph] Add limited_graph aspect --- sycl/include/sycl/device_aspect_macros.hpp | 10 ++++++++++ sycl/include/sycl/info/aspects.def | 1 + sycl/source/detail/device_impl.cpp | 3 +++ 3 files changed, 14 insertions(+) diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp index 489163c556223..9a7a1ad9e0c22 100644 --- a/sycl/include/sycl/device_aspect_macros.hpp +++ b/sycl/include/sycl/device_aspect_macros.hpp @@ -313,6 +313,11 @@ #define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_is_component__ 0 #endif +#ifndef 
__SYCL_ALL_DEVICES_HAVE_ext_oneapi_limited_graph__ +// __SYCL_ASPECT(ext_oneapi_limited_graph, 61) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_limited_graph__ 0 +#endif + #ifndef __SYCL_ANY_DEVICE_HAS_host__ // __SYCL_ASPECT(host, 0) #define __SYCL_ANY_DEVICE_HAS_host__ 0 @@ -617,3 +622,8 @@ // __SYCL_ASPECT(ext_oneapi_is_component, 60) #define __SYCL_ANY_DEVICE_HAS_ext_oneapi_is_component__ 0 #endif + +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_limited_graph__ +// __SYCL_ASPECT(ext_oneapi_limited_graph, 61) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_limited_graph__ 0 +#endif diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index a4a2296609e56..3bac33b8f0e49 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -55,3 +55,4 @@ __SYCL_ASPECT(ext_oneapi_tangle_group, 57) __SYCL_ASPECT(ext_intel_matrix, 58) __SYCL_ASPECT(ext_oneapi_is_composite, 59) __SYCL_ASPECT(ext_oneapi_is_component, 60) +__SYCL_ASPECT(ext_oneapi_limited_graph, 61) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index a78daf5fe0f28..345083ac71917 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -602,6 +602,9 @@ bool device_impl::has(aspect Aspect) const { return Result != nullptr; } + case aspect::ext_oneapi_limited_graph: { + return getBackend() != backend::ext_oneapi_cuda; + } } throw runtime_error("This device aspect has not been implemented yet.", PI_ERROR_INVALID_DEVICE); From b59ccc35988baef87d7563a371f310de62b45449 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 11 Mar 2024 13:44:29 +0000 Subject: [PATCH 08/26] Update UR tag temporarily - Fixes CUDA build issue with version < 12.0 --- sycl/plugins/unified_runtime/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index ab8cddeff8bb1..273adf6eee5d9 100644 --- 
a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -56,14 +56,14 @@ endif() if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") # commit e2ee9a4720414e0a59fa9c911e9575ab564ac57c # Merge: 7a5150cd be622e7c # Author: Kenneth Benzie (Benie) # Date: Sun Mar 10 18:02:50 2024 +0000 # Merge pull request #1340 from Bensuo/ewan/coverity_cuda_update # [HIP][CUDA][Command-Buffer] Fix Coverity issues in HIP/CUDA command-buffer code - set(UNIFIED_RUNTIME_TAG e2ee9a4720414e0a59fa9c911e9575ab564ac57c) + set(UNIFIED_RUNTIME_TAG 8e9adfa9ce293b1f026948b09f2c9394929dc7a0) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 0fd968bf8d8dc5b7a3e6ded589fe2694f7af177e Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 11 Mar 2024 15:49:43 +0000 Subject: [PATCH 09/26] Fix aspect and tests --- .../llvm/SYCLLowerIR/DeviceConfigFile.td | 1 + sycl/include/sycl/detail/pi.h | 3 +- sycl/plugins/unified_runtime/pi2ur.hpp | 3 ++ sycl/source/detail/device_impl.cpp | 52 +++++++++++++------ .../Graph/Explicit/update_before_finalize.cpp | 4 -- .../Graph/Explicit/update_nd_range.cpp | 4 -- sycl/test-e2e/Graph/Explicit/update_range.cpp | 4 -- .../Explicit/update_with_indices_accessor.cpp | 4 -- ...date_with_indices_multiple_exec_graphs.cpp | 4 -- .../Explicit/update_with_indices_ordering.cpp | 4 -- .../Explicit/update_with_indices_ptr.cpp | 4 -- .../update_with_indices_ptr_double_update.cpp | 4 -- ...update_with_indices_ptr_multiple_nodes.cpp | 4 -- ...pdate_with_indices_ptr_multiple_params.cpp | 4 -- .../update_with_indices_ptr_subgraph.cpp | 4 -- .../Explicit/update_with_indices_scalar.cpp | 4 -- .../Graph/UnsupportedDevice/device_query.cpp | 7 ++- sycl/unittests/helpers/PiMockPlugin.hpp | 1 + 18 files changed, 48 insertions(+), 67 
deletions(-) diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 8b805ff3fe1fb..fb4762c9346cc 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -70,6 +70,7 @@ def AspectExt_intel_matrix : Aspect<"ext_intel_matrix">; def AspectExt_oneapi_is_composite : Aspect<"ext_oneapi_is_composite">; def AspectExt_oneapi_is_component : Aspect<"ext_oneapi_is_component">; def AspectExt_oneapi_graph : Aspect<"ext_oneapi_graph">; +def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">; // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 749a91a8f525d..f688e756d74ef 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -446,7 +446,8 @@ typedef enum { PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE = 0x20112, // Command Buffers - PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT = 0x20113, + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT = 0x20113, + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT = 0x20114, } _pi_device_info; typedef enum { diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 244f7ddcb3253..be261f0925f79 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1278,6 +1278,9 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, UR_DEVICE_INFO_COMPONENT_DEVICES) PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE, UR_DEVICE_INFO_COMPOSITE_DEVICE) + PI_TO_UR_MAP_DEVICE_INFO( + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP) PI_TO_UR_MAP_DEVICE_INFO( PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, 
UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index f8a4b504f08d7..324f0e1fb030e 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -603,32 +603,50 @@ bool device_impl::has(aspect Aspect) const { return Result != nullptr; } case aspect::ext_oneapi_graph: { - size_t ResultSize = 0; - bool CallSuccessful = getPlugin()->call_nocheck( - MDevice, PI_DEVICE_INFO_EXTENSIONS, 0, nullptr, - &ResultSize) == PI_SUCCESS; - if (!CallSuccessful || ResultSize == 0) { + pi_bool SupportsCommandBuffers = false; + bool CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + sizeof(SupportsCommandBuffers), &SupportsCommandBuffers, + nullptr) == PI_SUCCESS; + if (!CallSuccessful) { return PI_FALSE; } - std::unique_ptr Result(new char[ResultSize]); - CallSuccessful = getPlugin()->call_nocheck( - MDevice, PI_DEVICE_INFO_EXTENSIONS, ResultSize, - Result.get(), nullptr) == PI_SUCCESS; - + pi_bool SupportsCommandBufferUpdate = false; + CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + sizeof(SupportsCommandBufferUpdate), &SupportsCommandBufferUpdate, + nullptr) == PI_SUCCESS; if (!CallSuccessful) { return PI_FALSE; } - std::string_view ExtensionsString(Result.get()); - std::cout << ExtensionsString; - const bool Support = - ExtensionsString.find("ur_exp_command_buffer") != std::string::npos; - - return Support; + return SupportsCommandBuffers && SupportsCommandBufferUpdate; } case aspect::ext_oneapi_limited_graph: { - return getBackend() != backend::ext_oneapi_cuda; + pi_bool SupportsCommandBuffers = false; + bool CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + sizeof(SupportsCommandBuffers), &SupportsCommandBuffers, + nullptr) == PI_SUCCESS; + if (!CallSuccessful) { + return PI_FALSE; 
+ } + + pi_bool SupportsCommandBufferUpdate = false; + CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + sizeof(SupportsCommandBufferUpdate), &SupportsCommandBufferUpdate, + nullptr) == PI_SUCCESS; + if (!CallSuccessful) { + return PI_FALSE; + } + + return SupportsCommandBuffers && !SupportsCommandBufferUpdate; } } throw runtime_error("This device aspect has not been implemented yet.", diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp index 8b1d8379aeba2..15506915fe695 100644 --- a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp index e8352c391a1ca..72df8d646dd7d 100644 --- a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_range.cpp b/sycl/test-e2e/Graph/Explicit/update_range.cpp index b0a907526e42d..f1ef78857651b 100644 --- a/sycl/test-e2e/Graph/Explicit/update_range.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_range.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp 
index 1f0ee89bd5536..9d1b61e024a26 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{ diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp index fb4f8cc7dab15..895bb7a1ac3b2 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp @@ -15,10 +15,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp index 3f081ab4633be..141bea74352a8 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp @@ -15,10 +15,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - // Use a large N to try and make the kernel slow const size_t N = 1 << 16; // Loop inside kernel to make even slower (too large N runs out of memory) diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp index 843da6a4b4ac6..989a20b3f7631 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git 
a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp index f933a754c0e2e..52a0c819ef61b 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp index d33fa99b97af2..a10799403c9cb 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp @@ -15,10 +15,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp index eda375af0b733..ba025a5c447ac 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp @@ -15,10 +15,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp index 0478a80ca938a..8ea466c21141e 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp @@ -16,10 +16,6 
@@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp index 85d2389436851..601dce44b41b2 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp +++ b/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp @@ -14,10 +14,6 @@ int main() { queue Queue{}; - if (!are_graphs_supported(Queue)) { - return 0; - } - const size_t N = 1024; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp index 11e98262bf390..9a5b3f5744060 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp @@ -14,12 +14,17 @@ int main() { auto Device = Queue.get_device(); bool SupportsGraphs = Device.has(aspect::ext_oneapi_graph); + bool SupportsLimitedGraphs = Device.has(aspect::ext_oneapi_limited_graph); auto Backend = Device.get_backend(); if ((Backend == backend::ext_oneapi_level_zero) || - (Backend == backend::ext_oneapi_cuda) || (Backend == backend::ext_oneapi_hip)) { + assert(!SupportsGraphs); + assert(SupportsLimitedGraphs); + + } else if (Backend == backend::ext_oneapi_cuda) { assert(SupportsGraphs); + assert(!SupportsLimitedGraphs); } else { assert(!SupportsGraphs); } diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 6c992c95c44ce..a1e1fb9ee60b3 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -208,6 +208,7 @@ inline pi_result mock_piDeviceGetInfo(pi_device device, case PI_DEVICE_INFO_AVAILABLE: case PI_DEVICE_INFO_LINKER_AVAILABLE: case PI_DEVICE_INFO_COMPILER_AVAILABLE: + case 
PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT: case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT: { if (param_value) *static_cast(param_value) = PI_TRUE; From e5a8297c0e192446532bdb8373490c9b8fc9bae8 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 12 Mar 2024 19:09:29 +0000 Subject: [PATCH 10/26] [SYCL][Graph] set_arg for dynamic parameters - set_arg and require overloads for dynamic parameters - Removed register_with_node - Implemented limited graph aspect - Modify tests to support new aspect and APIs - Fix handler not checking associated accessors in finalize --- .../sycl/ext/oneapi/experimental/graph.hpp | 23 ++++---- sycl/include/sycl/handler.hpp | 56 +++++++++++++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 4 +- sycl/source/detail/graph_impl.cpp | 30 +++------- sycl/source/detail/graph_impl.hpp | 53 ++++++++++++------ sycl/source/detail/handler_impl.hpp | 7 ++- sycl/source/handler.cpp | 45 +++++++++++++++ sycl/test-e2e/Graph/Error/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Explicit/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Profiling/lit.local.cfg | 2 +- .../test-e2e/Graph/RecordReplay/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Threading/lit.local.cfg | 2 +- .../exception_unsupported_backend.cpp | 7 ++- sycl/test-e2e/Graph/Update/lit.local.cfg | 1 + .../update_before_finalize.cpp | 7 +-- .../{Explicit => Update}/update_nd_range.cpp | 0 .../{Explicit => Update}/update_range.cpp | 0 .../update_with_indices_accessor.cpp | 14 ++--- ...date_with_indices_multiple_exec_graphs.cpp | 7 +-- .../update_with_indices_ordering.cpp | 7 +-- .../update_with_indices_ptr.cpp | 7 +-- .../update_with_indices_ptr_double_update.cpp | 7 +-- ...update_with_indices_ptr_multiple_nodes.cpp | 14 ++--- ...pdate_with_indices_ptr_multiple_params.cpp | 11 ++-- .../update_with_indices_ptr_subgraph.cpp | 7 +-- .../update_with_indices_scalar.cpp | 7 +-- sycl/test-e2e/Graph/ValidUsage/lit.local.cfg | 2 +- .../Extensions/CommandGraph/Update.cpp | 18 +++--- 28 files changed, 221 
insertions(+), 123 deletions(-) create mode 100644 sycl/test-e2e/Graph/Update/lit.local.cfg rename sycl/test-e2e/Graph/{Explicit => Update}/update_before_finalize.cpp (90%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_nd_range.cpp (100%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_range.cpp (100%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_accessor.cpp (85%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_multiple_exec_graphs.cpp (92%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ordering.cpp (92%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ptr.cpp (91%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ptr_double_update.cpp (93%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ptr_multiple_nodes.cpp (85%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ptr_multiple_params.cpp (90%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_ptr_subgraph.cpp (93%) rename sycl/test-e2e/Graph/{Explicit => Update}/update_with_indices_scalar.cpp (90%) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index a4610f8591f02..fad718a6713dc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -429,15 +429,18 @@ class __SYCL_EXPORT dynamic_parameter_base { public: dynamic_parameter_base( sycl::ext::oneapi::experimental::command_graph - Graph); - - void register_with_node(handler &CGH, int ArgIndex); + Graph, + size_t ParamSize, const void *Data); protected: - void updateValue(void *NewValue, size_t Size); + void updateValue(const void *NewValue, size_t Size); - void updateAccessor(sycl::detail::AccessorBaseHost *Acc); + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); std::shared_ptr impl; + + template + friend decltype(Obj::impl) + 
sycl::detail::getSyclObjImpl(const Obj &SyclObject); }; } // namespace detail @@ -457,21 +460,17 @@ class dynamic_parameter : public detail::dynamic_parameter_base { /// @param Param A reference value for this parameter used for CTAD. dynamic_parameter(experimental::command_graph Graph, const ValueT &Param) - : detail::dynamic_parameter_base(Graph), MValue() {} + : detail::dynamic_parameter_base(Graph, sizeof(ValueT), &Param) {} /// Updates this dynamic parameter and all registered nodes with a new value. /// @param NewValue The new value for the parameter. void update(const ValueT &NewValue) { - MValue = NewValue; if constexpr (IsAccessor) { - detail::dynamic_parameter_base::updateAccessor(&MValue); + detail::dynamic_parameter_base::updateAccessor(&NewValue); } else { - detail::dynamic_parameter_base::updateValue(&MValue, sizeof(ValueT)); + detail::dynamic_parameter_base::updateValue(&NewValue, sizeof(ValueT)); } } - -private: - ValueT MValue; }; /// Additional CTAD deduction guides. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 869002fcd1cef..116652cfd66d9 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -216,6 +216,11 @@ __SYCL_EXPORT device getDeviceFromHandler(handler &); // Checks if a device_global has any registered kernel usage. 
__SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr); +// Extracts a pointer to the value inside a dynamic parameter +__SYCL_EXPORT void *getValueFromDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase); + #if __SYCL_ID_QUERIES_FIT_IN_INT__ template struct NotIntMsg; @@ -708,6 +713,30 @@ class __SYCL_EXPORT handler { sizeof(sampler), ArgIndex); } + // setArgHelper for graph dynamic_parameters + template + void + setArgHelper(int ArgIndex, + ext::oneapi::experimental::dynamic_parameter DynamicParam) { + // Extract and copy arg so we can move it into setArgHelper + T ArgValue = + *static_cast(detail::getValueFromDynamicParameter(DynamicParam)); + // Set the arg in the handler as normal + setArgHelper(ArgIndex, std::move(ArgValue)); + // Register the dynamic parameter with the handler for later association + // with the node being added + registerDynamicParameter(DynamicParam, ArgIndex); + } + + /// Registers a dynamic parameter with the handler for later association with + /// the node being created + /// @param DynamicParamBase + /// @param ArgIndex + void registerDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase, + int ArgIndex); + // TODO: Unusued. Remove when ABI break is allowed. void verifyKernelInvoc(const kernel &Kernel) { std::ignore = Kernel; @@ -1865,6 +1894,26 @@ class __SYCL_EXPORT handler { associateWithHandler(&Acc, AccTarget); } + /// Requires access to the memory object associated with the placeholder + /// accessor contained in a dynamic_parameter object. Calling this function + /// with a non-placeholder accessor has no effect. + /// + /// The command group has a requirement to gain access to the given memory + /// object before executing. + /// + /// \param Acc is a SYCL accessor describing required memory region. 
+ template + void require(ext::oneapi::experimental::dynamic_parameter< + accessor> + DynamicParamAcc) { + using AccT = accessor; + AccT Acc = *static_cast( + detail::getValueFromDynamicParameter(DynamicParamAcc)); + if (Acc.is_placeholder()) + associateWithHandler(&Acc, AccTarget); + } + /// Registers event dependencies on this command group. /// /// \param Event is a valid SYCL event to wait on. @@ -1918,6 +1967,13 @@ class __SYCL_EXPORT handler { setArgHelper(ArgIndex, std::move(Arg)); } + // set_arg for graph dynamic_parameters + template + void set_arg(int ArgIndex, + ext::oneapi::experimental::dynamic_parameter &DynamicParam) { + setArgHelper(ArgIndex, DynamicParam); + } + /// Sets arguments for OpenCL interoperability kernels. /// /// Registers pack of arguments(Args) with indexes starting from 0. diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 273adf6eee5d9..34189d87c2eac 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -56,14 +56,14 @@ endif() if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") # commit e2ee9a4720414e0a59fa9c911e9575ab564ac57c # Merge: 7a5150cd be622e7c # Author: Kenneth Benzie (Benie) # Date: Sun Mar 10 18:02:50 2024 +0000 # Merge pull request #1340 from Bensuo/ewan/coverity_cuda_update # [HIP][CUDA][Command-Buffer] Fix Coverity issues in HIP/CUDA command-buffer code - set(UNIFIED_RUNTIME_TAG 8e9adfa9ce293b1f026948b09f2c9394929dc7a0) + set(UNIFIED_RUNTIME_TAG 1f6cbe61d9a142d88d8edf90c655e3cb9f0b6fb9) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 54bb4676c6b3f..5080863edaaa0 100644 --- 
a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -402,8 +402,8 @@ graph_impl::add(const std::shared_ptr &Impl, "nodes which do not represent kernel executions"); } - for (auto &DynamicParam : DynamicParams) { - DynamicParam->registerNode(NodeImpl); + for (auto &[DynamicParam, ArgIndex] : DynamicParams) { + DynamicParam->registerNode(NodeImpl, ArgIndex); } return NodeImpl; @@ -1613,34 +1613,18 @@ void executable_command_graph::update(const std::vector &Nodes) { impl->update(NodeImpls); } -void dynamic_parameter_impl::registerWithNode(int ArgIndex, - sycl::handler &CGH) { - if (CGH.MGraph != MGraph) { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Dynamic parameters cannot be registered with nodes " - "associated with graphs other than the one used to " - "construct the dynamic parameter object."); - } - - CGH.MImpl->MDynamicParameters.push_back(this); - MIndex = ArgIndex; -} - dynamic_parameter_base::dynamic_parameter_base( - command_graph Graph) + command_graph Graph, size_t ParamSize, + const void *Data) : impl(std::make_shared( - sycl::detail::getSyclObjImpl(Graph))) {} - -void dynamic_parameter_base::register_with_node(handler &CGH, int ArgIndex) { - impl->registerWithNode(ArgIndex, CGH); -} + sycl::detail::getSyclObjImpl(Graph), ParamSize, Data)) {} -void dynamic_parameter_base::updateValue(void *NewValue, size_t Size) { +void dynamic_parameter_base::updateValue(const void *NewValue, size_t Size) { impl->updateValue(NewValue, Size); } void dynamic_parameter_base::updateAccessor( - sycl::detail::AccessorBaseHost *Acc) { + const sycl::detail::AccessorBaseHost *Acc) { impl->updateAccessor(Acc); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index f9447678219e2..66210b90db3b7 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -357,7 +357,7 @@ class node_impl { /// handled specifically compared to other argument values. 
/// @param ArgIndex The index of the accessor arg to be updated /// @param Acc Pointer to the new accessor value - void updateAccessor(int ArgIndex, sycl::detail::AccessorBaseHost *Acc) { + void updateAccessor(int ArgIndex, const sycl::detail::AccessorBaseHost *Acc) { auto &Args = static_cast(MCommandGroup.get())->MArgs; auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); @@ -410,7 +410,7 @@ class node_impl { } } - void updateArgValue(int ArgIndex, void *NewValue, size_t Size) { + void updateArgValue(int ArgIndex, const void *NewValue, size_t Size) { auto &Args = static_cast(MCommandGroup.get())->MArgs; @@ -791,7 +791,8 @@ class graph_impl { MAllowBuffers = true; } - if (!SyclDevice.has(aspect::ext_oneapi_graph)) { + if (!SyclDevice.has(aspect::ext_oneapi_limited_graph) && + !SyclDevice.has(aspect::ext_oneapi_graph)) { std::stringstream Stream; Stream << SyclDevice.get_backend(); std::string BackendString = Stream.str(); @@ -1411,38 +1412,58 @@ class exec_graph_impl { class dynamic_parameter_impl { public: - dynamic_parameter_impl(std::shared_ptr GraphImpl) - : MGraph(GraphImpl) {} - void registerWithNode(int ArgIndex, sycl::handler &CGH); + dynamic_parameter_impl(std::shared_ptr GraphImpl, + size_t ParamSize, const void *Data) + : MGraph(GraphImpl), MValueStorage(ParamSize) { + std::memcpy(MValueStorage.data(), Data, ParamSize); + } - void registerNode(std::shared_ptr NodeImpl) { - MNodes.push_back(NodeImpl); + /// Register a node with this dynamic parameter + /// @param NodeImpl The node to be registered + /// @param ArgIndex The arg index for the kernel arg associated with this + /// dynamic_parameter in NodeImpl + void registerNode(std::shared_ptr NodeImpl, int ArgIndex) { + MNodes.emplace_back(NodeImpl, ArgIndex); } - void updateValue(void *NewValue, size_t Size) { - for (auto &NodeWeak : MNodes) { + /// Get a pointer to the internal value of this dynamic parameter + void *getValue() { return MValueStorage.data(); } + + /// Update the internal value of this 
dynamic parameter as well as the value + /// of this parameter in all registered nodes. + /// @param NewValue Pointer to the new value + /// @param Size Size of the data pointer to by NewValue + void updateValue(const void *NewValue, size_t Size) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { auto NodeShared = NodeWeak.lock(); if (NodeShared) { - NodeShared->updateArgValue(MIndex, NewValue, Size); + NodeShared->updateArgValue(ArgIndex, NewValue, Size); } } + std::memcpy(MValueStorage.data(), NewValue, Size); } - void updateAccessor(sycl::detail::AccessorBaseHost *Acc) { - for (auto &NodeWeak : MNodes) { + /// Update the internal value of this dynamic parameter as well as the value + /// of this parameter in all registered nodes. Should only be called for + /// accessor dynamic_parameters. + /// @param Acc The new accessor value + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { auto NodeShared = NodeWeak.lock(); // Should we fail here if the node isn't alive anymore? if (NodeShared) { - NodeShared->updateAccessor(MIndex, Acc); + NodeShared->updateAccessor(ArgIndex, Acc); } } + std::memcpy(MValueStorage.data(), Acc, + sizeof(sycl::detail::AccessorBaseHost)); } // Weak ptrs to node_impls which will be updated - std::vector> MNodes; + std::vector, int>> MNodes; - int MIndex; std::shared_ptr MGraph; + std::vector MValueStorage; }; } // namespace detail diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index ff4a601447a7f..e268175781989 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -131,10 +131,11 @@ class handler_impl { // created for later query by users. 
sycl::ext::oneapi::experimental::node_type MUserFacingNodeType = sycl::ext::oneapi::experimental::node_type::empty; - + // Storage for any SYCL Graph dynamic parameters which have been flagged for - // registration in the CG - std::vector + // registration in the CG, along with the argument index for the parameter. + std::vector> MDynamicParameters; // Track whether an NDRange was used when submitting a kernel (as opposed to a diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fc51dbdd5a61f..5fb65fc925f65 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -69,6 +69,12 @@ getPiImageCopyFlags(sycl::usm::alloc SrcPtrType, sycl::usm::alloc DstPtrType) { "Unknown copy destination location"); } +void *getValueFromDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase) { + return sycl::detail::getSyclObjImpl(DynamicParamBase)->getValue(); +} + } // namespace detail handler::handler(std::shared_ptr Queue, bool IsHost) @@ -158,6 +164,22 @@ event handler::finalize() { throw sycl::exception(make_error_code(errc::kernel_argument), "placeholder accessor must be bound by calling " "handler::require() before it can be used."); + + // Check associated accessors + bool AccFound = false; + for (detail::ArgDesc &Acc : MAssociatedAccesors) { + if (Acc.MType == detail::kernel_param_kind_t::kind_accessor && + static_cast(Acc.MPtr) == AccImpl) { + AccFound = true; + break; + } + } + + if (!AccFound) { + throw sycl::exception(make_error_code(errc::kernel_argument), + "placeholder accessor must be bound by calling " + "handler::require() before it can be used."); + } } } } @@ -1566,5 +1588,28 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { void handler::setNDRangeUsed(bool Value) { MImpl->MNDRangeUsed = Value; } +void handler::registerDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase, + int ArgIndex) { + if (MQueue && MQueue->getCommandGraph()) { + throw 
sycl::exception( + make_error_code(errc::invalid), + "Dynamic Parameters cannot be used with Graph Queue recording."); + } + if (!MGraph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Dynamic Parameters cannot be used with normal SYCL submissions"); + } + + auto ParamImpl = detail::getSyclObjImpl(DynamicParamBase); + if (ParamImpl->MGraph != this->MGraph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot use a Dynamic Parameter with a node associated with a graph " + "other than the one it was created with."); + } + MImpl->MDynamicParameters.emplace_back(ParamImpl.get(), ArgIndex); +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/Error/lit.local.cfg b/sycl/test-e2e/Graph/Error/lit.local.cfg index 9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/Error/lit.local.cfg +++ b/sycl/test-e2e/Graph/Error/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/Explicit/lit.local.cfg b/sycl/test-e2e/Graph/Explicit/lit.local.cfg index 9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/Explicit/lit.local.cfg +++ b/sycl/test-e2e/Graph/Explicit/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/Profiling/lit.local.cfg b/sycl/test-e2e/Graph/Profiling/lit.local.cfg index 9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/Profiling/lit.local.cfg +++ b/sycl/test-e2e/Graph/Profiling/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg index 
9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg +++ b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/Threading/lit.local.cfg b/sycl/test-e2e/Graph/Threading/lit.local.cfg index 9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/Threading/lit.local.cfg +++ b/sycl/test-e2e/Graph/Threading/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp index b8f62ade21b84..78c65856a0b66 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Tests the ability to finalize a empty command graph // The test checks that invalid exception is thrown // when trying to create a graph with an unsupported backend. @@ -10,14 +9,16 @@ int GetUnsupportedBackend(const sycl::device &Dev) { // Return 1 if the device backend is unsupported or 0 else. 
// 0 does not prevent another device to be picked as a second choice - return !Dev.has(aspect::ext_oneapi_graph); + return !Dev.has(aspect::ext_oneapi_graph) && + !Dev.has(aspect::ext_oneapi_limited_graph); } int main() { sycl::device Dev{GetUnsupportedBackend}; queue Queue{Dev}; - if (Dev.has(aspect::ext_oneapi_graph)) + if (Dev.has(aspect::ext_oneapi_graph) || + Dev.has(aspect::ext_oneapi_limited_graph)) return 0; std::error_code ExceptionCode = make_error_code(sycl::errc::success); diff --git a/sycl/test-e2e/Graph/Update/lit.local.cfg b/sycl/test-e2e/Graph/Update/lit.local.cfg new file mode 100644 index 0000000000000..9c0c4cc846295 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/lit.local.cfg @@ -0,0 +1 @@ +config.required_features += ['aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/update_before_finalize.cpp similarity index 90% rename from sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp rename to sycl/test-e2e/Graph/Update/update_before_finalize.cpp index 15506915fe695..7c0cc07c5a230 100644 --- a/sycl/test-e2e/Graph/Explicit/update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Update/update_before_finalize.cpp @@ -30,10 +30,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] = i; diff --git a/sycl/test-e2e/Graph/Explicit/update_nd_range.cpp b/sycl/test-e2e/Graph/Update/update_nd_range.cpp similarity index 100% rename from sycl/test-e2e/Graph/Explicit/update_nd_range.cpp rename to sycl/test-e2e/Graph/Update/update_nd_range.cpp diff --git a/sycl/test-e2e/Graph/Explicit/update_range.cpp b/sycl/test-e2e/Graph/Update/update_range.cpp similarity index 100% rename from sycl/test-e2e/Graph/Explicit/update_range.cpp rename to sycl/test-e2e/Graph/Update/update_range.cpp diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp similarity index 85% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp index 9d1b61e024a26..07546a2fa6caf 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp @@ -27,15 +27,15 @@ int main() { buffer BufB{HostDataB}; BufA.set_write_back(false); BufB.set_write_back(false); - - exp_ext::dynamic_parameter InputParam(Graph, BufA.get_access()); + // Initial accessor for use in kernel and dynamic parameter + auto Acc = BufA.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, Acc); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, Acc) - auto Acc = BufA.get_access(cgh); - InputParam.register_with_node(cgh, 0); + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { Acc[i] = i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp similarity index 92% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp index 895bb7a1ac3b2..8109bf59af6e8 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp @@ -31,10 +31,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] += i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp similarity index 92% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp index 141bea74352a8..b4bccb00b6b11 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ordering.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp @@ -35,10 +35,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. cgh.single_task([=]() { for (size_t j = 0; j < NumKernelLoops; j++) { for (size_t i = 0; i < N; i++) { diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp similarity index 91% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp index 989a20b3f7631..22d92b17cd819 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp @@ -30,10 +30,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] = i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp similarity index 93% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp index 52a0c819ef61b..4bb4ee7666658 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_double_update.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp @@ -33,10 +33,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] = i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp similarity index 85% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp index a10799403c9cb..9568e943c8f2b 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_nodes.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp @@ -31,10 +31,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, PtrA); auto KernelNodeA = Graph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] = i; @@ -44,10 +43,9 @@ int main() { auto KernelNodeB = Graph.add( [&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] += i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp similarity index 90% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp index ba025a5c447ac..2eb98ae3e601e 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_multiple_params.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp @@ -40,12 +40,11 @@ int main() { exp_ext::dynamic_parameter ParamOut(Graph, PtrC); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input pointers, we should be using set_arg but can't - // currently test that with CUDA e.g. - // cgh.set_arg(0, PtrA) - ParamOut.register_with_node(cgh, 0); - ParamA.register_with_node(cgh, 1); - ParamB.register_with_node(cgh, 2); + cgh.set_arg(0, ParamOut); + cgh.set_arg(1, ParamA); + cgh.set_arg(2, ParamB); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.parallel_for(range<1>{Size}, [=](item<1> Item) { size_t ID = Item.get_id(); PtrC[ID] += PtrA[ID] * PtrB[ID]; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp similarity index 93% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp index 8ea466c21141e..a8a4564f3fd52 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_ptr_subgraph.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp @@ -33,10 +33,9 @@ int main() { exp_ext::dynamic_parameter InputParam(SubGraph, PtrA); auto SubKernelNode = SubGraph.add([&](handler &cgh) { - // Register the input pointer, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(0, PtrA) - InputParam.register_with_node(cgh, 0); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { PtrA[i] += i; diff --git a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp similarity index 90% rename from sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp rename to sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp index 601dce44b41b2..5a00aed70f2a1 100644 --- a/sycl/test-e2e/Graph/Explicit/update_with_indices_scalar.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp @@ -29,10 +29,9 @@ int main() { exp_ext::dynamic_parameter InputParam(Graph, ScalarValue); auto KernelNode = Graph.add([&](handler &cgh) { - // Register the input scalar, we should be using set_arg but can't - // currently test that with CUDA - // cgh.set_arg(1, ScalarValue) - InputParam.register_with_node(cgh, 1); + cgh.set_arg(1, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. cgh.single_task([=]() { for (size_t i = 0; i < N; i++) { DeviceData[i] = ScalarValue; diff --git a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg index 9c0c4cc846295..95f3be32e90c9 100644 --- a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg +++ b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp index 72ea40f2a8e97..92246fb83678d 100644 --- a/sycl/unittests/Extensions/CommandGraph/Update.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -35,7 +35,7 @@ TEST_F(CommandGraphTest, DynamicParamRegister) { experimental::command_graph(Queue.get_context(), Queue.get_device()); auto Node = OtherGraph.add([&](sycl::handler &cgh) { // This should throw since OtherGraph is not associated with 
DynamicParam - EXPECT_ANY_THROW(DynamicParam.register_with_node(cgh, 0)); + EXPECT_ANY_THROW(cgh.set_arg(0, DynamicParam)); cgh.single_task>([]() {}); }); } @@ -75,37 +75,37 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { experimental::dynamic_parameter DynamicParam{Graph, int{}}; ASSERT_NO_THROW(auto NodeKernel = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.single_task>([]() {}); })); ASSERT_ANY_THROW(auto NodeMemcpy = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); })); ASSERT_ANY_THROW(auto NodeMemset = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.memset(PtrB, 7, 16 * sizeof(int)); })); ASSERT_ANY_THROW(auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.fill(PtrB, 7, 16); })); ASSERT_ANY_THROW(auto NodePrefetch = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.prefetch(PtrA, 16 * sizeof(int)); })); ASSERT_ANY_THROW(auto NodeMemadvise = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.mem_advise(PtrA, 16 * sizeof(int), 1); })); ASSERT_ANY_THROW(auto NodeHostTask = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.host_task([]() {}); })); @@ -117,7 +117,7 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { auto SubgraphExec = Subgraph.finalize(); ASSERT_ANY_THROW(auto NodeSubgraph = Graph.add([&](sycl::handler &cgh) { - DynamicParam.register_with_node(cgh, 0); + cgh.set_arg(0, DynamicParam); cgh.ext_oneapi_graph(SubgraphExec); })); } From c51dbbf138cd894af80df466cc3801fad70b362b Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Mar 
2024 10:45:03 +0000 Subject: [PATCH 11/26] Add comments for includes in graph.hpp --- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index fad718a6713dc..b6eee87e3ac7c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -8,13 +8,13 @@ #pragma once -#include +#include // for detail::AccessorBaseHost #include // for context #include // for __SYCL_EXPORT -#include +#include // for kernel_param_kind_t #include // for DataLessPropKind, PropWith... #include // for device -#include +#include // for range, nd_range #include // for is_property, is_property_of #include // for property_list From 2568f4a05964e6e05575a1f54ffbc7f1f3b88914 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Mar 2024 12:05:55 +0000 Subject: [PATCH 12/26] Fix linux symbols and aspect test --- llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td | 2 +- sycl/test/abi/sycl_symbols_linux.dump | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 2ad4ea1de5ed3..d98572b4d7a7f 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -123,7 +123,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_mipmap, AspectExt_oneapi_mipmap_anisotropy, AspectExt_oneapi_mipmap_level_reference, AspectExt_intel_esimd, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, - AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence], + AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, 
AspectExt_oneapi_limited_graph], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index b85663ae8c0ca..6298de067d9d4 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3729,11 +3729,10 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_des _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev _ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_ -_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPvm -_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPNS0_6detail16AccessorBaseHostE -_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base18register_with_nodeERNS0_7handlerEi -_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEE -_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKvm +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPKNS0_6detail16AccessorBaseHostE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph12finalizeImplEv 
_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_13command_graphILNS3_11graph_stateE0EEE _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_4nodeE @@ -4042,6 +4041,7 @@ _ZN4sycl3_V16detail28SampledImageAccessorBaseHost6getPtrEv _ZN4sycl3_V16detail28SampledImageAccessorBaseHostC1ENS0_5rangeILi3EEEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderENS0_13image_samplerERKNS0_13property_listE _ZN4sycl3_V16detail28SampledImageAccessorBaseHostC2ENS0_5rangeILi3EEEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderENS0_13image_samplerERKNS0_13property_listE _ZN4sycl3_V16detail28getPixelCoordNearestFiltModeENS0_3vecIfLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE +_ZN4sycl3_V16detail28getValueFromDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseE _ZN4sycl3_V16detail2pi25contextSetExtendedDeleterERKNS0_7contextEPFvPvES6_ _ZN4sycl3_V16detail2pi3dieEPKc _ZN4sycl3_V16detail2pi9assertionEbPKc @@ -4162,6 +4162,7 @@ _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm +_ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler27computeFallbackKernelBoundsEmm From 17a668784767d41cb8514bb4d9b7e0e2f090c683 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Mar 2024 14:42:43 +0000 Subject: [PATCH 13/26] Update windows symbol dumps --- sycl/test/abi/sycl_symbols_windows.dump | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git 
a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 8d3d08c7056cb..6aae0843cbcd6 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -506,7 +506,7 @@ ??0device_selector@_V1@sycl@@QEAA@XZ ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z -??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@@Z +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z @@ -1020,8 +1020,8 @@ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z -?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4arch_category@experimental@oneapi@ext@23@@Z +?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ ?ext_oneapi_can_access_peer@device@_V1@sycl@@QEAA_NAEBV123@W4peer_access@oneapi@ext@23@@Z @@ -1236,6 +1236,7 @@ ?getType@handler@_V1@sycl@@AEAA?AW4CGTYPE@CG@detail@23@XZ ?getType@image_impl@detail@_V1@sycl@@UEBA?AW4MemObjType@SYCLMemObjI@234@XZ 
?getUserPtr@SYCLMemObjT@detail@_V1@sycl@@QEBAPEAXXZ +?getValueFromDynamicParameter@detail@_V1@sycl@@YAPEAXAEAVdynamic_parameter_base@1experimental@oneapi@ext@23@@Z ?get_addressing_mode@sampler@_V1@sycl@@QEBA?AW4addressing_mode@23@XZ ?get_addressing_mode@sampler_impl@detail@_V1@sycl@@QEBA?AW4addressing_mode@34@XZ ?get_allocator_internal@SYCLMemObjT@detail@_V1@sycl@@QEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ @@ -1474,7 +1475,7 @@ ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z -?register_with_node@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEAVhandler@67@H@Z +?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z ?release@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAVSYCLMemObjI@234@PEAXV?$vector@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@V?$allocator@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@2@@6@AEAPEAU_pi_event@@@Z ?releaseHostMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXPEAX@Z ?releaseMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAX@Z @@ -5140,10 +5141,10 @@ ?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$command_graph@$0A@@34567@@Z ?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z ?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBVnode@34567@@Z 
-?updateAccessor@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEAVAccessorBaseHost@267@@Z +?updateAccessor@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBVAccessorBaseHost@267@@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXQEAX@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXXZ -?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEAX_K@Z +?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBX_K@Z ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z ?usesPinnedHostMemory@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ From 6c405d0acdf1fee668ee64a6f8a7a7dba3b3195c Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Mar 2024 15:13:50 +0000 Subject: [PATCH 14/26] Add test for updating multiple nodes with different arg indices --- ...s_ptr_multiple_nodes_different_indices.cpp | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp new file mode 100644 index 0000000000000..2050a3fffc766 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp @@ -0,0 +1,84 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s 
--implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes where it has a different argument index in each node + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + // Arg index is 1 here + cgh.set_arg(1, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrB[i] = 0; + PtrA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + // Arg index is 0 here + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. 
+ cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} From 16f2eb6c1f3c11d635c8be80c93c13b8b30ff6dc Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Mar 2024 16:39:46 +0000 Subject: [PATCH 15/26] Update hip symbol dumps --- sycl/test/abi/pi_hip_symbol_check.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 4c091716caedb..6028a2624ae75 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -99,6 +99,7 @@ piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease piextCommandBufferRetain +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter From 382f0166f519da86df474f4dd6f875c37f03de2f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 14 Mar 2024 12:50:35 +0000 Subject: [PATCH 16/26] Fix host tasks moving associated accessors - Prevented finalize from correctly checking them to ensure require was called. 
--- sycl/include/sycl/handler.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 116652cfd66d9..9a0b1a5642dbf 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -937,8 +937,9 @@ class __SYCL_EXPORT handler { } else { // In case w/o the integration header it is necessary to process // accessors from the list(which are associated with this handler) as - // arguments. - MArgs = std::move(MAssociatedAccesors); + // arguments. We must copy the associated accessors as they are checked + // later during finalize. + MArgs = MAssociatedAccesors; } // If the kernel lambda is callable with a kernel_handler argument, manifest @@ -1818,7 +1819,9 @@ class __SYCL_EXPORT handler { throwIfActionIsCreated(); MNDRDesc.set(range<1>(1)); - MArgs = std::move(MAssociatedAccesors); + // Need to copy these rather than move so that we can check associated + // accessors during finalize + MArgs = MAssociatedAccesors; MHostTask.reset(new detail::HostTask(std::move(Func))); From c17e6c6da17264bed7eda19e79bc2f4e3caf827c Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 14 Mar 2024 14:19:09 +0000 Subject: [PATCH 17/26] Make graph aspects not mutually exclusive --- sycl/source/detail/device_impl.cpp | 12 +----------- sycl/test-e2e/Graph/Error/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Explicit/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Profiling/lit.local.cfg | 2 +- sycl/test-e2e/Graph/RecordReplay/lit.local.cfg | 2 +- sycl/test-e2e/Graph/Threading/lit.local.cfg | 2 +- .../Graph/UnsupportedDevice/device_query.cpp | 3 ++- sycl/test-e2e/Graph/ValidUsage/lit.local.cfg | 2 +- 8 files changed, 9 insertions(+), 18 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 269a6cfc989cf..f4d9c5518564b 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -636,17 +636,7 @@ bool 
device_impl::has(aspect Aspect) const { return PI_FALSE; } - pi_bool SupportsCommandBufferUpdate = false; - CallSuccessful = - getPlugin()->call_nocheck( - MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, - sizeof(SupportsCommandBufferUpdate), &SupportsCommandBufferUpdate, - nullptr) == PI_SUCCESS; - if (!CallSuccessful) { - return PI_FALSE; - } - - return SupportsCommandBuffers && !SupportsCommandBufferUpdate; + return SupportsCommandBuffers; } case aspect::ext_intel_fpga_task_sequence: { return is_accelerator(); diff --git a/sycl/test-e2e/Graph/Error/lit.local.cfg b/sycl/test-e2e/Graph/Error/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Error/lit.local.cfg +++ b/sycl/test-e2e/Graph/Error/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Explicit/lit.local.cfg b/sycl/test-e2e/Graph/Explicit/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Explicit/lit.local.cfg +++ b/sycl/test-e2e/Graph/Explicit/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Profiling/lit.local.cfg b/sycl/test-e2e/Graph/Profiling/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Profiling/lit.local.cfg +++ b/sycl/test-e2e/Graph/Profiling/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg +++ 
b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Threading/lit.local.cfg b/sycl/test-e2e/Graph/Threading/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Threading/lit.local.cfg +++ b/sycl/test-e2e/Graph/Threading/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp index 9a5b3f5744060..ec0e3bf382846 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp @@ -24,8 +24,9 @@ int main() { } else if (Backend == backend::ext_oneapi_cuda) { assert(SupportsGraphs); - assert(!SupportsLimitedGraphs); + assert(SupportsLimitedGraphs); } else { assert(!SupportsGraphs); + assert(!SupportsLimitedGraphs); } } diff --git a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg index 95f3be32e90c9..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg +++ b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_limited_graph || aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] From 26126384be1bcfac6af22dc968352847b0e40338 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 14 Mar 2024 14:24:29 +0000 Subject: [PATCH 18/26] Simplify graph aspect checks --- sycl/source/detail/device_impl.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index f4d9c5518564b..b185df451d362 100644 --- 
a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -603,18 +603,8 @@ bool device_impl::has(aspect Aspect) const { return Result != nullptr; } case aspect::ext_oneapi_graph: { - pi_bool SupportsCommandBuffers = false; - bool CallSuccessful = - getPlugin()->call_nocheck( - MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, - sizeof(SupportsCommandBuffers), &SupportsCommandBuffers, - nullptr) == PI_SUCCESS; - if (!CallSuccessful) { - return PI_FALSE; - } - pi_bool SupportsCommandBufferUpdate = false; - CallSuccessful = + bool CallSuccessful = getPlugin()->call_nocheck( MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, sizeof(SupportsCommandBufferUpdate), &SupportsCommandBufferUpdate, @@ -623,7 +613,7 @@ bool device_impl::has(aspect Aspect) const { return PI_FALSE; } - return SupportsCommandBuffers && SupportsCommandBufferUpdate; + return has(aspect::ext_oneapi_limited_graph) && SupportsCommandBufferUpdate; } case aspect::ext_oneapi_limited_graph: { pi_bool SupportsCommandBuffers = false; From 57db3f69ee3982222a1359354b00158e9d38c202 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 18 Mar 2024 17:34:43 +0000 Subject: [PATCH 19/26] Addressing MR comments - Minor name/formatting changes - Remove node accessor cache, rebuild requirements manually after update - Add double update accessor test --- sycl/include/sycl/handler.hpp | 10 +-- sycl/source/detail/graph_impl.cpp | 56 ++++++------- sycl/source/detail/graph_impl.hpp | 11 --- .../Graph/UnsupportedDevice/device_query.cpp | 6 +- .../Update/update_with_indices_accessor.cpp | 5 +- ...te_with_indices_accessor_double_update.cpp | 80 +++++++++++++++++++ 6 files changed, 117 insertions(+), 51 deletions(-) create mode 100644 sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 9a0b1a5642dbf..cefbd84b38ac6 100644 --- a/sycl/include/sycl/handler.hpp +++ 
b/sycl/include/sycl/handler.hpp @@ -1909,10 +1909,10 @@ class __SYCL_EXPORT handler { access::target AccTarget, access::placeholder isPlaceholder> void require(ext::oneapi::experimental::dynamic_parameter< accessor> - DynamicParamAcc) { + dynamicParamAcc) { using AccT = accessor; AccT Acc = *static_cast( - detail::getValueFromDynamicParameter(DynamicParamAcc)); + detail::getValueFromDynamicParameter(dynamicParamAcc)); if (Acc.is_placeholder()) associateWithHandler(&Acc, AccTarget); } @@ -1972,9 +1972,9 @@ class __SYCL_EXPORT handler { // set_arg for graph dynamic_parameters template - void set_arg(int ArgIndex, - ext::oneapi::experimental::dynamic_parameter &DynamicParam) { - setArgHelper(ArgIndex, DynamicParam); + void set_arg(int argIndex, + ext::oneapi::experimental::dynamic_parameter &dynamicParam) { + setArgHelper(argIndex, dynamicParam); } /// Sets arguments for OpenCL interoperability kernels. diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5080863edaaa0..7878ac76fd216 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -762,19 +762,11 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, MIsUpdatable(PropList.has_property()) { // If the graph has been marked as updatable then check if the backend - // actually supports that. + // actually supports that. Devices supporting aspect::ext_oneapi_graph must + // have support for graph update. 
if (MIsUpdatable) { - pi_bool SupportsUpdate = PI_FALSE; - bool CallSuccessful = - sycl::detail::getSyclObjImpl(MContext) - ->getPlugin() - ->call_nocheck( - sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()) - ->getHandleRef(), - PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, - sizeof(pi_bool), &SupportsUpdate, nullptr) == PI_SUCCESS; - - if (!CallSuccessful || !SupportsUpdate) { + bool SupportsUpdate = MGraphImpl->getDevice().has(aspect::ext_oneapi_graph); + if (!SupportsUpdate) { throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), "Device does not support Command Graph update"); } @@ -1170,6 +1162,10 @@ void exec_graph_impl::update( // scheduler to ensure that any allocations have taken place before trying to // update. bool NeedScheduledUpdate = false; + std::vector UpdateRequirements; + // At worst we may have as many requirements as there are for the entire graph + // for updating. + UpdateRequirements.reserve(MRequirements.size()); for (auto &Node : Nodes) { // Check if node(s) derived from this modifiable node exists in this graph if (MIDCache.count(Node->getID()) == 0) { @@ -1185,15 +1181,10 @@ void exec_graph_impl::update( continue; } NeedScheduledUpdate = true; - // Update cached requirements for this graph with updated node ones - auto UpdatedReqs = Node->MUpdatedAccessorsCache; - for (auto &CachedReq : MRequirements) { - for (auto &UpdatedReq : UpdatedReqs) { - if (CachedReq == UpdatedReq.first) { - CachedReq = UpdatedReq.second; - } - } - } + + UpdateRequirements.insert(UpdateRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); } // Clean up any execution events which have finished so we don't pass them to @@ -1217,12 +1208,20 @@ void exec_graph_impl::update( sycl::async_handler{}, sycl::property_list{}); // Don't need to care about the return event here because it is synchronous sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( - this, 
Nodes, AllocaQueue, MRequirements, MExecutionEvents); + this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents); } else { for (auto &Node : Nodes) { updateImpl(Node); } } + + // Rebuild cached requirements for this graph with updated nodes + MRequirements.clear(); + for (auto &Node : MNodeStorage) { + MRequirements.insert(MRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + } } void exec_graph_impl::updateImpl(std::shared_ptr Node) { @@ -1278,16 +1277,13 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { MaskedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex); }); - // Remember this information before the range dimensions are reversed - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); - // Reverse kernel dims sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc); size_t RequiredWGSize[3] = {0, 0, 0}; size_t *LocalSize = nullptr; - if (HasLocalSize) + if (NDRDesc.LocalSize[0] != 0) LocalSize = &NDRDesc.LocalSize[0]; else { Plugin->call( @@ -1382,9 +1378,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { // Update ExecNode with new values from Node, in case we ever need to // rebuild the command buffers - (*ExecNode).second->updateFromOtherNode(Node); + ExecNode->second->updateFromOtherNode(Node); - auto Command = MCommandMap[(*ExecNode).second]; + sycl::detail::pi::PiExtCommandBufferCommand Command = + MCommandMap[ExecNode->second]; pi_result Res = Plugin->call_nocheck< sycl::detail::PiApiKind::piextCommandBufferUpdateKernelLaunch>( Command, &UpdateDesc); @@ -1598,7 +1595,6 @@ void executable_command_graph::update( "Method not yet implemented"); } - void executable_command_graph::update(const node &Node) { impl->update(sycl::detail::getSyclObjImpl(Node)); } @@ -1630,7 +1626,6 @@ void dynamic_parameter_base::updateAccessor( } // namespace detail - node_type node::get_type() const { return impl->MNodeType; } std::vector node::get_predecessors() const 
{ @@ -1649,7 +1644,6 @@ node node::get_node_from_event(event nodeEvent) { GraphImpl->getNodeForEvent(EventImpl)); } - template <> void node::update_nd_range<1>(nd_range<1> NDRange) { impl->updateNDRange(NDRange); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 66210b90db3b7..fb5bbf1f0f8a9 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -104,11 +104,6 @@ class node_impl { /// cannot be used to find out the partion of a node outside of this process. int MPartitionNum = -1; - /// Cache of accessors which have been updated on this node - std::vector> - MUpdatedAccessorsCache; - /// Track whether an ND-Range was used for kernel nodes bool MNDRangeUsed = false; @@ -401,10 +396,6 @@ class node_impl { Req = NewReq; } } - // Cache the old and new values so the graph can access it when updating - MUpdatedAccessorsCache.push_back(std::make_pair( - static_cast(Arg.MPtr), - NewAccImpl.get())); Arg.MPtr = NewAccImpl.get(); break; } @@ -494,8 +485,6 @@ class node_impl { auto &NewArgStorage = ExecCG->getArgsStorage(); // Rebuild the arg storage and update the args rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); - - MUpdatedAccessorsCache = Other->MUpdatedAccessorsCache; } id_type getID() const { return MID; } diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp index ec0e3bf382846..b3d40f4d0f89e 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp @@ -17,12 +17,12 @@ int main() { bool SupportsLimitedGraphs = Device.has(aspect::ext_oneapi_limited_graph); auto Backend = Device.get_backend(); - if ((Backend == backend::ext_oneapi_level_zero) || - (Backend == backend::ext_oneapi_hip)) { + if ((Backend == backend::ext_oneapi_level_zero)) { assert(!SupportsGraphs); assert(SupportsLimitedGraphs); - } else if (Backend == 
backend::ext_oneapi_cuda) { + } else if ((Backend == backend::ext_oneapi_cuda) || + (Backend == backend::ext_oneapi_hip)) { assert(SupportsGraphs); assert(SupportsLimitedGraphs); } else { diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp index 07546a2fa6caf..9f203cef7c533 100644 --- a/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp @@ -7,7 +7,8 @@ // // UNSUPPORTED: opencl, level_zero -// Tests updating a graph node scalar argument using index-based explicit update +// Tests updating a graph node accessor argument using index-based explicit +// update #include "../graph_common.hpp" @@ -49,6 +50,7 @@ int main() { Queue.ext_oneapi_graph(ExecGraph).wait(); Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); for (size_t i = 0; i < N; i++) { assert(HostDataA[i] == i); assert(HostDataB[i] == 0); @@ -59,6 +61,7 @@ int main() { ExecGraph.update(KernelNode); Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); Queue.copy(BufB.get_access(), HostDataB.data()).wait(); for (size_t i = 0; i < N; i++) { assert(HostDataA[i] == i); diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp new file mode 100644 index 0000000000000..ba2e7b0869ca7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if 
level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node accessor argument multiple times before the graph +// is updated, using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + std::vector HostDataC(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + buffer BufC{HostDataC}; + BufA.set_write_back(false); + BufB.set_write_back(false); + BufC.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto Acc = BufA.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, Acc); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
+ cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + Acc[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + Queue.copy(BufC.get_access(), HostDataC.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + assert(HostDataC[i] == 0); + } + // Update to BufC first + InputParam.update(BufC.get_access()); + + // Swap BufB to be the input instead + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + Queue.copy(BufC.get_access(), HostDataC.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + assert(HostDataC[i] == 0); + } + return 0; +} From d62c02cc0396e8f1919ac08503aa9a6661a23545 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 18 Mar 2024 17:36:15 +0000 Subject: [PATCH 20/26] Fix comment --- sycl/include/sycl/handler.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index cefbd84b38ac6..b441f2cfe6a6d 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1904,7 +1904,8 @@ class __SYCL_EXPORT handler { /// The command group has a requirement to gain access to the given memory /// object before executing. /// - /// \param Acc is a SYCL accessor describing required memory region. + /// \param dynamicParamAcc is dynamic_parameter containing a SYCL accessor + /// describing required memory region. 
template void require(ext::oneapi::experimental::dynamic_parameter< From 3cd2ddcdd2e841a3312f87a5ddfb03f14417cf1c Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 19 Mar 2024 12:07:22 +0000 Subject: [PATCH 21/26] Add more complex accessor tests --- ...essor_multiple_nodes_different_indices.cpp | 91 +++++++++++++++++ .../update_with_indices_accessor_ordering.cpp | 97 +++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp create mode 100644 sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp new file mode 100644 index 0000000000000..dd6ccf39fd5f5 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp @@ -0,0 +1,91 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes where it has a different argument index in each node + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + std::vector HostDataA(N, 0); + std::vector 
HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, AccA); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.require(AccB); + cgh.require(InputParam); + // Arg index is 4 here + cgh.set_arg(4, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccB[i] = 0; + AccA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.require(InputParam); + // Arg index is 0 here + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // AccA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(AccA, HostDataA.data()).wait(); + Queue.copy(AccB, HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap AccB to be the input + InputParam.update(AccB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(AccA, HostDataA.data()).wait(); + Queue.copy(AccB, HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp new file mode 100644 index 0000000000000..54ca947987b82 --- /dev/null +++ 
b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp @@ -0,0 +1,97 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node accessor argument using index-based explicit +// update while also submitting work using those accessors to a normal queue + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, AccA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. 
+ cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Modify A before the graph executes + Queue.submit([&](handler &cgh) { + cgh.require(AccA); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + // Write a different value than above, this should be overwritten when + // the graph executes. + AccA[i] = i * 3; + } + }); + }); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap BufB to be the input + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + + // Modify B before the graph executes + Queue.submit([&](handler &cgh) { + cgh.require(AccB); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + // Write a different value than above, this should be overwritten when + // the graph executes. 
+ AccB[i] = i * 3; + } + }); + }); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} From 5e10190e0fd4ec92d9bf64c4505a2635d3c901aa Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Mar 2024 16:41:38 +0000 Subject: [PATCH 22/26] Fix test --- .../Graph/RecordReplay/exception_inconsistent_devices.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp index 38a71ca2506f8..281bfb3040ad9 100644 --- a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp @@ -30,7 +30,7 @@ int main() { return 0; } - if (!Dev0.has(aspect::ext_oneapi_graph)) { + if (!Dev0.has(aspect::ext_oneapi_limited_graph)) { std::cout << "Test skipped: device doesn't support graphs" << std::endl; return 0; } From 77165fdc3838d2207a3fe72140456abaf47e564f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Mar 2024 17:44:35 +0000 Subject: [PATCH 23/26] Add missing pi functions, fix leaking of command buffer commands --- sycl/include/sycl/detail/pi.def | 2 ++ sycl/include/sycl/detail/pi.h | 11 +++++++++++ sycl/plugins/cuda/pi_cuda.cpp | 10 ++++++++++ sycl/plugins/hip/pi_hip.cpp | 10 ++++++++++ sycl/plugins/level_zero/pi_level_zero.cpp | 10 ++++++++++ sycl/plugins/native_cpu/pi_native_cpu.cpp | 10 ++++++++++ sycl/plugins/opencl/pi_opencl.cpp | 10 ++++++++++ sycl/plugins/unified_runtime/pi2ur.hpp | 16 ++++++++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 10 ++++++++++ sycl/source/detail/event_impl.hpp | 2 +- sycl/source/detail/graph_impl.cpp | 9 +++++++++ sycl/source/detail/scheduler/commands.cpp | 2 +- 
sycl/unittests/helpers/PiMockPlugin.hpp | 10 ++++++++++ 13 files changed, 110 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 32c142f0da95a..195beb0a24861 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -185,6 +185,8 @@ _PI_API(piextCommandBufferPrefetchUSM) _PI_API(piextCommandBufferAdviseUSM) _PI_API(piextEnqueueCommandBuffer) _PI_API(piextCommandBufferUpdateKernelLaunch) +_PI_API(piextCommandBufferRetainCommand) +_PI_API(piextCommandBufferReleaseCommand) _PI_API(piextUSMPitchedAlloc) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 47f7b1b4a6b80..f3e99c32eb3c9 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2658,6 +2658,17 @@ __SYCL_EXPORT pi_result piextCommandBufferUpdateKernelLaunch( pi_ext_command_buffer_command command, pi_ext_command_buffer_update_kernel_launch_desc *desc); +/// API to increment the reference count of a command-buffer command. +/// \param command The command to retain. +__SYCL_EXPORT pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command command); + +/// API to decrement the reference count of a command-buffer command. After the +/// command reference count becomes zero, the command is deleted. +/// \param command The command to release. +__SYCL_EXPORT pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command command); + /// API to destroy bindless unsampled image handles.
/// /// \param context is the pi_context diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b8a4838362ffb..8bf4eea26620c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1225,6 +1225,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index cf09e703245d3..609750a4892b7 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1228,6 +1228,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 7b21022fd49dc..8e6224ba5794a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1386,6 +1386,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } 
+pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { // missing diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index cd5268ded15a2..35f17a5316bac 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -1194,6 +1194,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 2cdf335ed7b80..9441e29804021 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1164,6 +1164,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) 
{ return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 6316a135ec112..63de0109a0742 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4879,6 +4879,22 @@ inline pi_result piextCommandBufferUpdateKernelLaunch( return PI_SUCCESS; } +inline pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command command) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast<ur_exp_command_buffer_command_handle_t>(command); + HANDLE_ERRORS(urCommandBufferRetainCommandExp(UrCommand)); + return PI_SUCCESS; +} + +inline pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command command) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast<ur_exp_command_buffer_command_handle_t>(command); + HANDLE_ERRORS(urCommandBufferReleaseCommandExp(UrCommand)); + return PI_SUCCESS; +} + // Command-buffer extension /////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 557ba54ae7592..0f42f21d39093 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1167,6 +1167,16 @@ pi_result piextCommandBufferUpdateKernelLaunch( return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); } +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 3e0103b606e67..dd2626e5c8053 100644 ---
a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -376,7 +376,7 @@ class event_impl { // If this event represents a submission to a // sycl::detail::pi::PiExtCommandBuffer the command-buffer command // (if any) associated with that submission is stored here. - sycl::detail::pi::PiExtCommandBufferCommand MCommandBufferCommand; + sycl::detail::pi::PiExtCommandBufferCommand MCommandBufferCommand = nullptr; friend std::vector getOrWaitEvents(std::vector DepEvents, diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index c3eb329a67ef7..62483acd24506 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -796,6 +796,15 @@ exec_graph_impl::~exec_graph_impl() { } } } + + for (auto &Iter : MCommandMap) { + if (auto Command = Iter.second; Command) { + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferReleaseCommand>(Command); + (void)Res; + assert(Res == pi_result::PI_SUCCESS); + } + } } sycl::event diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 63a0731435f54..3da716ee1d8e8 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2725,7 +2725,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { ? 
nullptr : &MEvent->getHandleRef(); sycl::detail::pi::PiExtSyncPoint OutSyncPoint; - sycl::detail::pi::PiExtCommandBufferCommand OutCommand; + sycl::detail::pi::PiExtCommandBufferCommand OutCommand = nullptr; switch (MCommandGroup->getType()) { case CG::CGTYPE::Kernel: { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index a2c0c59f1b9e6..a0f267bd97d50 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1387,6 +1387,16 @@ inline pi_result mock_piextCommandBufferUpdateKernelLaunch( return PI_SUCCESS; } +inline pi_result +mock_piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return PI_SUCCESS; +} + inline pi_result mock_piextCommandBufferMemBufferCopy( pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, From 76fbc04277c96c865d4194af24be958ea38a58d7 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Mar 2024 17:51:31 +0000 Subject: [PATCH 24/26] Update pi symbol dumps --- sycl/test/abi/pi_cuda_symbol_check.dump | 2 ++ sycl/test/abi/pi_hip_symbol_check.dump | 2 ++ sycl/test/abi/pi_level_zero_symbol_check.dump | 2 ++ sycl/test/abi/pi_nativecpu_symbol_check.dump | 2 ++ sycl/test/abi/pi_opencl_symbol_check.dump | 2 ++ 5 files changed, 10 insertions(+) diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index 24d7822738b57..dc8ca30260276 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -97,7 +97,9 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain 
+piextCommandBufferRetainCommand piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 1d1bb90f1f18d..ab85eb32b8ce5 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -98,7 +98,9 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 7ba6042fcce5d..2ebc6b56078a4 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -97,7 +97,9 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 4a21545ae0cb6..6198c8aeb5832 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -98,7 +98,9 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 28b8c2d1c2101..86860b50e57b6 100644 --- 
a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -97,7 +97,9 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle From e4957e3473d7d8b1ce8269c1444671b406869720 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 22 Mar 2024 12:10:45 +0000 Subject: [PATCH 25/26] Add `REQUIRES: cuda_be` back to PI CUDA symbol test --- sycl/test/abi/pi_cuda_symbol_check.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index dc8ca30260276..0a7db9e19498c 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -5,6 +5,7 @@ # RUN: env LLVM_BIN_PATH=%llvm_build_bin_dir %python %sycl_tools_src_dir/abi_check.py --mode check_symbols --reference %s %sycl_libs_dir/libpi_cuda.so # REQUIRES: linux +# REQUIRES: cuda_be # UNSUPPORTED: libcxx piContextCreate From 5e8c703860dcc50191135469cd5460c57bc200c8 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 22 Mar 2024 13:13:29 +0000 Subject: [PATCH 26/26] clang-format updates --- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 12 ++++++------ sycl/plugins/unified_runtime/pi2ur.hpp | 5 ++--- sycl/source/detail/event_impl.hpp | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index b6eee87e3ac7c..5cf6b7d9ee761 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -8,9 +8,9 @@ #pragma once -#include // for detail::AccessorBaseHost -#include // for context -#include // for __SYCL_EXPORT +#include // for 
detail::AccessorBaseHost +#include // for context +#include // for __SYCL_EXPORT #include // for kernel_param_kind_t #include // for DataLessPropKind, PropWith... #include // for device @@ -450,9 +450,9 @@ class dynamic_parameter : public detail::dynamic_parameter_base { std::is_base_of_v; static constexpr sycl::detail::kernel_param_kind_t ParamType = IsAccessor ? sycl::detail::kernel_param_kind_t::kind_accessor - : std::is_pointer_v - ? sycl::detail::kernel_param_kind_t::kind_pointer - : sycl::detail::kernel_param_kind_t::kind_std_layout; + : std::is_pointer_v + ? sycl::detail::kernel_param_kind_t::kind_pointer + : sycl::detail::kernel_param_kind_t::kind_std_layout; public: /// Constructs a new dynamic parameter. diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 63de0109a0742..f396441f0de5d 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1279,9 +1279,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, UR_DEVICE_INFO_COMPONENT_DEVICES) PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE, UR_DEVICE_INFO_COMPOSITE_DEVICE) - PI_TO_UR_MAP_DEVICE_INFO( - PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, - UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP) + PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP) PI_TO_UR_MAP_DEVICE_INFO( PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index dd2626e5c8053..2721832266218 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -310,7 +310,7 @@ class event_impl { sycl::detail::pi::PiExtCommandBufferCommand getCommandBufferCommand() const { return MCommandBufferCommand; } - + const std::vector &getPostCompleteEvents() const { return 
MPostCompleteEvents; }