diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 57dba90d1f31d..d98572b4d7a7f 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -71,6 +71,7 @@ def AspectExt_oneapi_is_composite : Aspect<"ext_oneapi_is_composite">; def AspectExt_oneapi_is_component : Aspect<"ext_oneapi_is_component">; def AspectExt_oneapi_graph : Aspect<"ext_oneapi_graph">; def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">; +def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">; // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -122,7 +123,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_mipmap, AspectExt_oneapi_mipmap_anisotropy, AspectExt_oneapi_mipmap_level_reference, AspectExt_intel_esimd, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, - AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence], + AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 2fc1e75749364..01ae7b7463749 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -46,6 +46,7 @@ with the following entry-points: | `urCommandBufferAppendMemBufferReadRectExp` | Append a rectangular memory read command to a command-buffer object. | | `urCommandBufferAppendMemBufferFillExp` | Append a memory fill command to a command-buffer object. | | `urCommandBufferEnqueueExp` | Submit command-buffer to a command-queue for execution. | +| `urCommandBufferUpdateKernelLaunchExp` | Updates the parameters of a previous kernel launch command. | See the [UR EXP-COMMAND-BUFFER](https://oneapi-src.github.io/unified-runtime/core/EXP-COMMAND-BUFFER.html) specification for more details. @@ -230,6 +231,47 @@ on buffer usage in a graph so that their lifetime semantics are compatible with a lazy work execution model. However these changes to storage lifetimes have not yet been implemented. +## Graph Update + +### Design Challenges + +Graph update faces significant design challenges in SYCL: + +* Lambda capture order is explicitly undefined in C++, so the user cannot reason + about the indices of arguments captured by kernel lambdas. +* Once arguments have been captured the actual type information is lost in the + transition through the integration header and extracting arguments in the SYCL + runtime, therefore we cannot automatically match new argument values by + querying the captured arguments without significant possibility for + collisions. For example, if a kernel captures two USM pointers and the user + wishes to update one, we cannot reason about which pointer they actually want + to update when we only know that: they are pointer args of a certain size. + +The current approach is to limit graph update to the explicit APIs and where the +user is using `handler::set_arg()` or some equivalent to manually set kernel +arguments using indices. Therefore when updating we can use indices to avoid +collisions. In practice there are only a few current scenarios where `set_arg()` +can be used: + +* The proposed ["Free Function Kernel" + extension](../extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc) +* OpenCL interop kernels created from SPIR-V source at runtime. + +A possible future workaround lambda capture issues could be "Whole-Graph Update" +where if we can guarantee that lambda capture order is the same across two +different recordings we can then match parameter order when updating. + +### Scheduler Integration + +Graph updates in the runtime are synchronous calls however they can optionally +be done through the scheduler using a new command, +`sycl::detail::UpdateCommandBufferCommand`. This is needed when dealing with +accessor updates. Since a new buffer which the user creates for updating may not +yet have been lazily initialized on device we schedule a new command which has +requirements for these new accessors to correctly trigger allocations before +updating. This is similar to how individual graph commands are enqueued when +accessors are used in a graph node. + ## Backend Implementation Implementation of UR command-buffers diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 46a200e001231..195beb0a24861 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -184,6 +184,9 @@ _PI_API(piextCommandBufferFillUSM) _PI_API(piextCommandBufferPrefetchUSM) _PI_API(piextCommandBufferAdviseUSM) _PI_API(piextEnqueueCommandBuffer) +_PI_API(piextCommandBufferUpdateKernelLaunch) +_PI_API(piextCommandBufferRetainCommand) +_PI_API(piextCommandBufferReleaseCommand) _PI_API(piextUSMPitchedAlloc) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 3c9076e09f66b..f3e99c32eb3c9 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -156,9 +156,10 @@ // piextEnqueueCooperativeKernelLaunch. // 15.46 Add piextGetGlobalVariablePointer // 15.47 Added PI_ERROR_FEATURE_UNSUPPORTED. +// 15.48 Add CommandBuffer update definitions #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 47 +#define _PI_H_VERSION_MINOR 48 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -445,6 +446,10 @@ typedef enum { // Composite device PI_EXT_ONEAPI_DEVICE_INFO_COMPONENT_DEVICES = 0x20111, PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE = 0x20112, + + // Command Buffers + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT = 0x20113, + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT = 0x20114, } _pi_device_info; typedef enum { @@ -2320,7 +2325,10 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, /// Command buffer extension struct _pi_ext_command_buffer; struct _pi_ext_sync_point; +struct _pi_ext_command_buffer_command; + using pi_ext_command_buffer = _pi_ext_command_buffer *; +using pi_ext_command_buffer_command = _pi_ext_command_buffer_command *; using pi_ext_sync_point = pi_uint32; typedef enum { @@ -2330,7 +2338,40 @@ typedef enum { struct pi_ext_command_buffer_desc final { pi_ext_structure_type stype; const void *pNext; - pi_queue_properties *properties; + pi_bool is_updatable; +}; + +// Command Buffer Update types +struct pi_ext_command_buffer_update_memobj_arg_desc_t final { + uint32_t arg_index; + const pi_mem_obj_property *properties; + pi_mem new_mem_obj; +}; + +struct pi_ext_command_buffer_update_pointer_arg_desc_t final { + uint32_t arg_index; + void *new_ptr; +}; + +struct pi_ext_command_buffer_update_value_arg_desc_t final { + uint32_t arg_index; + uint32_t arg_size; + void *new_value; +}; + +struct pi_ext_command_buffer_update_kernel_launch_desc final { + uint32_t num_mem_obj_args; + uint32_t num_ptr_args; + uint32_t num_value_args; + uint32_t num_work_dim; + + pi_ext_command_buffer_update_memobj_arg_desc_t *mem_obj_arg_list; + pi_ext_command_buffer_update_pointer_arg_desc_t *ptr_arg_list; + pi_ext_command_buffer_update_value_arg_desc_t *value_arg_list; + + size_t *global_work_offset; + size_t *global_work_size; + size_t *local_work_size; }; /// API to create a command-buffer. @@ -2374,12 +2415,14 @@ piextCommandBufferFinalize(pi_ext_command_buffer command_buffer); /// \param sync_point_wait_list A list of sync points that this command must /// wait on. /// \param sync_point The sync_point associated with this kernel execution. +/// \param command Return pointer to the command representing this kernel +/// execution. __SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, - pi_ext_sync_point *sync_point); + pi_ext_sync_point *sync_point, pi_ext_command_buffer_command *command); /// API to append a USM memcpy command to the command-buffer. /// \param command_buffer The command-buffer to append onto. @@ -2607,6 +2650,25 @@ piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +/// API to update a kernel launch command inside of a command-buffer. +/// @param command The command to be updated. +/// @param desc Descriptor which describes the updated parameters of the kernel +/// launch. +__SYCL_EXPORT pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command command, + pi_ext_command_buffer_update_kernel_launch_desc *desc); + +/// API to increment the reference count of a command-buffer command. +/// \param command The command to release. +__SYCL_EXPORT pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command command); + +/// API to decrement the reference count of a command-buffer command. After the +/// command reference count becomes zero, the command is deleted. +/// \param command The command to release. +__SYCL_EXPORT pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command command); + /// API to destroy bindless unsampled image handles. /// /// \param context is the pi_context diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 84f9272d22bf5..6ac57370102ce 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -156,6 +156,7 @@ using PiKernelCacheConfig = ::pi_kernel_cache_config; using PiExtSyncPoint = ::pi_ext_sync_point; using PiExtCommandBuffer = ::pi_ext_command_buffer; using PiExtCommandBufferDesc = ::pi_ext_command_buffer_desc; +using PiExtCommandBufferCommand = ::pi_ext_command_buffer_command; using PiPeerAttr = ::pi_peer_attr; using PiImageHandle = ::pi_image_handle; using PiImageMemHandle = ::pi_image_mem_handle; diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 3009af8ee2890..f438b5098065e 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -47,8 +47,9 @@ enum DataLessPropKind { GraphAssumeDataOutlivesBuffer = 22, GraphAssumeBufferOutlivesGraph = 23, GraphDependOnAllLeaves = 24, + GraphUpdatable = 25, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 24, + LastKnownDataLessPropKind = 25, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp index edd10c23b6099..93eb8947ee5b5 100644 --- a/sycl/include/sycl/device_aspect_macros.hpp +++ b/sycl/include/sycl/device_aspect_macros.hpp @@ -323,6 +323,11 @@ #define __SYCL_ALL_DEVICES_HAVE_ext_intel_fpga_task_sequence__ 0 #endif +#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_limited_graph__ +// __SYCL_ASPECT(ext_oneapi_limited_graph, 63) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_limited_graph__ 0 +#endif + #ifndef __SYCL_ANY_DEVICE_HAS_host__ // __SYCL_ASPECT(host, 0) #define __SYCL_ANY_DEVICE_HAS_host__ 0 @@ -637,3 +642,8 @@ // __SYCL_ASPECT(ext_intel_fpga_task_sequence__, 62) #define __SYCL_ANY_DEVICE_HAS_ext_intel_fpga_task_sequence__ 0 #endif + +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_limited_graph__ +// __SYCL_ASPECT(ext_oneapi_limited_graph, 63) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_limited_graph__ 0 +#endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 209a0ed25f72f..5cf6b7d9ee761 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -8,10 +8,13 @@ #pragma once +#include // for detail::AccessorBaseHost #include // for context #include // for __SYCL_EXPORT +#include // for kernel_param_kind_t #include // for DataLessPropKind, PropWith... #include // for device +#include // for range, nd_range #include // for is_property, is_property_of #include // for property_list @@ -30,6 +33,15 @@ namespace ext { namespace oneapi { namespace experimental { +/// State to template the command_graph class on. +enum class graph_state { + modifiable, ///< In modifiable state, commands can be added to graph. + executable, ///< In executable state, the graph is ready to execute. +}; + +// Forward declare Graph class +template class command_graph; + namespace detail { // List of sycl features and extensions which are not supported by graphs. Used // for throwing errors when these features are used with graphs. @@ -73,15 +85,9 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { class node_impl; class graph_impl; class exec_graph_impl; - +class dynamic_parameter_impl; } // namespace detail -/// State to template the command_graph class on. -enum class graph_state { - modifiable, ///< In modifiable state, commands can be added to graph. - executable, ///< In executable state, the graph is ready to execute. -}; - enum class node_type { empty = 0, subgraph = 1, @@ -113,6 +119,13 @@ class __SYCL_EXPORT node { /// submission. static node get_node_from_event(event nodeEvent); + /// Update the ND-Range of this node if it is a kernel execution node + template + void update_nd_range(nd_range executionRange); + + /// Update the Range of this node if it is a kernel execution node + template void update_range(range executionRange); + private: node(const std::shared_ptr &Impl) : impl(Impl) {} @@ -146,6 +159,14 @@ class assume_buffer_outlives_graph public: assume_buffer_outlives_graph() = default; }; + +/// Property passed to command_graph::finalize() to +/// mark the resulting executable command_graph as able to be updated. +class updatable + : public ::sycl::detail::DataLessProperty<::sycl::detail::GraphUpdatable> { +public: + updatable() = default; +}; } // namespace graph namespace node { @@ -336,12 +357,24 @@ class __SYCL_EXPORT executable_command_graph { /// @param Graph Graph to use the inputs and outputs of. void update(const command_graph &Graph); + /// Updates a single node in this graph based on the contents of the provided + /// node. + /// @param Node The node to use for updating the graph. + void update(const node &Node); + + /// Updates a number of nodes in this graph based on the contents of the + /// provided nodes. + /// @param Nodes The nodes to use for updating the graph. + void update(const std::vector &Nodes); + protected: /// Constructor used by internal runtime. /// @param Graph Detail implementation class to construct with. /// @param Ctx Context to use for graph. + /// @param PropList Optional list of properties to pass. executable_command_graph(const std::shared_ptr &Graph, - const sycl::context &Ctx); + const sycl::context &Ctx, + const property_list &PropList = {}); template friend decltype(Obj::impl) @@ -385,14 +418,65 @@ class command_graph : public detail::modifiable_command_graph { template <> class command_graph : public detail::executable_command_graph { - protected: friend command_graph detail::modifiable_command_graph::finalize(const sycl::property_list &) const; using detail::executable_command_graph::executable_command_graph; }; -/// Additional CTAD deduction guide. +namespace detail { +class __SYCL_EXPORT dynamic_parameter_base { +public: + dynamic_parameter_base( + sycl::ext::oneapi::experimental::command_graph + Graph, + size_t ParamSize, const void *Data); + +protected: + void updateValue(const void *NewValue, size_t Size); + + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + std::shared_ptr impl; + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); +}; +} // namespace detail + +template +class dynamic_parameter : public detail::dynamic_parameter_base { + static constexpr bool IsAccessor = + std::is_base_of_v; + static constexpr sycl::detail::kernel_param_kind_t ParamType = + IsAccessor ? sycl::detail::kernel_param_kind_t::kind_accessor + : std::is_pointer_v + ? sycl::detail::kernel_param_kind_t::kind_pointer + : sycl::detail::kernel_param_kind_t::kind_std_layout; + +public: + /// Constructs a new dynamic parameter. + /// @param Graph The graph associated with this parameter. + /// @param Param A reference value for this parameter used for CTAD. + dynamic_parameter(experimental::command_graph Graph, + const ValueT &Param) + : detail::dynamic_parameter_base(Graph, sizeof(ValueT), &Param) {} + + /// Updates this dynamic parameter and all registered nodes with a new value. + /// @param NewValue The new value for the parameter. + void update(const ValueT &NewValue) { + if constexpr (IsAccessor) { + detail::dynamic_parameter_base::updateAccessor(&NewValue); + } else { + detail::dynamic_parameter_base::updateValue(&NewValue, sizeof(ValueT)); + } + } +}; + +/// Additional CTAD deduction guides. +template +dynamic_parameter(experimental::command_graph Graph, + const ValueT &Param) -> dynamic_parameter; template command_graph(const context &SyclContext, const device &SyclDevice, const property_list &PropList) -> command_graph; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 672f8da32c91c..fa6ee8f1edc49 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -216,6 +216,11 @@ __SYCL_EXPORT device getDeviceFromHandler(handler &); // Checks if a device_global has any registered kernel usage. __SYCL_EXPORT bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr); +// Extracts a pointer to the value inside a dynamic parameter +__SYCL_EXPORT void *getValueFromDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase); + #if __SYCL_ID_QUERIES_FIT_IN_INT__ template struct NotIntMsg; @@ -708,6 +713,30 @@ class __SYCL_EXPORT handler { sizeof(sampler), ArgIndex); } + // setArgHelper for graph dynamic_parameters + template + void + setArgHelper(int ArgIndex, + ext::oneapi::experimental::dynamic_parameter DynamicParam) { + // Extract and copy arg so we can move it into setArgHelper + T ArgValue = + *static_cast(detail::getValueFromDynamicParameter(DynamicParam)); + // Set the arg in the handler as normal + setArgHelper(ArgIndex, std::move(ArgValue)); + // Register the dynamic parameter with the handler for later association + // with the node being added + registerDynamicParameter(DynamicParam, ArgIndex); + } + + /// Registers a dynamic parameter with the handler for later association with + /// the node being created + /// @param DynamicParamBase + /// @param ArgIndex + void registerDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase, + int ArgIndex); + // TODO: Unusued. Remove when ABI break is allowed. void verifyKernelInvoc(const kernel &Kernel) { std::ignore = Kernel; @@ -908,8 +937,9 @@ class __SYCL_EXPORT handler { } else { // In case w/o the integration header it is necessary to process // accessors from the list(which are associated with this handler) as - // arguments. - MArgs = std::move(MAssociatedAccesors); + // arguments. We must copy the associated accessors as they are checked + // later during finalize. + MArgs = MAssociatedAccesors; } // If the kernel lambda is callable with a kernel_handler argument, manifest @@ -1326,6 +1356,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(Wrapper)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif } else #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && @@ -1346,6 +1377,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif #else (void)KernelFunc; @@ -1403,6 +1435,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(true); #endif } @@ -1420,6 +1453,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); setType(detail::CG::Kernel); + setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -1458,6 +1492,7 @@ class __SYCL_EXPORT handler { MNDRDesc.setNumWorkGroups(NumWorkGroups); StoreLambda(std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif // __SYCL_DEVICE_ONLY__ } @@ -1794,7 +1829,9 @@ class __SYCL_EXPORT handler { throwIfActionIsCreated(); MNDRDesc.set(range<1>(1)); - MArgs = std::move(MAssociatedAccesors); + // Need to copy these rather than move so that we can check associated + // accessors during finalize + MArgs = MAssociatedAccesors; MHostTask.reset(new detail::HostTask(std::move(Func))); @@ -1870,6 +1907,27 @@ class __SYCL_EXPORT handler { associateWithHandler(&Acc, AccTarget); } + /// Requires access to the memory object associated with the placeholder + /// accessor contained in a dynamic_parameter object. Calling this function + /// with a non-placeholder accessor has no effect. + /// + /// The command group has a requirement to gain access to the given memory + /// object before executing. + /// + /// \param dynamicParamAcc is dynamic_parameter containing a SYCL accessor + /// describing required memory region. + template + void require(ext::oneapi::experimental::dynamic_parameter< + accessor> + dynamicParamAcc) { + using AccT = accessor; + AccT Acc = *static_cast( + detail::getValueFromDynamicParameter(dynamicParamAcc)); + if (Acc.is_placeholder()) + associateWithHandler(&Acc, AccTarget); + } + /// Registers event dependencies on this command group. /// /// \param Event is a valid SYCL event to wait on. @@ -1923,6 +1981,13 @@ class __SYCL_EXPORT handler { setArgHelper(ArgIndex, std::move(Arg)); } + // set_arg for graph dynamic_parameters + template + void set_arg(int argIndex, + ext::oneapi::experimental::dynamic_parameter &dynamicParam) { + setArgHelper(argIndex, dynamicParam); + } + /// Sets arguments for OpenCL interoperability kernels. /// /// Registers pack of arguments(Args) with indexes starting from 0. @@ -2011,6 +2076,7 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); + setNDRangeUsed(false); #endif } @@ -2103,6 +2169,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); setType(detail::CG::Kernel); + setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -2121,6 +2188,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); setType(detail::CG::Kernel); + setNDRangeUsed(true); extractArgsAndReqs(); MKernelName = getKernelName(); } @@ -2183,6 +2251,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(false); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2222,6 +2291,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(false); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2260,6 +2330,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); + setNDRangeUsed(true); if (!MIsHost && !lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -3464,6 +3535,7 @@ class __SYCL_EXPORT handler { size_t Size, bool Block = false); #endif friend class ext::oneapi::experimental::detail::graph_impl; + friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; bool DisableRangeRounding(); @@ -3687,6 +3759,9 @@ class __SYCL_EXPORT handler { "for use with the SYCL Graph extension."); } } + + // Set that an ND Range was used during a call to parallel_for + void setNDRangeUsed(bool Value); }; } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 4324015822c17..bf8b3c020d4ad 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -57,3 +57,4 @@ __SYCL_ASPECT(ext_oneapi_is_composite, 59) __SYCL_ASPECT(ext_oneapi_is_component, 60) __SYCL_ASPECT(ext_oneapi_graph, 61) __SYCL_ASPECT(ext_intel_fpga_task_sequence, 62) +__SYCL_ASPECT(ext_oneapi_limited_graph, 63) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e6d395e758568..8bf4eea26620c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1088,10 +1088,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1217,6 +1219,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 2fbde10b77123..609750a4892b7 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1091,10 +1091,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1220,6 +1222,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 26a1b104b3335..8e6224ba5794a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1249,10 +1249,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1378,6 +1380,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { // missing diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 5174456e95a77..35f17a5316bac 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -1096,10 +1096,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1186,6 +1188,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index e8a168b60445e..9441e29804021 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1027,10 +1027,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1156,6 +1158,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + pi_result piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return); diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 14b7b4723c0dc..f396441f0de5d 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1279,6 +1279,11 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, UR_DEVICE_INFO_COMPONENT_DEVICES) PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMPOSITE_DEVICE, UR_DEVICE_INFO_COMPOSITE_DEVICE) + PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP) + PI_TO_UR_MAP_DEVICE_INFO( + PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) #undef PI_TO_UR_MAP_DEVICE_INFO default: return PI_ERROR_UNKNOWN; @@ -4479,13 +4484,14 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_device_handle_t UrDevice = reinterpret_cast(Device); - const ur_exp_command_buffer_desc_t *UrDesc = - reinterpret_cast(Desc); + ur_exp_command_buffer_desc_t UrDesc; + UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + UrDesc.isUpdatable = Desc->is_updatable; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer); HANDLE_ERRORS( - urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer)); + urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer)); return PI_SUCCESS; } @@ -4523,16 +4529,18 @@ inline pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { ur_exp_command_buffer_handle_t UrCommandBuffer = reinterpret_cast(CommandBuffer); ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); - + ur_exp_command_buffer_command_handle_t *UrCommandHandle = + reinterpret_cast(Command); HANDLE_ERRORS(urCommandBufferAppendKernelLaunchExp( UrCommandBuffer, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, - nullptr)); + UrCommandHandle)); return PI_SUCCESS; } @@ -4809,6 +4817,83 @@ inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, return PI_SUCCESS; } +inline pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command command, + pi_ext_command_buffer_update_kernel_launch_desc *desc) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast(command); + ur_exp_command_buffer_update_kernel_launch_desc_t UrDesc; + + UrDesc.stype = ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_EXEC_INFO_DESC; + UrDesc.numNewMemObjArgs = desc->num_mem_obj_args; + UrDesc.numNewPointerArgs = desc->num_ptr_args; + UrDesc.numNewValueArgs = desc->num_value_args; + UrDesc.newWorkDim = desc->num_work_dim; + + // Exec info updates are unused and will be removed from UR in future + UrDesc.numNewExecInfos = 0; + UrDesc.pNewExecInfoList = nullptr; + + // Convert arg descs + std::vector UrMemObjDescs; + std::vector UrPointerDescs; + std::vector UrValueDescs; + + for (size_t i = 0; i < UrDesc.numNewMemObjArgs; i++) { + auto &PiDesc = desc->mem_obj_arg_list[i]; + UrMemObjDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC, + nullptr, PiDesc.arg_index, nullptr, + reinterpret_cast(PiDesc.new_mem_obj)}); + } + UrDesc.pNewMemObjArgList = UrMemObjDescs.data(); + + for (size_t i = 0; i < UrDesc.numNewPointerArgs; i++) { + auto &PiDesc = desc->ptr_arg_list[i]; + UrPointerDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, + nullptr, PiDesc.arg_index, nullptr, PiDesc.new_ptr}); + } + UrDesc.pNewPointerArgList = UrPointerDescs.data(); + + for (size_t i = 0; i < UrDesc.numNewValueArgs; i++) { + auto &PiDesc = desc->value_arg_list[i]; + UrValueDescs.push_back( + {ur_structure_type_t:: + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, + nullptr, PiDesc.arg_index, PiDesc.arg_size, nullptr, + PiDesc.new_value}); + } + UrDesc.pNewValueArgList = UrValueDescs.data(); + + UrDesc.pNewGlobalWorkSize = desc->global_work_size; + UrDesc.pNewGlobalWorkOffset = desc->global_work_offset; + UrDesc.pNewLocalWorkSize = desc->local_work_size; + + HANDLE_ERRORS(urCommandBufferUpdateKernelLaunchExp(UrCommand, &UrDesc)); + + return PI_SUCCESS; +} + +inline pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command command) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast(command); + HANDLE_ERRORS(urCommandBufferRetainCommandExp(UrCommand)); + return PI_SUCCESS; +} + +inline pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command command) { + ur_exp_command_buffer_command_handle_t UrCommand = + reinterpret_cast(command); + HANDLE_ERRORS(urCommandBufferReleaseCommandExp(UrCommand)); + return PI_SUCCESS; +} + // Command-buffer extension /////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 8701d23027682..0f42f21d39093 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1030,10 +1030,12 @@ pi_result piextCommandBufferNDRangeKernel( pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, - const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint, + pi_ext_command_buffer_command *Command) { return pi2ur::piextCommandBufferNDRangeKernel( CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, - LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint, + Command); } pi_result piextCommandBufferMemcpyUSM( @@ -1159,6 +1161,22 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); } +pi_result piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return pi2ur::piextCommandBufferUpdateKernelLaunch(Command, Desc); +} + +pi_result +piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferRetainCommand(Command); +} + +pi_result +piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return pi2ur::piextCommandBufferReleaseCommand(Command); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 7028b16d0d338..b185df451d362 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -603,28 +603,30 @@ bool device_impl::has(aspect Aspect) const { return Result != nullptr; } case aspect::ext_oneapi_graph: { - size_t ResultSize = 0; - bool CallSuccessful = getPlugin()->call_nocheck( - MDevice, PI_DEVICE_INFO_EXTENSIONS, 0, nullptr, - &ResultSize) == PI_SUCCESS; - if (!CallSuccessful || ResultSize == 0) { + pi_bool SupportsCommandBufferUpdate = false; + bool CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, + sizeof(SupportsCommandBufferUpdate), &SupportsCommandBufferUpdate, + nullptr) == PI_SUCCESS; + if (!CallSuccessful) { return PI_FALSE; } - std::unique_ptr Result(new char[ResultSize]); - CallSuccessful = getPlugin()->call_nocheck( - MDevice, PI_DEVICE_INFO_EXTENSIONS, ResultSize, - Result.get(), nullptr) == PI_SUCCESS; - + return has(aspect::ext_oneapi_limited_graph) && SupportsCommandBufferUpdate; + } + case aspect::ext_oneapi_limited_graph: { + pi_bool SupportsCommandBuffers = false; + bool CallSuccessful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, + sizeof(SupportsCommandBuffers), &SupportsCommandBuffers, + nullptr) == PI_SUCCESS; if (!CallSuccessful) { return PI_FALSE; } - std::string_view ExtensionsString(Result.get()); - const bool Support = - ExtensionsString.find("ur_exp_command_buffer") != std::string::npos; - - return Support; + return SupportsCommandBuffers; } case aspect::ext_intel_fpga_task_sequence: { return is_accelerator(); diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 4a7467691127c..2721832266218 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -274,7 +274,7 @@ class event_impl { } // Sets a sync point which is used when this event represents an enqueue to a - // Command Bufferr. + // Command Buffer. void setSyncPoint(sycl::detail::pi::PiExtSyncPoint SyncPoint) { MSyncPoint = SyncPoint; } @@ -300,6 +300,17 @@ class event_impl { return MEventFromSubmittedExecCommandBuffer; } + // Sets a command-buffer command when this event represents an enqueue to a + // Command Buffer. + void + setCommandBufferCommand(sycl::detail::pi::PiExtCommandBufferCommand Command) { + MCommandBufferCommand = Command; + } + + sycl::detail::pi::PiExtCommandBufferCommand getCommandBufferCommand() const { + return MCommandBufferCommand; + } + const std::vector &getPostCompleteEvents() const { return MPostCompleteEvents; } @@ -362,6 +373,11 @@ class event_impl { // stored here. sycl::detail::pi::PiExtSyncPoint MSyncPoint; + // If this event represents a submission to a + // sycl::detail::pi::PiExtCommandBuffer the command-buffer command + // (if any) associated with that submission is stored here. + sycl::detail::pi::PiExtCommandBufferCommand MCommandBufferCommand = nullptr; + friend std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index a318ed97d0abd..308fa742d2047 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -388,8 +388,24 @@ graph_impl::add(const std::shared_ptr &Impl, Handler.MCGType); auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep); + NodeImpl->MNDRangeUsed = Handler.MImpl->MNDRangeUsed; // Add an event associated with this explicit node for mixed usage addEventForNode(Impl, std::make_shared(), NodeImpl); + + // Retrieve any dynamic parameters which have been registered in the CGF and + // register the actual nodes with them. + auto &DynamicParams = Handler.MImpl->MDynamicParameters; + + if (NodeType != node_type::kernel && DynamicParams.size() > 0) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "dynamic_parameters cannot be registered with graph " + "nodes which do not represent kernel executions"); + } + + for (auto &[DynamicParam, ArgIndex] : DynamicParams) { + DynamicParam->registerNode(NodeImpl, ArgIndex); + } + return NodeImpl; } @@ -639,10 +655,13 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNodeDirect( findRealDeps(Deps, N.lock(), MPartitionNodes[Node]); } sycl::detail::pi::PiExtSyncPoint NewSyncPoint; + sycl::detail::pi::PiExtCommandBufferCommand NewCommand; pi_int32 Res = sycl::detail::enqueueImpCommandBufferKernel( Ctx, DeviceImpl, CommandBuffer, *static_cast((Node->MCommandGroup.get())), - Deps, &NewSyncPoint, nullptr); + Deps, &NewSyncPoint, &NewCommand, nullptr); + + MCommandMap[Node] = NewCommand; if (Res != pi_result::PI_SUCCESS) { throw sycl::exception(errc::invalid, @@ -671,12 +690,15 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode( sycl::detail::Scheduler::getInstance().addCG( Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps); + MCommandMap[Node] = Event->getCommandBufferCommand(); return Event->getSyncPoint(); } void exec_graph_impl::createCommandBuffers( sycl::device Device, std::shared_ptr &Partition) { sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; - sycl::detail::pi::PiExtCommandBufferDesc Desc{}; + sycl::detail::pi::PiExtCommandBufferDesc Desc{ + pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, + MIsUpdatable}; auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); @@ -732,6 +754,27 @@ void exec_graph_impl::createCommandBuffers( } } +exec_graph_impl::exec_graph_impl(sycl::context Context, + const std::shared_ptr &GraphImpl, + const property_list &PropList) + : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), + MRequirements(), MExecutionEvents(), + MIsUpdatable(PropList.has_property()) { + + // If the graph has been marked as updatable then check if the backend + // actually supports that. Devices supporting aspect::ext_oneapi_graph must + // have support for graph update. + if (MIsUpdatable) { + bool SupportsUpdate = MGraphImpl->getDevice().has(aspect::ext_oneapi_graph); + if (!SupportsUpdate) { + throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), + "Device does not support Command Graph update"); + } + } + // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. + duplicateNodes(); +} + exec_graph_impl::~exec_graph_impl() { const sycl::detail::PluginPtr &Plugin = sycl::detail::getSyclObjImpl(MContext)->getPlugin(); @@ -753,6 +796,15 @@ exec_graph_impl::~exec_graph_impl() { } } } + + for (auto &Iter : MCommandMap) { + if (auto Command = Iter.second; Command) { + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferReleaseCommand>(Command); + (void)Res; + assert(Res == pi_result::PI_SUCCESS); + } + } } sycl::event @@ -964,6 +1016,10 @@ void exec_graph_impl::duplicateNodes() { std::shared_ptr NodeCopy = std::make_shared(*OriginalNode); + // Associate the ID of the original node with the node copy for later quick + // access + MIDCache.insert(std::make_pair(OriginalNode->MID, NodeCopy)); + // Clear edges between nodes so that we can replace with new ones NodeCopy->MSuccessors.clear(); NodeCopy->MPredecessors.clear(); @@ -1007,6 +1063,10 @@ void exec_graph_impl::duplicateNodes() { for (size_t i = 0; i < SubgraphNodes.size(); i++) { auto SubgraphNode = SubgraphNodes[i]; auto NodeCopy = std::make_shared(*SubgraphNode); + // Associate the ID of the original subgraph node with all extracted node + // copies for future quick access. + MIDCache.insert(std::make_pair(SubgraphNode->MID, NodeCopy)); + NewSubgraphNodes.push_back(NodeCopy); SubgraphNodesMap.insert({SubgraphNode, NodeCopy}); NodeCopy->MSuccessors.clear(); @@ -1095,6 +1155,251 @@ void exec_graph_impl::duplicateNodes() { MNodeStorage.insert(MNodeStorage.begin(), NewNodes.begin(), NewNodes.end()); } +void exec_graph_impl::update(std::shared_ptr Node) { + this->update(std::vector>{Node}); +} +void exec_graph_impl::update( + const std::vector> Nodes) { + + if (!MIsUpdatable) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "update() cannot be called on a executable graph " + "which was not created with property::updatable"); + } + + // If there are any accessor requirements, we have to update through the + // scheduler to ensure that any allocations have taken place before trying to + // update. + bool NeedScheduledUpdate = false; + std::vector UpdateRequirements; + // At worst we may have as many requirements as there are for the entire graph + // for updating. + UpdateRequirements.reserve(MRequirements.size()); + for (auto &Node : Nodes) { + // Check if node(s) derived from this modifiable node exists in this graph + if (MIDCache.count(Node->getID()) == 0) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Node passed to update() is not part of the graph."); + } + if (Node->MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception(errc::invalid, "Cannot update non-kernel nodes"); + } + + if (Node->MCommandGroup->getRequirements().size() == 0) { + continue; + } + NeedScheduledUpdate = true; + + UpdateRequirements.insert(UpdateRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + } + + // Clean up any execution events which have finished so we don't pass them to + // the scheduler. + for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) { + if ((*It)->isCompleted()) { + It = MExecutionEvents.erase(It); + continue; + } + ++It; + } + + // If we have previous execution events do the update through the scheduler to + // ensure it is ordered correctly. + NeedScheduledUpdate |= MExecutionEvents.size() > 0; + + if (NeedScheduledUpdate) { + auto AllocaQueue = std::make_shared( + sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()), + sycl::detail::getSyclObjImpl(MGraphImpl->getContext()), + sycl::async_handler{}, sycl::property_list{}); + // Don't need to care about the return event here because it is synchronous + sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( + this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents); + } else { + for (auto &Node : Nodes) { + updateImpl(Node); + } + } + + // Rebuild cached requirements for this graph with updated nodes + MRequirements.clear(); + for (auto &Node : MNodeStorage) { + MRequirements.insert(MRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + } +} + +void exec_graph_impl::updateImpl(std::shared_ptr Node) { + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()); + + // Gather arg information from Node + auto &ExecCG = + *(static_cast(Node->MCommandGroup.get())); + // Copy args because we may modify them + std::vector NodeArgs = ExecCG.getArguments(); + // Copy NDR desc since we need to modify it + auto NDRDesc = ExecCG.MNDRDesc; + + pi_kernel PiKernel = nullptr; + auto Kernel = ExecCG.MSyclKernel; + auto KernelBundleImplPtr = ExecCG.MKernelBundle; + std::shared_ptr SyclKernelImpl = nullptr; + const sycl::detail::KernelArgMask *EliminatedArgMask = nullptr; + + // Use kernel_bundle if available unless it is interop. + // Interop bundles can't be used in the first branch, because the kernels + // in interop kernel bundles (if any) do not have kernel_id + // and can therefore not be looked up, but since they are self-contained + // they can simply be launched directly. + if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + auto KernelName = ExecCG.MKernelName; + kernel_id KernelID = + sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + SyclKernelImpl = sycl::detail::getSyclObjImpl(SyclKernel); + PiKernel = SyclKernelImpl->getHandleRef(); + EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); + } else if (Kernel != nullptr) { + PiKernel = Kernel->getHandleRef(); + auto SyclProg = Kernel->getProgramImpl(); + EliminatedArgMask = Kernel->getKernelArgMask(); + } else { + std::tie(PiKernel, std::ignore, EliminatedArgMask, std::ignore) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, ExecCG.MKernelName); + } + + // Remove eliminated args + std::vector MaskedArgs; + MaskedArgs.reserve(NodeArgs.size()); + + sycl::detail::applyFuncOnFilteredArgs( + EliminatedArgMask, NodeArgs, + [&MaskedArgs](sycl::detail::ArgDesc &Arg, int NextTrueIndex) { + MaskedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex); + }); + + // Reverse kernel dims + sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc); + + size_t RequiredWGSize[3] = {0, 0, 0}; + size_t *LocalSize = nullptr; + + if (NDRDesc.LocalSize[0] != 0) + LocalSize = &NDRDesc.LocalSize[0]; + else { + Plugin->call( + PiKernel, DeviceImpl->getHandleRef(), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize), + RequiredWGSize, + /* param_value_size_ret = */ nullptr); + + const bool EnforcedLocalSize = + (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 || + RequiredWGSize[2] != 0); + if (EnforcedLocalSize) + LocalSize = RequiredWGSize; + } + // Create update descriptor + + // Storage for individual arg descriptors + std::vector MemobjDescs; + std::vector PtrDescs; + std::vector ValueDescs; + MemobjDescs.reserve(MaskedArgs.size()); + PtrDescs.reserve(MaskedArgs.size()); + ValueDescs.reserve(MaskedArgs.size()); + + pi_ext_command_buffer_update_kernel_launch_desc UpdateDesc; + + // Collect arg descriptors and fill kernel launch descriptor + using sycl::detail::kernel_param_kind_t; + for (size_t i = 0; i < MaskedArgs.size(); i++) { + auto &NodeArg = MaskedArgs[i]; + switch (NodeArg.MType) { + case kernel_param_kind_t::kind_pointer: { + PtrDescs.push_back({static_cast(NodeArg.MIndex), NodeArg.MPtr}); + } break; + case kernel_param_kind_t::kind_std_layout: { + ValueDescs.push_back({static_cast(NodeArg.MIndex), + static_cast(NodeArg.MSize), + NodeArg.MPtr}); + } break; + case kernel_param_kind_t::kind_accessor: { + sycl::detail::Requirement *Req = + static_cast(NodeArg.MPtr); + + pi_mem_obj_property MemObjData{}; + + switch (Req->MAccessMode) { + case access::mode::read: { + MemObjData.mem_access = PI_ACCESS_READ_ONLY; + break; + } + case access::mode::write: + case access::mode::discard_write: { + MemObjData.mem_access = PI_ACCESS_WRITE_ONLY; + break; + } + default: { + MemObjData.mem_access = PI_ACCESS_READ_WRITE; + break; + } + } + MemObjData.type = PI_KERNEL_ARG_MEM_OBJ_ACCESS; + MemobjDescs.push_back(pi_ext_command_buffer_update_memobj_arg_desc_t{ + static_cast(NodeArg.MIndex), &MemObjData, + static_cast(Req->MData)}); + + } break; + + default: + break; + } + } + + UpdateDesc.num_mem_obj_args = MemobjDescs.size(); + UpdateDesc.mem_obj_arg_list = MemobjDescs.data(); + UpdateDesc.num_ptr_args = PtrDescs.size(); + UpdateDesc.ptr_arg_list = PtrDescs.data(); + UpdateDesc.num_value_args = ValueDescs.size(); + UpdateDesc.value_arg_list = ValueDescs.data(); + + UpdateDesc.global_work_offset = &NDRDesc.GlobalOffset[0]; + UpdateDesc.global_work_size = &NDRDesc.GlobalSize[0]; + UpdateDesc.local_work_size = LocalSize; + UpdateDesc.num_work_dim = NDRDesc.Dims; + + // Query the ID cache to find the equivalent exec node for the node passed to + // this function. + // TODO: Handle subgraphs or any other cases where multiple nodes may be + // associated with a single key, once those node types are supported for + // update. + auto ExecNode = MIDCache.find(Node->MID); + assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache"); + + // Update ExecNode with new values from Node, in case we ever need to + // rebuild the command buffers + ExecNode->second->updateFromOtherNode(Node); + + sycl::detail::pi::PiExtCommandBufferCommand Command = + MCommandMap[ExecNode->second]; + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferUpdateKernelLaunch>( + Command, &UpdateDesc); + + if (Res != PI_SUCCESS) { + throw sycl::exception(errc::invalid, "Error updating command_graph"); + } +} + modifiable_command_graph::modifiable_command_graph( const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) @@ -1156,12 +1461,12 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) { } command_graph -modifiable_command_graph::finalize(const sycl::property_list &) const { +modifiable_command_graph::finalize(const sycl::property_list &PropList) const { // Graph is read and written in this scope so we lock // this graph with full priviledges. graph_impl::WriteLock Lock(impl->MMutex); - return command_graph{this->impl, - this->impl->getContext()}; + return command_graph{ + this->impl, this->impl->getContext(), PropList}; } bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { @@ -1275,8 +1580,9 @@ std::vector modifiable_command_graph::get_root_nodes() const { } executable_command_graph::executable_command_graph( - const std::shared_ptr &Graph, const sycl::context &Ctx) - : impl(std::make_shared(Ctx, Graph)) { + const std::shared_ptr &Graph, const sycl::context &Ctx, + const property_list &PropList) + : impl(std::make_shared(Ctx, Graph, PropList)) { finalizeImpl(); // Create backend representation for executable graph } @@ -1297,6 +1603,36 @@ void executable_command_graph::update( throw sycl::exception(sycl::make_error_code(errc::invalid), "Method not yet implemented"); } + +void executable_command_graph::update(const node &Node) { + impl->update(sycl::detail::getSyclObjImpl(Node)); +} + +void executable_command_graph::update(const std::vector &Nodes) { + std::vector> NodeImpls{}; + NodeImpls.reserve(Nodes.size()); + for (auto &Node : Nodes) { + NodeImpls.push_back(sycl::detail::getSyclObjImpl(Node)); + } + + impl->update(NodeImpls); +} + +dynamic_parameter_base::dynamic_parameter_base( + command_graph Graph, size_t ParamSize, + const void *Data) + : impl(std::make_shared( + sycl::detail::getSyclObjImpl(Graph), ParamSize, Data)) {} + +void dynamic_parameter_base::updateValue(const void *NewValue, size_t Size) { + impl->updateValue(NewValue, Size); +} + +void dynamic_parameter_base::updateAccessor( + const sycl::detail::AccessorBaseHost *Acc) { + impl->updateAccessor(Acc); +} + } // namespace detail node_type node::get_type() const { return impl->MNodeType; } @@ -1317,6 +1653,24 @@ node node::get_node_from_event(event nodeEvent) { GraphImpl->getNodeForEvent(EventImpl)); } +template <> void node::update_nd_range<1>(nd_range<1> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_nd_range<2>(nd_range<2> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_nd_range<3>(nd_range<3> NDRange) { + impl->updateNDRange(NDRange); +} +template <> void node::update_range<1>(range<1> Range) { + impl->updateRange(Range); +} +template <> void node::update_range<2>(range<2> Range) { + impl->updateRange(Range); +} +template <> void node::update_range<3>(range<3> Range) { + impl->updateRange(Range); +} } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 30cc78c70ab4d..ae6fedbfd12a0 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -75,6 +76,10 @@ inline node_type getNodeTypeFromCG(sycl::detail::CG::CGTYPE CGType) { /// Implementation of node class from SYCL_EXT_ONEAPI_GRAPH. class node_impl { public: + using id_type = uint64_t; + + /// Unique identifier for this node. + id_type MID = getNextNodeID(); /// List of successors to this node. std::vector> MSuccessors; /// List of predecessors to this node. @@ -99,6 +104,9 @@ class node_impl { /// cannot be used to find out the partion of a node outside of this process. int MPartitionNum = -1; + /// Track whether an ND-Range was used for kernel nodes + bool MNDRangeUsed = false; + /// Add successor to the node. /// @param Node Node to add as a successor. /// @param Prev Predecessor to \p node being added as successor. @@ -215,8 +223,12 @@ class node_impl { /// @return A unique ptr to the new command group object. std::unique_ptr getCGCopy() const { switch (MCGType) { - case sycl::detail::CG::Kernel: - return createCGCopy(); + case sycl::detail::CG::Kernel: { + auto CGCopy = createCGCopy(); + rebuildArgStorage(CGCopy->MArgs, MCommandGroup->getArgsStorage(), + CGCopy->getArgsStorage()); + return std::move(CGCopy); + } case sycl::detail::CG::CopyAccToPtr: case sycl::detail::CG::CopyPtrToAcc: case sycl::detail::CG::CopyAccToAcc: @@ -253,6 +265,11 @@ class node_impl { CommandGroupPtr->getSharedPtrStorage(), CommandGroupPtr->getRequirements(), CommandGroupPtr->getEvents()); + std::vector NewArgs = CommandGroupPtr->MArgs; + + rebuildArgStorage(NewArgs, CommandGroupPtr->getArgsStorage(), + Data.MArgsStorage); + sycl::detail::code_location Loc(CommandGroupPtr->MFileName.data(), CommandGroupPtr->MFunctionName.data(), CommandGroupPtr->MLine, @@ -261,7 +278,7 @@ class node_impl { return std::make_unique( sycl::detail::CGHostTask( std::move(HostTaskUPtr), CommandGroupPtr->MQueue, - CommandGroupPtr->MContext, CommandGroupPtr->MArgs, Data, + CommandGroupPtr->MContext, std::move(NewArgs), std::move(Data), CommandGroupPtr->getType(), Loc)); } case sycl::detail::CG::Barrier: @@ -366,11 +383,188 @@ class node_impl { } } + /// Update the value of an accessor inside this node. Accessors must be + /// handled specifically compared to other argument values. + /// @param ArgIndex The index of the accessor arg to be updated + /// @param Acc Pointer to the new accessor value + void updateAccessor(int ArgIndex, const sycl::detail::AccessorBaseHost *Acc) { + auto &Args = + static_cast(MCommandGroup.get())->MArgs; + auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); + + // Find old accessor in accessor storage and replace with new one + if (static_cast(NewAccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } + + // All accessors passed to this function will be placeholders, so we must + // perform steps similar to what happens when handler::require() is + // called here. + sycl::detail::Requirement *NewReq = NewAccImpl.get(); + if (NewReq->MAccessMode != sycl::access_mode::read) { + auto SYCLMemObj = + static_cast(NewReq->MSYCLMemObj); + SYCLMemObj->handleWriteAccessorCreation(); + } + + for (auto &Acc : MCommandGroup->getAccStorage()) { + if (auto OldAcc = + static_cast(Arg.MPtr); + Acc.get() == OldAcc) { + Acc = NewAccImpl; + } + } + + for (auto &Req : MCommandGroup->getRequirements()) { + if (auto OldReq = + static_cast(Arg.MPtr); + Req == OldReq) { + Req = NewReq; + } + } + Arg.MPtr = NewAccImpl.get(); + break; + } + } + + void updateArgValue(int ArgIndex, const void *NewValue, size_t Size) { + + auto &Args = + static_cast(MCommandGroup.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MSize == static_cast(Size)); + // MPtr may be a pointer into arg storage so we memcpy the contents of + // NewValue rather than assign it directly + std::memcpy(Arg.MPtr, NewValue, Size); + break; + } + } + + template + void updateNDRange(nd_range ExecutionRange) { + if (MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception( + sycl::errc::invalid, + "Cannot update execution range of nodes which are not kernel nodes"); + } + if (!MNDRangeUsed) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update node which was created with a " + "sycl::range with a sycl::nd_range"); + } + + auto &NDRDesc = + static_cast(MCommandGroup.get()) + ->MNDRDesc; + + if (NDRDesc.Dims != Dimensions) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update execution range of a node with an " + "execution range of different dimensions than what " + "the node was originall created with."); + } + + NDRDesc.set(ExecutionRange); + } + + template void updateRange(range ExecutionRange) { + if (MCGType != sycl::detail::CG::Kernel) { + throw sycl::exception( + sycl::errc::invalid, + "Cannot update execution range of nodes which are not kernel nodes"); + } + if (MNDRangeUsed) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update node which was created with a " + "sycl::nd_range with a sycl::range"); + } + + auto &NDRDesc = + static_cast(MCommandGroup.get()) + ->MNDRDesc; + + if (NDRDesc.Dims != Dimensions) { + throw sycl::exception(sycl::errc::invalid, + "Cannot update execution range of a node with an " + "execution range of different dimensions than what " + "the node was originall created with."); + } + + NDRDesc.set(ExecutionRange); + } + + void updateFromOtherNode(const std::shared_ptr &Other) { + auto ExecCG = + static_cast(MCommandGroup.get()); + auto OtherExecCG = + static_cast(Other->MCommandGroup.get()); + + ExecCG->MArgs = OtherExecCG->MArgs; + ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; + ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); + ExecCG->getRequirements() = OtherExecCG->getRequirements(); + + auto &OldArgStorage = OtherExecCG->getArgsStorage(); + auto &NewArgStorage = ExecCG->getArgsStorage(); + // Rebuild the arg storage and update the args + rebuildArgStorage(ExecCG->MArgs, OldArgStorage, NewArgStorage); + } + + id_type getID() const { return MID; } + private: + void rebuildArgStorage(std::vector &Args, + const std::vector> &OldArgStorage, + std::vector> &NewArgStorage) const { + // Clear the arg storage so we can rebuild it + NewArgStorage.clear(); + + // Loop over all the args, any std_layout ones need their pointers updated + // to point to the new arg storage. + for (auto &Arg : Args) { + if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_std_layout) { + continue; + } + // Find which ArgStorage Arg.MPtr is pointing to + for (auto &ArgStorage : OldArgStorage) { + if (ArgStorage.data() != Arg.MPtr) { + continue; + } + NewArgStorage.emplace_back(Arg.MSize); + // Memcpy contents from old storage to new storage + std::memcpy(NewArgStorage.back().data(), ArgStorage.data(), Arg.MSize); + // Update MPtr to point to the new storage instead of the old + Arg.MPtr = NewArgStorage.back().data(); + + break; + } + } + } + // Gets the next unique identifier for a node, should only be used when + // constructing nodes. + static id_type getNextNodeID() { + static id_type nextID = 0; + + // Return the value then increment the next ID + return nextID++; + } + /// Prints Node information to Stream. /// @param Stream Where to print the Node information - /// @param Verbose If true, print additional information about the nodes such - /// as kernel args or memory access where applicable. + /// @param Verbose If true, print additional information about the nodes + /// such as kernel args or memory access where applicable. void printDotCG(std::ostream &Stream, bool Verbose) { Stream << "\"" << this << "\" [style=bold, label=\""; @@ -621,7 +815,8 @@ class graph_impl { MAllowBuffers = true; } - if (!SyclDevice.has(aspect::ext_oneapi_graph)) { + if (!SyclDevice.has(aspect::ext_oneapi_limited_graph) && + !SyclDevice.has(aspect::ext_oneapi_graph)) { std::stringstream Stream; Stream << SyclDevice.get_backend(); std::string BackendString = Stream.str(); @@ -1054,13 +1249,10 @@ class exec_graph_impl { /// nodes). /// @param Context Context to create graph with. /// @param GraphImpl Modifiable graph implementation to create with. + /// @param PropList List of properties for constructing this object exec_graph_impl(sycl::context Context, - const std::shared_ptr &GraphImpl) - : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), - MRequirements(), MExecutionEvents() { - // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. - duplicateNodes(); - } + const std::shared_ptr &GraphImpl, + const property_list &PropList); /// Destructor. /// @@ -1128,6 +1320,11 @@ class exec_graph_impl { return MRequirements; } + void update(std::shared_ptr Node); + void update(const std::vector> Nodes); + + void updateImpl(std::shared_ptr NodeImpl); + private: /// Create a command-group for the node and add it to command-buffer by going /// through the scheduler. @@ -1225,6 +1422,72 @@ class exec_graph_impl { std::vector> MPartitions; /// Storage for copies of nodes from the original modifiable graph. std::vector> MNodeStorage; + /// Map of nodes to their associated PI command handles. + std::unordered_map, + sycl::detail::pi::PiExtCommandBufferCommand> + MCommandMap; + /// True if this graph can be updated (set with property::updatable) + bool MIsUpdatable; + + // Stores a cache of node ids from modifiable graph nodes to the companion + // node(s) in this graph. Used for quick access when updating this graph. + std::multimap> MIDCache; +}; + +class dynamic_parameter_impl { +public: + dynamic_parameter_impl(std::shared_ptr GraphImpl, + size_t ParamSize, const void *Data) + : MGraph(GraphImpl), MValueStorage(ParamSize) { + std::memcpy(MValueStorage.data(), Data, ParamSize); + } + + /// Register a node with this dynamic parameter + /// @param NodeImpl The node to be registered + /// @param ArgIndex The arg index for the kernel arg associated with this + /// dynamic_parameter in NodeImpl + void registerNode(std::shared_ptr NodeImpl, int ArgIndex) { + MNodes.emplace_back(NodeImpl, ArgIndex); + } + + /// Get a pointer to the internal value of this dynamic parameter + void *getValue() { return MValueStorage.data(); } + + /// Update the internal value of this dynamic parameter as well as the value + /// of this parameter in all registered nodes. + /// @param NewValue Pointer to the new value + /// @param Size Size of the data pointer to by NewValue + void updateValue(const void *NewValue, size_t Size) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + NodeShared->updateArgValue(ArgIndex, NewValue, Size); + } + } + std::memcpy(MValueStorage.data(), NewValue, Size); + } + + /// Update the internal value of this dynamic parameter as well as the value + /// of this parameter in all registered nodes. Should only be called for + /// accessor dynamic_parameters. + /// @param Acc The new accessor value + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + // Should we fail here if the node isn't alive anymore? + if (NodeShared) { + NodeShared->updateAccessor(ArgIndex, Acc); + } + } + std::memcpy(MValueStorage.data(), Acc, + sizeof(sycl::detail::AccessorBaseHost)); + } + + // Weak ptrs to node_impls which will be updated + std::vector, int>> MNodes; + + std::shared_ptr MGraph; + std::vector MValueStorage; }; } // namespace detail diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index c96d60bd85ecd..e268175781989 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -15,6 +15,9 @@ namespace sycl { inline namespace _V1 { +namespace ext::oneapi::experimental::detail { +class dynamic_parameter_impl; +} namespace detail { using KernelBundleImplPtr = std::shared_ptr; @@ -128,6 +131,16 @@ class handler_impl { // created for later query by users. sycl::ext::oneapi::experimental::node_type MUserFacingNodeType = sycl::ext::oneapi::experimental::node_type::empty; + + // Storage for any SYCL Graph dynamic parameters which have been flagged for + // registration in the CG, along with the argument index for the parameter. + std::vector> + MDynamicParameters; + + // Track whether an NDRange was used when submitting a kernel (as opposed to a + // range), needed for graph update + bool MNDRangeUsed = false; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index efc553cdb97e2..7781fb7e1cd1e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2458,6 +2458,7 @@ pi_int32 enqueueImpCommandBufferKernel( const CGExecKernel &CommandGroup, std::vector &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function &getMemAllocationFunc) { auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); @@ -2539,7 +2540,8 @@ pi_int32 enqueueImpCommandBufferKernel( sycl::detail::PiApiKind::piextCommandBufferNDRangeKernel>( CommandBuffer, PiKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(), - SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint); + SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint, + OutCommand); if (!SyclKernelImpl && !Kernel) { Plugin->call(PiKernel); @@ -2749,6 +2751,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { ? nullptr : &MEvent->getHandleRef(); sycl::detail::pi::PiExtSyncPoint OutSyncPoint; + sycl::detail::pi::PiExtCommandBufferCommand OutCommand = nullptr; switch (MCommandGroup->getType()) { case CG::CGTYPE::Kernel: { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); @@ -2770,8 +2773,10 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { } auto result = enqueueImpCommandBufferKernel( MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer, - *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc); + *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand, + getMemAllocationFunc); MEvent->setSyncPoint(OutSyncPoint); + MEvent->setCommandBufferCommand(OutCommand); return result; } case CG::CGTYPE::CopyUSM: { @@ -3424,6 +3429,67 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { } } +UpdateCommandBufferCommand::UpdateCommandBufferCommand( + QueueImplPtr Queue, + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes) + : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph), + MNodes(Nodes) {} + +pi_int32 UpdateCommandBufferCommand::enqueueImp() { + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; + auto RawEvents = getPiEvents(EventImpls); + flushCrossQueueDeps(EventImpls, getWorkerQueue()); + + for (auto &Node : MNodes) { + auto CG = static_cast(Node->MCommandGroup.get()); + for (auto &Arg : CG->MArgs) { + if (Arg.MType != kernel_param_kind_t::kind_accessor) { + continue; + } + // Search through deps to get actual allocation for accessor args. + for (const DepDesc &Dep : MDeps) { + Requirement *Req = static_cast(Arg.MPtr); + if (Dep.MDepRequirement == Req) { + if (Dep.MAllocaCmd) { + Req->MData = Dep.MAllocaCmd->getMemAllocation(); + } else { + throw sycl::exception(make_error_code(errc::invalid), + "No allocation available for accessor when " + "updating command buffer!"); + } + } + } + } + MGraph->updateImpl(Node); + } + + return PI_SUCCESS; +} + +void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const { + Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; + + Stream << "ID = " << this << "\\n"; + Stream << "CommandBuffer Command Update" + << "\\n"; + + Stream << "\"];" << std::endl; + + for (const auto &Dep : MDeps) { + Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\"" + << " [ label = \"Access mode: " + << accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n" + << "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]" + << std::endl; + } +} + +void UpdateCommandBufferCommand::emitInstrumentationData() {} +bool UpdateCommandBufferCommand::producesPiEvent() const { return false; } + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8dc12120bdd9a..8ba0cceee9e6a 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -25,6 +25,11 @@ namespace sycl { inline namespace _V1 { + +namespace ext::oneapi::experimental::detail { +class exec_graph_impl; +class node_impl; +} // namespace ext::oneapi::experimental::detail namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -114,6 +119,7 @@ class Command { HOST_TASK, FUSION, EXEC_CMD_BUFFER, + UPDATE_CMD_BUFFER }; Command(CommandType Type, QueueImplPtr Queue, @@ -755,6 +761,26 @@ class KernelFusionCommand : public Command { FusionStatus MStatus; }; +class UpdateCommandBufferCommand : public Command { +public: + explicit UpdateCommandBufferCommand( + QueueImplPtr Queue, + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes); + + void printDot(std::ostream &Stream) const final; + void emitInstrumentationData() final; + bool producesPiEvent() const final; + +private: + pi_int32 enqueueImp() final; + + ext::oneapi::experimental::detail::exec_graph_impl *MGraph; + std::vector> + MNodes; +}; + // Enqueues a given kernel to a PiExtCommandBuffer pi_int32 enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, @@ -762,6 +788,7 @@ pi_int32 enqueueImpCommandBufferKernel( const CGExecKernel &CommandGroup, std::vector &SyncPoints, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint, + sycl::detail::pi::PiExtCommandBufferCommand *OutCommand, const std::function &getMemAllocationFunc); // Sets arguments for a given kernel and device based on the argument type. diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 7b50192cf3b43..f0c5dc670aa05 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -14,6 +14,7 @@ #if SYCL_EXT_CODEPLAY_KERNEL_FUSION #include #endif +#include #include #include #include @@ -1679,6 +1680,96 @@ bool Scheduler::GraphBuilder::isInFusionMode(QueueIdT Id) { return FusionList->second->isActive(); } +Command *Scheduler::GraphBuilder::addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events, + std::vector &ToEnqueue) { + auto NewCmd = + std::make_unique(Queue, Graph, Nodes); + // If there are multiple requirements for the same memory object, its + // AllocaCommand creation will be dependent on the access mode of the first + // requirement. Combine these access modes to take all of them into account. + combineAccessModesOfReqs(Requirements); + std::vector ToCleanUp; + for (Requirement *Req : Requirements) { + MemObjRecord *Record = nullptr; + AllocaCommandBase *AllocaCmd = nullptr; + + bool isSameCtx = false; + + { + + Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue); + markModifiedIfWrite(Record, Req); + + AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); + + isSameCtx = sameCtx(Queue->getContextImplPtr(), Record->MCurContext); + } + + if (!isSameCtx) { + // Cannot directly copy memory from OpenCL device to OpenCL device - + // create two copies: device->host and host->device. + bool NeedMemMoveToHost = false; + auto MemMoveTargetQueue = Queue; + + if (!Queue->is_host() && !Record->MCurContext->is_host()) + NeedMemMoveToHost = true; + + if (NeedMemMoveToHost) + insertMemoryMove(Record, Req, + Scheduler::getInstance().getDefaultHostQueue(), + ToEnqueue); + insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); + } + std::set Deps = + findDepsForReq(Record, Req, Queue->getContextImplPtr()); + + for (Command *Dep : Deps) { + if (Dep != NewCmd.get()) { + Command *ConnCmd = + NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp); + if (ConnCmd) + ToEnqueue.push_back(ConnCmd); + } + } + } + + // Set new command as user for dependencies and update leaves. + // Node dependencies can be modified further when adding the node to leaves, + // iterate over their copy. + // FIXME employ a reference here to eliminate copying of a vector + std::vector Deps = NewCmd->MDeps; + for (DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp); + addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue); + } + + // Register all the events as dependencies + for (detail::EventImplPtr e : Events) { + if (e->getCommand() && + e->getCommand() == static_cast(NewCmd.get())) { + continue; + } + if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp)) + ToEnqueue.push_back(ConnCmd); + } + + if (MPrintOptionsArray[AfterAddCG]) + printGraphAsDot("after_addCG"); + + for (Command *Cmd : ToCleanUp) { + cleanupCommand(Cmd); + } + + return NewCmd.release(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index a83298a628539..8bfaf07c31c22 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -8,6 +8,7 @@ #include "detail/sycl_mem_obj_i.hpp" #include +#include #include #include #include @@ -670,6 +671,51 @@ KernelFusionCommand *Scheduler::isPartOfActiveFusion(Command *Cmd) { } } +EventImplPtr Scheduler::addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events) { + std::vector AuxiliaryCmds; + EventImplPtr NewCmdEvent = nullptr; + + { + WriteLockT Lock = acquireWriteLock(); + + Command *NewCmd = MGraphBuilder.addCommandGraphUpdate( + Graph, Nodes, Queue, Requirements, Events, AuxiliaryCmds); + if (!NewCmd) + return nullptr; + NewCmdEvent = NewCmd->getEvent(); + } + + std::vector ToCleanUp; + { + ReadLockT Lock = acquireReadLock(); + EnqueueResultT Res; + bool Enqueued; + + for (Command *Cmd : AuxiliaryCmds) { + Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } + + if (Command *NewCmd = static_cast(NewCmdEvent->getCommand())) { + Enqueued = + GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", + PI_ERROR_INVALID_OPERATION); + } + } + + cleanupCommands(ToCleanUp); + return NewCmdEvent; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 53ce295626045..ecad8ede1e9ff 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -173,6 +173,10 @@ class MockScheduler; namespace sycl { inline namespace _V1 { +namespace ext::oneapi::experimental::detail { +class exec_graph_impl; +class node_impl; +} // namespace ext::oneapi::experimental::detail namespace detail { class queue_impl; class event_impl; @@ -468,6 +472,20 @@ class Scheduler { std::vector &AuxilaryCmds, BlockingT Blocking = NON_BLOCKING); + /// Adds a command buffer update operation to the execution graph. This is + /// required when buffers/accessors are updated to ensure that the memory has + /// been allocated when updating. + /// \param Graph The executable graph to be updated. + /// \param Nodes The list of Nodes which are to be updated in the graph. + /// \param Requirements List of accessor requirements for this update. + /// \param Events List of events that this update operation depends on + EventImplPtr addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events); + protected: using RWLockT = std::shared_timed_mutex; using ReadLockT = std::shared_lock; @@ -668,6 +686,23 @@ class Scheduler { bool isInFusionMode(QueueIdT queue); + /// Adds a command buffer update operation to the execution graph. This is + /// required when buffers/accessors are updated to ensure that the memory + /// has been allocated when updating. + /// \param Graph The executable graph to be updated. + /// \param Nodes The list of Nodes which are to be updated in the graph. + /// \param Requirements List of accessor requirements for this update. + /// \param Events List of events that this operation depends on. + /// \param ToEnqueue List of commands which need to be enqueued. + Command *addCommandGraphUpdate( + ext::oneapi::experimental::detail::exec_graph_impl *Graph, + std::vector< + std::shared_ptr> + Nodes, + const QueueImplPtr &Queue, std::vector Requirements, + std::vector &Events, + std::vector &ToEnqueue); + std::vector MMemObjs; private: diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5b40f1b3b07c3..5fb65fc925f65 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -69,6 +69,12 @@ getPiImageCopyFlags(sycl::usm::alloc SrcPtrType, sycl::usm::alloc DstPtrType) { "Unknown copy destination location"); } +void *getValueFromDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base + &DynamicParamBase) { + return sycl::detail::getSyclObjImpl(DynamicParamBase)->getValue(); +} + } // namespace detail handler::handler(std::shared_ptr Queue, bool IsHost) @@ -158,6 +164,22 @@ event handler::finalize() { throw sycl::exception(make_error_code(errc::kernel_argument), "placeholder accessor must be bound by calling " "handler::require() before it can be used."); + + // Check associated accessors + bool AccFound = false; + for (detail::ArgDesc &Acc : MAssociatedAccesors) { + if (Acc.MType == detail::kernel_param_kind_t::kind_accessor && + static_cast(Acc.MPtr) == AccImpl) { + AccFound = true; + break; + } + } + + if (!AccFound) { + throw sycl::exception(make_error_code(errc::kernel_argument), + "placeholder accessor must be bound by calling " + "handler::require() before it can be used."); + } } } } @@ -586,6 +608,8 @@ event handler::finalize() { // Associate an event with this new node and return the event. GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); + NodeImpl->MNDRangeUsed = MImpl->MNDRangeUsed; + return detail::createSyclObjFromImpl(EventImpl); } @@ -1562,5 +1586,30 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { return {std::array{0, 0, 0}, false}; } +void handler::setNDRangeUsed(bool Value) { MImpl->MNDRangeUsed = Value; } + +void handler::registerDynamicParameter( + ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase, + int ArgIndex) { + if (MQueue && MQueue->getCommandGraph()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Dynamic Parameters cannot be used with Graph Queue recording."); + } + if (!MGraph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Dynamic Parameters cannot be used with normal SYCL submissions"); + } + + auto ParamImpl = detail::getSyclObjImpl(DynamicParamBase); + if (ParamImpl->MGraph != this->MGraph) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot use a Dynamic Parameter with a node associated with a graph " + "other than the one it was created with."); + } + MImpl->MDynamicParameters.emplace_back(ParamImpl.get(), ArgIndex); +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/Error/lit.local.cfg b/sycl/test-e2e/Graph/Error/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Error/lit.local.cfg +++ b/sycl/test-e2e/Graph/Error/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Explicit/lit.local.cfg b/sycl/test-e2e/Graph/Explicit/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Explicit/lit.local.cfg +++ b/sycl/test-e2e/Graph/Explicit/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Profiling/lit.local.cfg b/sycl/test-e2e/Graph/Profiling/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Profiling/lit.local.cfg +++ b/sycl/test-e2e/Graph/Profiling/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp index 38a71ca2506f8..281bfb3040ad9 100644 --- a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp @@ -30,7 +30,7 @@ int main() { return 0; } - if (!Dev0.has(aspect::ext_oneapi_graph)) { + if (!Dev0.has(aspect::ext_oneapi_limited_graph)) { std::cout << "Test skipped: device doesn't support graphs" << std::endl; return 0; } diff --git a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg +++ b/sycl/test-e2e/Graph/RecordReplay/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/Threading/lit.local.cfg b/sycl/test-e2e/Graph/Threading/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/Threading/lit.local.cfg +++ b/sycl/test-e2e/Graph/Threading/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp index 11e98262bf390..b3d40f4d0f89e 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp @@ -14,13 +14,19 @@ int main() { auto Device = Queue.get_device(); bool SupportsGraphs = Device.has(aspect::ext_oneapi_graph); + bool SupportsLimitedGraphs = Device.has(aspect::ext_oneapi_limited_graph); auto Backend = Device.get_backend(); - if ((Backend == backend::ext_oneapi_level_zero) || - (Backend == backend::ext_oneapi_cuda) || - (Backend == backend::ext_oneapi_hip)) { + if ((Backend == backend::ext_oneapi_level_zero)) { + assert(!SupportsGraphs); + assert(SupportsLimitedGraphs); + + } else if ((Backend == backend::ext_oneapi_cuda) || + (Backend == backend::ext_oneapi_hip)) { assert(SupportsGraphs); + assert(SupportsLimitedGraphs); } else { assert(!SupportsGraphs); + assert(!SupportsLimitedGraphs); } } diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp index b8f62ade21b84..78c65856a0b66 100644 --- a/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp +++ b/sycl/test-e2e/Graph/UnsupportedDevice/exception_unsupported_backend.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Tests the ability to finalize a empty command graph // The test checks that invalid exception is thrown // when trying to create a graph with an unsupported backend. @@ -10,14 +9,16 @@ int GetUnsupportedBackend(const sycl::device &Dev) { // Return 1 if the device backend is unsupported or 0 else. // 0 does not prevent another device to be picked as a second choice - return !Dev.has(aspect::ext_oneapi_graph); + return !Dev.has(aspect::ext_oneapi_graph) && + !Dev.has(aspect::ext_oneapi_limited_graph); } int main() { sycl::device Dev{GetUnsupportedBackend}; queue Queue{Dev}; - if (Dev.has(aspect::ext_oneapi_graph)) + if (Dev.has(aspect::ext_oneapi_graph) || + Dev.has(aspect::ext_oneapi_limited_graph)) return 0; std::error_code ExceptionCode = make_error_code(sycl::errc::success); diff --git a/sycl/test-e2e/Graph/Update/lit.local.cfg b/sycl/test-e2e/Graph/Update/lit.local.cfg new file mode 100644 index 0000000000000..9c0c4cc846295 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/lit.local.cfg @@ -0,0 +1 @@ +config.required_features += ['aspect-ext_oneapi_graph'] diff --git a/sycl/test-e2e/Graph/Update/update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/update_before_finalize.cpp new file mode 100644 index 0000000000000..7c0cc07c5a230 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_before_finalize.cpp @@ -0,0 +1,58 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node before finalization + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + // Swap PtrB to be the input + InputParam.update(PtrB); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Only PtrB should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == 0); + assert(HostDataB[i] == i); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_nd_range.cpp b/sycl/test-e2e/Graph/Update/update_nd_range.cpp new file mode 100644 index 0000000000000..72df8d646dd7d --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_nd_range.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + nd_range<1> NDRange{range{1024}, range{32}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + KernelNode.update_nd_range(nd_range<1>{range{512}, range{32}}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_range.cpp b/sycl/test-e2e/Graph/Update/update_range.cpp new file mode 100644 index 0000000000000..f1ef78857651b --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_range.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + range<1> Range{1024}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(Range, [=](item<1> Item) { + size_t GlobalID = Item.get_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + KernelNode.update_range(range<1>{512}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp new file mode 100644 index 0000000000000..9f203cef7c533 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor.cpp @@ -0,0 +1,71 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node accessor argument using index-based explicit +// update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto Acc = BufA.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, Acc); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + Acc[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap BufB to be the input + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp new file mode 100644 index 0000000000000..ba2e7b0869ca7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_double_update.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node accessor argument multiple times before the graph +// is updated, using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + std::vector HostDataC(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + buffer BufC{HostDataC}; + BufA.set_write_back(false); + BufB.set_write_back(false); + BufC.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto Acc = BufA.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, Acc); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + Acc[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + Queue.copy(BufC.get_access(), HostDataC.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + assert(HostDataC[i] == 0); + } + // Update to BufC first + InputParam.update(BufC.get_access()); + + // Swap BufB to be the input instead + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + Queue.copy(BufC.get_access(), HostDataC.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + assert(HostDataC[i] == 0); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp new file mode 100644 index 0000000000000..dd6ccf39fd5f5 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_multiple_nodes_different_indices.cpp @@ -0,0 +1,91 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes where it has a different argument index in each node + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, AccA); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.require(AccB); + cgh.require(InputParam); + // Arg index is 4 here + cgh.set_arg(4, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccB[i] = 0; + AccA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.require(InputParam); + // Arg index is 0 here + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // AccA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(AccA, HostDataA.data()).wait(); + Queue.copy(AccB, HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap AccB to be the input + InputParam.update(AccB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(AccA, HostDataA.data()).wait(); + Queue.copy(AccB, HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp new file mode 100644 index 0000000000000..54ca947987b82 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_accessor_ordering.cpp @@ -0,0 +1,97 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node accessor argument using index-based explicit +// update while also submitting work using those accessors to a normal queue + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + // Initial accessor for use in kernel and dynamic parameter + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + exp_ext::dynamic_parameter InputParam(Graph, AccA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.require(InputParam); + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + AccA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Modify A before the graph executes + Queue.submit([&](handler &cgh) { + cgh.require(AccA); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + // Write a different value than above, this should be overwritten when + // the graph executes. + AccA[i] = i * 3; + } + }); + }); + + // BufA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap BufB to be the input + InputParam.update(BufB.get_access()); + ExecGraph.update(KernelNode); + + // Modify B before the graph executes + Queue.submit([&](handler &cgh) { + cgh.require(AccB); + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + // Write a different value than above, this should be overwritten when + // the graph executes. + AccB[i] = i * 3; + } + }); + }); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..8109bf59af6e8 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_multiple_exec_graphs.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests creating multiple executable graphs from the same modifiable graph and +// only updating one of them. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA values should be modified twice + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Only update ExecGraph, which should now modify PtrB while ExecGraph2 + // modifies PtrA still + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + // A should have been modified 3 times by now, B only once + assert(HostDataA[i] == i * 3); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp new file mode 100644 index 0000000000000..b4bccb00b6b11 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp @@ -0,0 +1,75 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests that updating a graph is ordered with respect to previous executions of +// the graph which may be in flight. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + // Use a large N to try and make the kernel slow + const size_t N = 1 << 16; + // Loop inside kernel to make even slower (too large N runs out of memory) + const size_t NumKernelLoops = 4; + const size_t NumSubmitLoops = 8; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t j = 0; j < NumKernelLoops; j++) { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Submit a bunch of graphs without waiting + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + + // Submit another set of graphs then wait on all submissions + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops); + assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp new file mode 100644 index 0000000000000..22d92b17cd819 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr.cpp @@ -0,0 +1,67 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp new file mode 100644 index 0000000000000..4bb4ee7666658 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_double_update.cpp @@ -0,0 +1,77 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrUnused = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataUnused(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + Queue.memset(PtrUnused, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + assert(HostDataUnused[i] == 0); + } + + // Swap PtrUnused to be the input, then swap to PtrB without executing + InputParam.update(PtrUnused); + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + // Check that PtrUnused was never actually used in a kernel + assert(HostDataUnused[i] == 0); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp new file mode 100644 index 0000000000000..9568e943c8f2b --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp new file mode 100644 index 0000000000000..2050a3fffc766 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_nodes_different_indices.cpp @@ -0,0 +1,84 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes where it has a different argument index in each node + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + + auto KernelNodeA = Graph.add([&](handler &cgh) { + // Arg index is 1 here + cgh.set_arg(1, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrB[i] = 0; + PtrA[i] = i; + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + // Arg index is 0 here + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp new file mode 100644 index 0000000000000..2eb98ae3e601e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_multiple_params.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating multiple parameters to a singlegraph node using index-based +// explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + std::vector OutData(N); + + std::iota(HostDataA.begin(), HostDataA.end(), 10); + std::iota(HostDataB.begin(), HostDataB.end(), 100); + + Queue.memcpy(PtrA, HostDataA.data(), N * sizeof(int)).wait(); + Queue.memcpy(PtrB, HostDataB.data(), N * sizeof(int)).wait(); + Queue.memset(PtrC, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter ParamA(Graph, PtrA); + exp_ext::dynamic_parameter ParamB(Graph, PtrB); + exp_ext::dynamic_parameter ParamOut(Graph, PtrC); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, ParamOut); + cgh.set_arg(1, ParamA); + cgh.set_arg(2, ParamB); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.parallel_for(range<1>{Size}, [=](item<1> Item) { + size_t ID = Item.get_id(); + PtrC[ID] += PtrA[ID] * PtrB[ID]; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrC, OutData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(OutData[i] == HostDataC[i] + (HostDataA[i] * HostDataB[i])); + } + + // Update C's host data + HostDataC = OutData; + + // Swap PtrB to be the input + ParamOut.update(PtrB); + ParamB.update(PtrC); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrB, OutData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(OutData[i] == HostDataB[i] + (HostDataA[i] * HostDataC[i])); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp new file mode 100644 index 0000000000000..a8a4564f3fd52 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_subgraph.cpp @@ -0,0 +1,77 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node in an executable graph that was used as a +// subgraph node in another executable graph is not reflected in the graph +// containing the subgraph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(SubGraph, PtrA); + + auto SubKernelNode = SubGraph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] += i; + } + }); + }); + + auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updatable{}); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }); + + Graph.add([&](handler &cgh) { cgh.ext_oneapi_graph(SubExecGraph); }, + exp_ext::property::node::depends_on{KernelNode}); + + // Finalize the parent graph with the original values + auto ExecGraph = Graph.finalize(); + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Update the executable graph that was used as a subgraph with the new value, + // this should not affect ExecGraph + SubExecGraph.update(SubKernelNode); + // Only PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp new file mode 100644 index 0000000000000..5a00aed70f2a1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_with_indices_scalar.cpp @@ -0,0 +1,62 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// UNSUPPORTED: opencl, level_zero + +// Tests updating a graph node scalar argument using index-based explicit update + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *DeviceData = malloc_device(N, Queue); + + int ScalarValue = 17; + + std::vector HostData(N); + + Queue.memset(DeviceData, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, ScalarValue); + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(1, InputParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + cgh.single_task([=]() { + for (size_t i = 0; i < N; i++) { + DeviceData[i] = ScalarValue; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // DeviceData should be filled with current ScalarValue (17) + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == 17); + } + + // Update ScalarValue to be 99 instead + InputParam.update(99); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == 99); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg index 9c0c4cc846295..f01e2216db41b 100644 --- a/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg +++ b/sycl/test-e2e/Graph/ValidUsage/lit.local.cfg @@ -1 +1 @@ -config.required_features += ['aspect-ext_oneapi_graph'] +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index ec83769469dcd..0a7db9e19498c 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -98,7 +98,10 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 975e3315c0197..ab85eb32b8ce5 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -98,7 +98,10 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 336e1cd3cdd8e..2ebc6b56078a4 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -97,7 +97,10 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 1294e7ae831cf..6198c8aeb5832 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -98,7 +98,10 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index fa7c7a2dc0525..86860b50e57b6 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -81,8 +81,8 @@ piSamplerGetInfo piSamplerRelease piSamplerRetain piTearDown -piextCommandBufferAdviseUSM piextBindlessImageSamplerCreate +piextCommandBufferAdviseUSM piextCommandBufferCreate piextCommandBufferFillUSM piextCommandBufferFinalize @@ -97,7 +97,10 @@ piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferPrefetchUSM piextCommandBufferRelease +piextCommandBufferReleaseCommand piextCommandBufferRetain +piextCommandBufferRetainCommand +piextCommandBufferUpdateKernelLaunch piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index dffee1588a04a..8a5cbaa9bbac2 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3721,16 +3721,28 @@ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_ _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKSt6vectorISt4byteSaIS9_EE _ZN4sycl3_V13ext6oneapi12experimental33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi1EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi2EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi3EEEvNS0_5rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi1EEEvNS0_8nd_rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi2EEEvNS0_8nd_rangeIXT_EEE +_ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi3EEEvNS0_8nd_rangeIXT_EEE _ZN4sycl3_V13ext6oneapi12experimental4node19get_node_from_eventENS0_5eventE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev _ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_ +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKvm +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPKNS0_6detail16AccessorBaseHostE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph12finalizeImplEv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_13command_graphILNS3_11graph_stateE0EEE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_4nodeE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERKSt6vectorINS0_5queueESaIS7_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingEv @@ -4033,6 +4045,7 @@ _ZN4sycl3_V16detail28SampledImageAccessorBaseHost6getPtrEv _ZN4sycl3_V16detail28SampledImageAccessorBaseHostC1ENS0_5rangeILi3EEEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderENS0_13image_samplerERKNS0_13property_listE _ZN4sycl3_V16detail28SampledImageAccessorBaseHostC2ENS0_5rangeILi3EEEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderENS0_13image_samplerERKNS0_13property_listE _ZN4sycl3_V16detail28getPixelCoordNearestFiltModeENS0_3vecIfLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE +_ZN4sycl3_V16detail28getValueFromDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseE _ZN4sycl3_V16detail2pi25contextSetExtendedDeleterERKNS0_7contextEPFvPvES6_ _ZN4sycl3_V16detail2pi3dieEPKc _ZN4sycl3_V16detail2pi9assertionEbPKc @@ -4118,6 +4131,7 @@ _ZN4sycl3_V17handler10mem_adviseEPKvmi _ZN4sycl3_V17handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE _ZN4sycl3_V17handler13getKernelNameB5cxx11Ev +_ZN4sycl3_V17handler14setNDRangeUsedEb _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_ _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleEPvRKNS4_16image_descriptorE _ZN4sycl3_V17handler15ext_oneapi_copyEPvNS0_3ext6oneapi12experimental16image_mem_handleERKNS5_16image_descriptorE @@ -4152,6 +4166,7 @@ _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm +_ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler27computeFallbackKernelBoundsEmm diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 0ed3998b6eb55..daf42c02be9ec 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -508,6 +508,9 @@ ??0device_image_plain@detail@_V1@sycl@@QEAA@AEBV?$shared_ptr@Vdevice_image_impl@detail@_V1@sycl@@@std@@@Z ??0device_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0device_selector@_V1@sycl@@QEAA@XZ +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z @@ -534,7 +537,7 @@ ??0exception_list@_V1@sycl@@QEAA@$$QEAV012@@Z ??0exception_list@_V1@sycl@@QEAA@AEBV012@@Z ??0exception_list@_V1@sycl@@QEAA@XZ -??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@@Z +??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@AEBVproperty_list@56@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0filter_selector@ONEAPI@_V1@sycl@@QEAA@$$QEAV0123@@Z @@ -666,6 +669,7 @@ ??1device@_V1@sycl@@QEAA@XZ ??1device_image_plain@detail@_V1@sycl@@QEAA@XZ ??1device_selector@_V1@sycl@@UEAA@XZ +??1dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1event@_V1@sycl@@QEAA@XZ ??1exception@_V1@sycl@@UEAA@XZ ??1exception_list@_V1@sycl@@QEAA@XZ @@ -742,6 +746,8 @@ ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4device_selector@_V1@sycl@@QEAAAEAV012@AEBV012@@Z +??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@$$QEAV0123456@@Z +??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@AEBV0123456@@Z ??4event@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4event@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4exception@_V1@sycl@@QEAAAEAV012@AEBV012@@Z @@ -1018,8 +1024,8 @@ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z -?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4arch_category@experimental@oneapi@ext@23@@Z +?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ ?ext_oneapi_can_access_peer@device@_V1@sycl@@QEAA_NAEBV123@W4peer_access@oneapi@ext@23@@Z @@ -1234,6 +1240,7 @@ ?getType@handler@_V1@sycl@@AEAA?AW4CGTYPE@CG@detail@23@XZ ?getType@image_impl@detail@_V1@sycl@@UEBA?AW4MemObjType@SYCLMemObjI@234@XZ ?getUserPtr@SYCLMemObjT@detail@_V1@sycl@@QEBAPEAXXZ +?getValueFromDynamicParameter@detail@_V1@sycl@@YAPEAXAEAVdynamic_parameter_base@1experimental@oneapi@ext@23@@Z ?get_addressing_mode@sampler@_V1@sycl@@QEBA?AW4addressing_mode@23@XZ ?get_addressing_mode@sampler_impl@detail@_V1@sycl@@QEBA?AW4addressing_mode@34@XZ ?get_allocator_internal@SYCLMemObjT@detail@_V1@sycl@@QEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ @@ -1472,6 +1479,7 @@ ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z +?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z ?release@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAVSYCLMemObjI@234@PEAXV?$vector@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@V?$allocator@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@2@@6@AEAPEAU_pi_event@@@Z ?releaseHostMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXPEAX@Z ?releaseMem@SYCLMemObjT@detail@_V1@sycl@@UEAAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAX@Z @@ -1504,6 +1512,7 @@ ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4_pi_kernel_cache_config@@@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z +?setNDRangeUsed@handler@_V1@sycl@@AEAAX_N@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ ?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ @@ -5134,8 +5143,12 @@ ?unsampledImageDestructorNotification@image_plain@detail@_V1@sycl@@IEAAXPEAX@Z ?unset_flag@stream@_V1@sycl@@AEBAXI@Z ?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$command_graph@$0A@@34567@@Z +?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +?update@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBVnode@34567@@Z +?updateAccessor@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBVAccessorBaseHost@267@@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXQEAX@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXXZ +?updateValue@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXPEBX_K@Z ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z ?usesPinnedHostMemory@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ diff --git a/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt b/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt index 2232ce4abb54f..b7213d2eaeae6 100644 --- a/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt +++ b/sycl/unittests/Extensions/CommandGraph/CMakeLists.txt @@ -9,4 +9,5 @@ add_sycl_unittest(CommandGraphExtensionTests OBJECT Queries.cpp Regressions.cpp Subgraph.cpp + Update.cpp ) diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp new file mode 100644 index 0000000000000..92246fb83678d --- /dev/null +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -0,0 +1,152 @@ +//==----------------------------- Update.cpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Common.hpp" + +using namespace sycl; +using namespace sycl::ext::oneapi; + +TEST_F(CommandGraphTest, UpdatableException) { + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraphUpdatable = + Graph.finalize(experimental::property::graph::updatable{}); + + EXPECT_NO_THROW(ExecGraphUpdatable.update(Node)); + + auto ExecGraphNoUpdatable = Graph.finalize(); + + // Graph without the property should throw + EXPECT_ANY_THROW(ExecGraphNoUpdatable.update(Node)); +} + +TEST_F(CommandGraphTest, DynamicParamRegister) { + // Check that registering a dynamic param with a node from a graph that was + // not passed to its constructor throws. + experimental::dynamic_parameter DynamicParam(Graph, int{}); + + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto Node = OtherGraph.add([&](sycl::handler &cgh) { + // This should throw since OtherGraph is not associated with DynamicParam + EXPECT_ANY_THROW(cgh.set_arg(0, DynamicParam)); + cgh.single_task>([]() {}); + }); +} + +TEST_F(CommandGraphTest, UpdateNodeNotInGraph) { + // Check that updating a graph with a node which is not part of that graph is + // an error. + + auto OtherGraph = + experimental::command_graph(Queue.get_context(), Queue.get_device()); + auto OtherNode = OtherGraph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); + EXPECT_ANY_THROW(ExecGraph.update(OtherNode)); +} + +TEST_F(CommandGraphTest, UpdateWithUnchangedNode) { + // Tests that updating a graph with a node with unchanged + // parameters is not an error + + auto Node = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto ExecGraph = Graph.finalize(experimental::property::graph::updatable{}); + EXPECT_NO_THROW(ExecGraph.update(Node)); +} + +TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { + // Check that registering a dynamic parameter with various node types either + // throws or does not throw as appropriate + + // Allocate some pointers for memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + experimental::dynamic_parameter DynamicParam{Graph, int{}}; + + ASSERT_NO_THROW(auto NodeKernel = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.single_task>([]() {}); + })); + + ASSERT_ANY_THROW(auto NodeMemcpy = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemset = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.memset(PtrB, 7, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemfill = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.fill(PtrB, 7, 16); + })); + + ASSERT_ANY_THROW(auto NodePrefetch = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.prefetch(PtrA, 16 * sizeof(int)); + })); + + ASSERT_ANY_THROW(auto NodeMemadvise = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.mem_advise(PtrA, 16 * sizeof(int), 1); + })); + + ASSERT_ANY_THROW(auto NodeHostTask = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.host_task([]() {}); + })); + + auto NodeEmpty = Graph.add(); + + experimental::command_graph Subgraph(Queue.get_context(), Dev); + // Add an empty node to the subgraph + Subgraph.add(); + + auto SubgraphExec = Subgraph.finalize(); + ASSERT_ANY_THROW(auto NodeSubgraph = Graph.add([&](sycl::handler &cgh) { + cgh.set_arg(0, DynamicParam); + cgh.ext_oneapi_graph(SubgraphExec); + })); +} + +TEST_F(CommandGraphTest, UpdateRangeErrors) { + // Test that the correct errors are throw when trying to update node ranges + + nd_range<1> NDRange{range{128}, range{32}}; + range<1> Range{128}; + auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(NDRange, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); + // Can't update an nd_range node with a range + EXPECT_ANY_THROW(NodeNDRange.update_range(Range)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeNDRange.update_nd_range( + nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); + + auto NodeRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(range<1>{128}, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeRange.update_range(Range)); + // Can't update a range node with an nd_range + EXPECT_ANY_THROW(NodeRange.update_nd_range(NDRange)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index be777f23df239..a0f267bd97d50 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -207,7 +207,9 @@ inline pi_result mock_piDeviceGetInfo(pi_device device, case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: case PI_DEVICE_INFO_AVAILABLE: case PI_DEVICE_INFO_LINKER_AVAILABLE: - case PI_DEVICE_INFO_COMPILER_AVAILABLE: { + case PI_DEVICE_INFO_COMPILER_AVAILABLE: + case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT: + case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT: { if (param_value) *static_cast(param_value) = PI_TRUE; if (param_value_size_ret) @@ -1322,7 +1324,7 @@ inline pi_result mock_piextCommandBufferNDRangeKernel( const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, - pi_ext_sync_point *sync_point) { + pi_ext_sync_point *sync_point, pi_ext_command_buffer_command *command) { return PI_SUCCESS; } @@ -1379,6 +1381,22 @@ inline pi_result mock_piextEnqueueCommandBuffer( return PI_SUCCESS; } +inline pi_result mock_piextCommandBufferUpdateKernelLaunch( + pi_ext_command_buffer_command Command, + pi_ext_command_buffer_update_kernel_launch_desc *Desc) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferRetainCommand(pi_ext_command_buffer_command Command) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { + return PI_SUCCESS; +} + inline pi_result mock_piextCommandBufferMemBufferCopy( pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size,