diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 248d5446d9dca..9519067a00484 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -282,6 +282,28 @@ requirements for these new accessors to correctly trigger allocations before updating. This is similar to how individual graph commands are enqueued when accessors are used in a graph node. +## Optimizations +### Interactions with Profiling + +Enabling profiling on a graph may disable optimizations from being performed on +the graph if they are incompatible with profiling. For example, enabling +profiling prevents the in-order optimization since the removal of events would +prevent collecting profiling information. + +### In-Order Graph Partitions + +On finalization graph partitions are checked to see if they are in-order, i.e. +the graph follows a single path where each node depends on the previous node. If +so a hint is provided to the backend that it may create the command-buffers in +an in-order fashion. Support for this is backend specific but it may provide +benefits through the removal of the need for synchronization primitives between +kernels. + +This optimization is only performed in this very limited case where it can be +safely assumed to be more performant. It is not likely we'll try to allow +in-order execution in more scenarios through a complicated (and imperfect) +heuristic but rather expose this as a hint the user can provide. + ## Backend Implementation Implementation of UR command-buffers for each of the supported SYCL 2020 diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index ec2bfabfdf0f9..e28e7cdd51cad 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -814,6 +814,25 @@ when passed on finalization of a modifiable `command_graph`. For further information see <>. +==== Enable-Profiling Property [[enable-profiling]] + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental::property::graph { +class enable_profiling { + public: + enable_profiling() = default; +}; +} +---- + +The `property::graph::enable_profiling` property enables profiling events +returned from submissions of the executable graph. Passing this property +implies disabling certain optimizations. As a result, the execution time of a +graph finalized with profiling enabled is longer than that of a graph without +profiling capability. An error will be thrown when attempting to profile an +event from a graph submission that was created without this property. + ==== Graph Member Functions Table {counter: tableNumber}. Constructor of the `command_graph` class. @@ -1345,9 +1364,12 @@ ways: 2. `property::queue::enable_profiling` - This property has no effect on graph recording. When set on the queue a graph is submitted to however, it allows profiling information to be obtained from the event returned by a graph - submission. As it is not defined how a submitted graph will be split up for - scheduling at runtime, the `uint64_t` timestamp reported from a profiling - query on a graph execution event has the following semantics, which may be + submission. The executable graph used for this submission must have been + created with the `enable_profiling` property, see + <> for more details. As it is not + defined how a submitted graph will be split up for scheduling at runtime, + the `uint64_t` timestamp reported from a profiling query on a graph + execution event has the following semantics, which may be pessimistic about execution time on device. * `info::event_profiling::command_submit` - Timestamp when the graph is diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 6e7c5bfbf1669..a134e7f1a9885 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2358,6 +2358,8 @@ typedef enum { struct pi_ext_command_buffer_desc final { pi_ext_structure_type stype; const void *pNext; + pi_bool is_in_order; + pi_bool enable_profiling; pi_bool is_updatable; }; diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index f438b5098065e..544e776a21e1e 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -48,8 +48,9 @@ enum DataLessPropKind { GraphAssumeBufferOutlivesGraph = 23, GraphDependOnAllLeaves = 24, GraphUpdatable = 25, + GraphEnableProfiling = 26, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 25, + LastKnownDataLessPropKind = 26, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 4e9f4c103f945..4521da77c1839 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -167,6 +167,14 @@ class updatable public: updatable() = default; }; + +/// Property used to enable executable graph profiling. Enables profiling on +/// events returned by submissions of the executable graph +class enable_profiling : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphEnableProfiling> { +public: + enable_profiling() = default; +}; } // namespace graph namespace node { diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 1ba5093a8edc5..a3040a61e9c69 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4496,8 +4496,8 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_device_handle_t UrDevice = reinterpret_cast(Device); ur_exp_command_buffer_desc_t UrDesc; UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; - UrDesc.isInOrder = ur_bool_t(false); - UrDesc.enableProfiling = ur_bool_t(true); + UrDesc.isInOrder = ur_bool_t(Desc->is_in_order); + UrDesc.enableProfiling = ur_bool_t(Desc->enable_profiling); UrDesc.isUpdatable = Desc->is_updatable; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer); diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index db275248578da..9c9a941ba7fb1 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -321,6 +321,8 @@ class event_impl { return MEventFromSubmittedExecCommandBuffer; } + void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; } + // Sets a command-buffer command when this event represents an enqueue to a // Command Buffer. void diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4cf1c683156c1..025fde4904a9a 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -261,6 +261,7 @@ void exec_graph_impl::makePartitions() { } if (Partition->MRoots.size() > 0) { Partition->schedule(); + Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath(); MPartitions.push_back(Partition); PartitionFinalNum++; } @@ -698,7 +699,9 @@ void exec_graph_impl::createCommandBuffers( sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; sycl::detail::pi::PiExtCommandBufferDesc Desc{ pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, - MIsUpdatable}; + pi_bool(Partition->MIsInOrderGraph && !MEnableProfiling), + pi_bool(MEnableProfiling), pi_bool(MIsUpdatable)}; + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); @@ -760,7 +763,9 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(), MExecutionEvents(), - MIsUpdatable(PropList.has_property()) { + MIsUpdatable(PropList.has_property()), + MEnableProfiling( + PropList.has_property()) { // If the graph has been marked as updatable then check if the backend // actually supports that. Devices supporting aspect::ext_oneapi_graph must @@ -999,6 +1004,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, NewEvent->attachEventToComplete(Elem.second); } } + NewEvent->setProfilingEnabled(MEnableProfiling); sycl::event QueueEvent = sycl::detail::createSyclObjFromImpl(NewEvent); return QueueEvent; diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 9d8604316e8d5..fdd3dcbea2a85 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -383,6 +383,21 @@ class node_impl { } } + /// Test if the node contains a N-D copy + /// @return true if the op is a N-D copy + bool isNDCopyNode() const { + if ((MCGType != sycl::detail::CG::CGTYPE::CopyAccToAcc) && + (MCGType != sycl::detail::CG::CGTYPE::CopyAccToPtr) && + (MCGType != sycl::detail::CG::CGTYPE::CopyPtrToAcc)) { + return false; + } + + auto Copy = static_cast(MCommandGroup.get()); + auto ReqSrc = static_cast(Copy->getSrc()); + auto ReqDst = static_cast(Copy->getDst()); + return (ReqSrc->MDims > 1) || (ReqDst->MDims > 1); + } + /// Update the value of an accessor inside this node. Accessors must be /// handled specifically compared to other argument values. /// @param ArgIndex The index of the accessor arg to be updated @@ -779,6 +794,9 @@ class partition { MPiCommandBuffers; /// List of predecessors to this partition. std::vector> MPredecessors; + /// True if the graph of this partition is a single path graph + /// and in-order optmization can be applied on it. + bool MIsInOrderGraph = false; /// @return True if the partition contains a host task bool isHostTask() const { @@ -786,6 +804,24 @@ class partition { sycl::detail::CG::CGTYPE::CodeplayHostTask)); } + /// Checks if the graph is single path, i.e. each node has a single successor. + /// @return True if the graph is a single path + bool checkIfGraphIsSinglePath() { + if (MRoots.size() > 1) { + return false; + } + for (const auto &Node : MSchedule) { + // In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not + // be enqueued in an in-order cmd-list (causing execution to stall). + // The 2D Copy test should be removed from here when the bug is fixed. + if ((Node->MSuccessors.size() > 1) || (Node->isNDCopyNode())) { + return false; + } + } + + return true; + } + /// Add nodes to MSchedule. void schedule(); }; @@ -1435,6 +1471,8 @@ class exec_graph_impl { MCommandMap; /// True if this graph can be updated (set with property::updatable) bool MIsUpdatable; + /// If true, the graph profiling is enabled. + bool MEnableProfiling; // Stores a cache of node ids from modifiable graph nodes to the companion // node(s) in this graph. Used for quick access when updating this graph. diff --git a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp index a0abf0e9e9961..af4bb747bb919 100644 --- a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp +++ b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp @@ -130,8 +130,10 @@ int main() { KernelGraph.end_recording(Queue); - auto CopyGraphExec = CopyGraph.finalize(); - auto KernelGraphExec = KernelGraph.finalize(); + auto CopyGraphExec = + CopyGraph.finalize({exp_ext::property::graph::enable_profiling{}}); + auto KernelGraphExec = + KernelGraph.finalize({exp_ext::property::graph::enable_profiling{}}); event CopyEvent, KernelEvent1, KernelEvent2; // Run graphs diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index dd687551355e1..1477d267bf9a1 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -534,3 +534,31 @@ TEST_F(CommandGraphTest, ProfilingException) { std::string::npos); } } + +TEST_F(CommandGraphTest, ProfilingExceptionProperty) { + Graph.begin_recording(Queue); + auto Event1 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(Queue); + + // Checks exception thrown if profiling is requested while profiling has + // not been enabled during the graph building. + auto GraphExecInOrder = Graph.finalize(); + queue QueueProfile{Dev, {sycl::property::queue::enable_profiling()}}; + auto EventInOrder = QueueProfile.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecInOrder); }); + QueueProfile.wait_and_throw(); + bool Success = true; + try { + EventInOrder + .get_profiling_info(); + } catch (sycl::exception &Exception) { + ASSERT_FALSE(std::string(Exception.what()) + .find("Profiling information is unavailable as the queue " + "associated with the event does not have the " + "'enable_profiling' property.") == + std::string::npos); + Success = false; + } + ASSERT_EQ(Success, false); +}