From f8f00613ceb9ced117e0b0fb27da63440e73ad8a Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Mar 2024 18:49:24 +0000 Subject: [PATCH 1/8] [SYCL][Graph] Enable in-order cmd-list Analyze the graph and apply enable the use of in-order command-list for linear graph. Add a property to finalize function to disable this optimization which is not compatible with profiling. Update the specification. Co-authored-by: Maxime France-Pillois --- .../sycl_ext_oneapi_graph.asciidoc | 40 ++++++++++++-- sycl/include/sycl/detail/pi.h | 3 +- sycl/include/sycl/detail/property_helper.hpp | 3 +- .../sycl/ext/oneapi/experimental/graph.hpp | 33 ++++++++--- sycl/plugins/unified_runtime/CMakeLists.txt | 10 +--- sycl/plugins/unified_runtime/pi2ur.hpp | 9 ++- sycl/source/detail/event_impl.hpp | 2 + sycl/source/detail/graph_impl.cpp | 33 ++++++++--- sycl/source/detail/graph_impl.hpp | 55 ++++++++++++++++++- sycl/source/handler.cpp | 2 +- .../Graph/Profiling/event_profiling_info.cpp | 21 +++---- sycl/test/abi/sycl_symbols_linux.dump | 8 +-- sycl/test/abi/sycl_symbols_windows.dump | 8 +-- .../Extensions/CommandGraph/Exceptions.cpp | 28 ++++++++++ 14 files changed, 201 insertions(+), 54 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 637a8adc8c8f4..9c7d0561eccaf 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -443,6 +443,31 @@ The API for explicitly adding nodes to a `command_graph` includes a property to be passed here. `depends_on_all_leaves` provides a shortcut for adding all the current leaves of a graph as dependencies. +==== Enable-Profiling Property [[enable-profiling]] +[source,c++] +---- +namespace sycl::ext::oneapi::experimental::property::node { +class enable_profiling { + public: + enable_profiling() = default; +}; +} +---- + +The `property::graph::enable_profiling` property can be passed to a +`command_graph::add()` function and enables profiling support +for the node in the `command_graph`. +Passing this property implies disabling certain optimizations. +This is why profiling is by default disabled on graphs, unless users +explicitly require it using either the `property::graph::enable_profiling` +property in building mode or the `property::queue::enable_profiling` on +the recorded queue (Record&Replay API). +As a result, the execution time of a graph finalized with profiling enabled +is longer than that of a graph without profiling capability. +An error will be thrown when attempting to profile an event +from a graph submission that was created without this property. + + === Graph [source, c++] @@ -681,6 +706,8 @@ Parameters: * `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. The `property::node::depends_on` property can be passed here with a list of nodes to create dependency edges on. + The `enable_profiling` property enables the profiling of this node. + See <> for more details. Returns: The empty node which has been added to the graph. @@ -718,6 +745,9 @@ Parameters: * `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. The `property::node::depends_on` property can be passed here with a list of nodes to create dependency edges on. + The `enable_profiling` property enables the profiling of this node. + See <> for more details. + Returns: The command-group function object node which has been added to the graph. @@ -1043,10 +1073,12 @@ ways: an implicit dependency before and after the graph execution, as if the graph execution is one command-group submitted to the in-order queue. -2. `property::queue::enable_profiling` - This property has no effect on graph - recording. When set on the queue a graph is submitted to however, it allows - profiling information to be obtained from the event returned by a graph - submission. As it is not defined how a submitted graph will be split up for +2. `property::queue::enable_profiling` - This property must be set on the queue + in recording mode if users want to profile the commands recorded to + the graph.This property must also be set on the queue the queue a graph is + submitted to. It allows profiling information to be obtained from the event + returned by a graph submission. + As it is not defined how a submitted graph will be split up for scheduling at runtime, the `uint64_t` timestamp reported from a profiling query on a graph execution event has the following semantics, which may be pessimistic about execution time on device. diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 3c9076e09f66b..74239829cf7a3 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2330,7 +2330,8 @@ typedef enum { struct pi_ext_command_buffer_desc final { pi_ext_structure_type stype; const void *pNext; - pi_queue_properties *properties; + pi_bool is_in_order; + pi_bool enable_profiling; }; /// API to create a command-buffer. diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 3009af8ee2890..7743f1e5a370a 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -47,8 +47,9 @@ enum DataLessPropKind { GraphAssumeDataOutlivesBuffer = 22, GraphAssumeBufferOutlivesGraph = 23, GraphDependOnAllLeaves = 24, + GraphEnableProfiling = 25, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 24, + LastKnownDataLessPropKind = 25, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 209a0ed25f72f..3028c8767e13b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -174,6 +174,15 @@ class depends_on_all_leaves : public ::sycl::detail::DataLessProperty< depends_on_all_leaves() = default; }; +/// Property used to enable node profiling. +/// Passing this property to the `command_graph::add()` function +/// ensures that profiling can be queried on this node. +class enable_profiling : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphEnableProfiling> { +public: + enable_profiling() = default; +}; + } // namespace node } // namespace property @@ -200,15 +209,17 @@ class __SYCL_EXPORT modifiable_command_graph { /// @param PropList Property list used to pass [0..n] predecessor nodes. /// @return Constructed empty node which has been added to the graph. node add(const property_list &PropList = {}) { + bool EnableProfiling = + PropList.has_property(); if (PropList.has_property()) { auto Deps = PropList.get_property(); - node Node = addImpl(Deps.get_dependencies()); + node Node = addImpl(Deps.get_dependencies(), EnableProfiling); if (PropList.has_property()) { addGraphLeafDependencies(Node); } return Node; } - node Node = addImpl({}); + node Node = addImpl({}, EnableProfiling); if (PropList.has_property()) { addGraphLeafDependencies(Node); } @@ -220,15 +231,17 @@ class __SYCL_EXPORT modifiable_command_graph { /// @param PropList Property list used to pass [0..n] predecessor nodes. /// @return Constructed node which has been added to the graph. template node add(T CGF, const property_list &PropList = {}) { + bool EnableProfiling = + PropList.has_property(); if (PropList.has_property()) { auto Deps = PropList.get_property(); - node Node = addImpl(CGF, Deps.get_dependencies()); + node Node = addImpl(CGF, Deps.get_dependencies(), EnableProfiling); if (PropList.has_property()) { addGraphLeafDependencies(Node); } return Node; } - node Node = addImpl(CGF, {}); + node Node = addImpl(CGF, {}, EnableProfiling); if (PropList.has_property()) { addGraphLeafDependencies(Node); } @@ -303,14 +316,16 @@ class __SYCL_EXPORT modifiable_command_graph { /// Template-less implementation of add() for CGF nodes. /// @param CGF Command-group function to add. /// @param Dep List of predecessor nodes. + /// @param EnableProfiling Enable node profiling. /// @return Node added to the graph. - node addImpl(std::function CGF, - const std::vector &Dep); + node addImpl(std::function CGF, const std::vector &Dep, + const bool EnableProfiling); /// Template-less implementation of add() for empty nodes. /// @param Dep List of predecessor nodes. + /// @param EnableProfiling Enable node profiling. /// @return Node added to the graph. - node addImpl(const std::vector &Dep); + node addImpl(const std::vector &Dep, const bool EnableProfiling); /// Adds all graph leaves as dependencies /// @param Node Destination node to which the leaves of the graph will be @@ -340,8 +355,10 @@ class __SYCL_EXPORT executable_command_graph { /// Constructor used by internal runtime. /// @param Graph Detail implementation class to construct with. /// @param Ctx Context to use for graph. + /// @param PropList Optional list of properties to pass. executable_command_graph(const std::shared_ptr &Graph, - const sycl::context &Ctx); + const sycl::context &Ctx, + const property_list &PropList = {}); template friend decltype(Obj::impl) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 32ebe3784bf2f..7fce668d27bda 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -81,14 +81,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit c98fdbcf1f43ce132fbae75336bda984e4ce2e78 - # Merge: 5f4dd113 9b3cf9d3 - # Author: Kenneth Benzie (Benie) - # Date: Thu Mar 21 10:51:45 2024 +0000 - # Merge pull request #1439 from nrspruit/fix_device_native_proxy_buffer - # [L0] Fix Native Host memory usage on device with copy back sync - set(UNIFIED_RUNTIME_TAG c98fdbcf1f43ce132fbae75336bda984e4ce2e78) + set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG 8718fe1e553ea7cda05a0df2f4ef72acc5ce7397) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 14b7b4723c0dc..6cc3f5d0b423f 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4479,13 +4479,16 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_device_handle_t UrDevice = reinterpret_cast(Device); - const ur_exp_command_buffer_desc_t *UrDesc = - reinterpret_cast(Desc); + ur_exp_command_buffer_desc_t UrDesc; + UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + UrDesc.isInOrder = Desc->is_in_order; + UrDesc.enableProfiling = Desc->enable_profiling; + UrDesc.isUpdatable = false; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer); HANDLE_ERRORS( - urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer)); + urCommandBufferCreateExp(UrContext, UrDevice, &UrDesc, UrCommandBuffer)); return PI_SUCCESS; } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 4a7467691127c..27cbb1df41428 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -300,6 +300,8 @@ class event_impl { return MEventFromSubmittedExecCommandBuffer; } + void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; } + const std::vector &getPostCompleteEvents() const { return MPostCompleteEvents; } diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index a318ed97d0abd..21616fd2749c7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -188,6 +188,13 @@ void exec_graph_impl::makePartitions() { if (Node->MCGType == sycl::detail::CG::CodeplayHostTask) { HostTaskList.push_back(Node); } + // Next line is supposed to be temporary. + // Nodes are not profiled individually, but the profiling of the whole graph + // is enabled if at least one node has profiling enabled. This should be + // changed once the PR https://github.com/intel/llvm/pull/12592 on node + // profiling is merged. This also will involve updating all the UR enqueue + // cmd functions to add a new parameter containing the profiling status. + MEnableProfiling |= Node->MProfilingEnabled; } // Annotate nodes @@ -261,6 +268,7 @@ void exec_graph_impl::makePartitions() { } if (Partition->MRoots.size() > 0) { Partition->schedule(); + Partition->checkIfGraphIsSinglePath(); MPartitions.push_back(Partition); PartitionFinalNum++; } @@ -676,7 +684,10 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode( void exec_graph_impl::createCommandBuffers( sycl::device Device, std::shared_ptr &Partition) { sycl::detail::pi::PiExtCommandBuffer OutCommandBuffer; - sycl::detail::pi::PiExtCommandBufferDesc Desc{}; + sycl::detail::pi::PiExtCommandBufferDesc Desc{ + pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, + pi_bool(Partition->MIsInOrderGraph && !MEnableProfiling), + pi_bool(MEnableProfiling)}; auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); @@ -946,6 +957,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, NewEvent->attachEventToComplete(Elem.second); } } + NewEvent->setProfilingEnabled(MEnableProfiling); sycl::event QueueEvent = sycl::detail::createSyclObjFromImpl(NewEvent); return QueueEvent; @@ -1106,7 +1118,8 @@ modifiable_command_graph::modifiable_command_graph( : impl(std::make_shared( SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {} -node modifiable_command_graph::addImpl(const std::vector &Deps) { +node modifiable_command_graph::addImpl(const std::vector &Deps, + const bool EnableProfiling) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { @@ -1115,11 +1128,13 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(impl, DepImpls); + NodeImpl->MProfilingEnabled = EnableProfiling; return sycl::detail::createSyclObjFromImpl(NodeImpl); } node modifiable_command_graph::addImpl(std::function CGF, - const std::vector &Deps) { + const std::vector &Deps, + const bool EnableProfiling) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { @@ -1129,6 +1144,7 @@ node modifiable_command_graph::addImpl(std::function CGF, graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(impl, CGF, {}, DepImpls); + NodeImpl->MProfilingEnabled = EnableProfiling; return sycl::detail::createSyclObjFromImpl(NodeImpl); } @@ -1156,12 +1172,12 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) { } command_graph -modifiable_command_graph::finalize(const sycl::property_list &) const { +modifiable_command_graph::finalize(const sycl::property_list &PropList) const { // Graph is read and written in this scope so we lock // this graph with full priviledges. graph_impl::WriteLock Lock(impl->MMutex); - return command_graph{this->impl, - this->impl->getContext()}; + return command_graph{ + this->impl, this->impl->getContext(), PropList}; } bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { @@ -1275,8 +1291,9 @@ std::vector modifiable_command_graph::get_root_nodes() const { } executable_command_graph::executable_command_graph( - const std::shared_ptr &Graph, const sycl::context &Ctx) - : impl(std::make_shared(Ctx, Graph)) { + const std::shared_ptr &Graph, const sycl::context &Ctx, + const property_list &PropList) + : impl(std::make_shared(Ctx, Graph, PropList)) { finalizeImpl(); // Create backend representation for executable graph } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 30cc78c70ab4d..ba2da21ae05de 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -94,6 +94,9 @@ class node_impl { /// Used for tracking visited status during cycle checks. bool MVisited = false; + /// If true, the graph profiling is enabled for this node. + bool MProfilingEnabled = false; + /// Partition number needed to assign a Node to a a partition. /// Note : This number is only used during the partitionning process and /// cannot be used to find out the partion of a node outside of this process. @@ -152,7 +155,8 @@ class node_impl { node_impl(node_impl &Other) : MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), - MCommandGroup(Other.getCGCopy()), MSubGraphImpl(Other.MSubGraphImpl) {} + MCommandGroup(Other.getCGCopy()), MSubGraphImpl(Other.MSubGraphImpl), + MProfilingEnabled(Other.MProfilingEnabled) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -366,6 +370,24 @@ class node_impl { } } + /// Test if the node contains a N-D copy + /// @return true if the op is a N-D copy + bool isNDCopyNode() const { + if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) || + (MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) || + (MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) { + sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get(); + sycl::detail::Requirement *ReqSrc = + (sycl::detail::Requirement *)(Copy->getSrc()); + sycl::detail::Requirement *ReqDst = + (sycl::detail::Requirement *)(Copy->getDst()); + if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) { + return true; + } + } + return false; + } + private: /// Prints Node information to Stream. /// @param Stream Where to print the Node information @@ -558,6 +580,9 @@ class node_impl { Stream << "Other \\n"; break; } + if (MProfilingEnabled) { + Stream << "Profiling Enabled \\n"; + } Stream << "\"];" << std::endl; } @@ -585,6 +610,9 @@ class partition { MPiCommandBuffers; /// List of predecessors to this partition. std::vector> MPredecessors; + /// True if the graph of this partition is a single path graph + /// and in-order optmization can be applied on it. + bool MIsInOrderGraph = false; /// @return True if the partition contains a host task bool isHostTask() const { @@ -592,6 +620,25 @@ class partition { sycl::detail::CG::CGTYPE::CodeplayHostTask)); } + /// Checks if the graph is single path, i.e. each node has a single successor. + /// If so, the MIsInOrderGraph flag is set. + void checkIfGraphIsSinglePath() { + MIsInOrderGraph = true; + if (MRoots.size() > 1) { + MIsInOrderGraph = false; + return; + } + for (const auto &Node : MSchedule) { + // In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not + // be enqueued in an in-order cmd-list (causing execution to stall). + // The 2D Copy test should be removed from here when the bug is fixed. + if ((Node->MSuccessors.size() > 1) || (Node->isNDCopyNode())) { + MIsInOrderGraph = false; + return; + } + } + } + /// Add nodes to MSchedule. void schedule(); }; @@ -1054,8 +1101,10 @@ class exec_graph_impl { /// nodes). /// @param Context Context to create graph with. /// @param GraphImpl Modifiable graph implementation to create with. + /// @param PropList List of properties for constructing this object. exec_graph_impl(sycl::context Context, - const std::shared_ptr &GraphImpl) + const std::shared_ptr &GraphImpl, + const property_list &PropList) : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MContext(Context), MRequirements(), MExecutionEvents() { // Copy nodes from GraphImpl and merge any subgraph nodes into this graph. @@ -1225,6 +1274,8 @@ class exec_graph_impl { std::vector> MPartitions; /// Storage for copies of nodes from the original modifiable graph. std::vector> MNodeStorage; + /// If true, the graph profiling is enabled. + bool MEnableProfiling = false; }; } // namespace detail diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5b40f1b3b07c3..6eb910228223c 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -582,7 +582,7 @@ event handler::finalize() { } else { NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); } - + NodeImpl->MProfilingEnabled = MQueue->MIsProfilingEnabled; // Associate an event with this new node and return the event. GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); diff --git a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp index a0abf0e9e9961..4b4daa7fa3255 100644 --- a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp +++ b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp @@ -81,6 +81,8 @@ bool compareProfiling(event Event1, event Event2) { // event to complete execution. int main() { device Dev; + // The queue on which the graph is recorded must have the `enable_profiling` + // set to enable graph profiling. queue Queue{Dev, {sycl::property::queue::enable_profiling()}}; const size_t Size = 100000; @@ -107,17 +109,16 @@ int main() { Queue.get_context(), Queue.get_device(), {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; - CopyGraph.begin_recording(Queue); - Queue.submit([&](sycl::handler &Cgh) { - accessor AccessorFrom( - BufferFrom, Cgh, range<1>(Size)); - accessor AccessorTo( - BufferTo, Cgh, range<1>(Size)); - Cgh.copy(AccessorFrom, AccessorTo); - }); - - CopyGraph.end_recording(Queue); + CopyGraph.add( + ([&](sycl::handler &Cgh) { + accessor + AccessorFrom(BufferFrom, Cgh, range<1>(Size)); + accessor + AccessorTo(BufferTo, Cgh, range<1>(Size)); + Cgh.copy(AccessorFrom, AccessorTo); + }), + {exp_ext::property::node::enable_profiling{}}); // kernel launch exp_ext::command_graph KernelGraph{ diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index dffee1588a04a..0a89180d8230d 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3729,16 +3729,16 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev _ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_ _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph12finalizeImplEv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_13command_graphILNS3_11graph_stateE0EEE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC1ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graphC2ERKSt10shared_ptrINS4_10graph_implEERKNS0_7contextERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERKSt6vectorINS0_5queueESaIS7_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingERNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_recordingEv _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERKSt6vectorINS0_5queueESaIS7_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph24addGraphLeafDependenciesENS3_4nodeE -_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EE -_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EEb +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EEb _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9make_edgeERNS3_4nodeES7_ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2f180817725d4..250008c3d08f0 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -534,7 +534,7 @@ ??0exception_list@_V1@sycl@@QEAA@$$QEAV012@@Z ??0exception_list@_V1@sycl@@QEAA@AEBV012@@Z ??0exception_list@_V1@sycl@@QEAA@XZ -??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@@Z +??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@56@AEBVproperty_list@56@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0filter_selector@ONEAPI@_V1@sycl@@QEAA@$$QEAV0123@@Z @@ -888,8 +888,8 @@ ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z ?addHostUnsampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVUnsampledImageAccessorImplHost@123@@Z -?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z -?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@_N@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@_N@Z ?addInteropObject@buffer_impl@detail@_V1@sycl@@QEBAXAEAV?$vector@_KV?$allocator@_K@std@@@std@@@Z ?addOrReplaceAccessorProperties@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBVproperty_list@34@@Z ?addOrReplaceAccessorProperties@buffer_plain@detail@_V1@sycl@@IEAAXAEBVproperty_list@34@@Z @@ -1018,8 +1018,8 @@ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z -?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4arch_category@experimental@oneapi@ext@23@@Z +?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ ?ext_oneapi_can_access_peer@device@_V1@sycl@@QEAA_NAEBV123@W4peer_access@oneapi@ext@23@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index dd687551355e1..c4fc154ff6e6f 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -534,3 +534,31 @@ TEST_F(CommandGraphTest, ProfilingException) { std::string::npos); } } + +TEST_F(CommandGraphTest, ProfilingExceptionProperty) { + Graph.begin_recording(Queue); + auto Event1 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(Queue); + + // Checks exception thrown if profiling is requested while profiling has + // not be enabled during the graph building. + auto GraphExecInOrder = Graph.finalize(); + queue QueueProfile{Dev, {sycl::property::queue::enable_profiling()}}; + auto EventInOrder = QueueProfile.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecInOrder); }); + QueueProfile.wait_and_throw(); + bool Success = true; + try { + EventInOrder + .get_profiling_info(); + } catch (sycl::exception &Exception) { + ASSERT_FALSE(std::string(Exception.what()) + .find("Profiling information is unavailable as the queue " + "associated with the event does not have the " + "'enable_profiling' property.") == + std::string::npos); + Success = false; + } + ASSERT_EQ(Success, false); +} From c223a3aa3a98169410c4d5007ad0a1ca4b45c5d4 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 21 Mar 2024 18:17:28 +0000 Subject: [PATCH 2/8] Update UR tag, address spec comments --- .../sycl_ext_oneapi_graph.asciidoc | 22 +++++++++---------- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 9c7d0561eccaf..b137b7b5ee02e 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -454,18 +454,18 @@ class enable_profiling { } ---- -The `property::graph::enable_profiling` property can be passed to a +The `property::node::enable_profiling` property can be passed to a `command_graph::add()` function and enables profiling support for the node in the `command_graph`. Passing this property implies disabling certain optimizations. This is why profiling is by default disabled on graphs, unless users -explicitly require it using either the `property::graph::enable_profiling` -property in building mode or the `property::queue::enable_profiling` on -the recorded queue (Record&Replay API). -As a result, the execution time of a graph finalized with profiling enabled -is longer than that of a graph without profiling capability. -An error will be thrown when attempting to profile an event -from a graph submission that was created without this property. +explicitly require it using either the `property::node::enable_profiling` +property when using the explicit graph creation API or the +`property::queue::enable_profiling` on the recorded queue in the Record & Replay +API. As a result, the execution time of a graph finalized with profiling enabled +is longer than that of a graph without profiling capability. An error will be +thrown when attempting to profile an event from a graph submission that was +created without this property. === Graph @@ -1075,9 +1075,9 @@ ways: 2. `property::queue::enable_profiling` - This property must be set on the queue in recording mode if users want to profile the commands recorded to - the graph.This property must also be set on the queue the queue a graph is - submitted to. It allows profiling information to be obtained from the event - returned by a graph submission. + the graph. This property must also be set on the queue the queue an + executable graph is submitted to. It allows profiling information to be + obtained from the event returned by a graph submission. As it is not defined how a submitted graph will be split up for scheduling at runtime, the `uint64_t` timestamp reported from a profiling query on a graph execution event has the following semantics, which may be diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 7fce668d27bda..1d57d987bc282 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -82,7 +82,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 8718fe1e553ea7cda05a0df2f4ef72acc5ce7397) + set(UNIFIED_RUNTIME_TAG 05e3e6d841a9029113e740d12877ece2a8a53cda) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From a47164fcc0ffc9d4f004fd639ab648dcf20f6e7f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 16 Apr 2024 15:45:42 +0100 Subject: [PATCH 3/8] Fix formatting --- sycl/source/detail/event_impl.hpp | 2 +- sycl/source/detail/graph_impl.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 193bedcce06d5..a1d2339acbe69 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -316,7 +316,7 @@ class event_impl { } void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; } - + // Sets a command-buffer command when this event represents an enqueue to a // Command Buffer. void diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 6ffbbfc57e2f3..9ec4b4944e2c0 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -707,9 +707,8 @@ void exec_graph_impl::createCommandBuffers( sycl::detail::pi::PiExtCommandBufferDesc Desc{ pi_ext_structure_type::PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC, nullptr, pi_bool(Partition->MIsInOrderGraph && !MEnableProfiling), - pi_bool(MEnableProfiling), - pi_bool(MIsUpdatable)}; - + pi_bool(MEnableProfiling), pi_bool(MIsUpdatable)}; + auto ContextImpl = sycl::detail::getSyclObjImpl(MContext); const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); auto DeviceImpl = sycl::detail::getSyclObjImpl(Device); From 384ac8d3aeb386e5f36e9375ad9edad860490b7d Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 16 Apr 2024 18:32:52 +0100 Subject: [PATCH 4/8] Address PR comments - checkIfGraphIsSinglePath returns bool - Design doc wording about optimizations --- sycl/doc/design/CommandGraph.md | 22 +++++++++++++++++++ .../sycl_ext_oneapi_graph.asciidoc | 13 +++++------ sycl/source/detail/graph_impl.cpp | 2 +- sycl/source/detail/graph_impl.hpp | 13 +++++------ 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 230f2ba407957..f50234d11cce5 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -282,6 +282,28 @@ requirements for these new accessors to correctly trigger allocations before updating. This is similar to how individual graph commands are enqueued when accessors are used in a graph node. +## Optimizations +### Interactions with Profiling + +Enabling profiling on a graph may disable optimizations from being performed on +the graph if they are incompatible with profiling. For example, enabling +profiling prevents the in-order optimization since the removal of events would +prevent collecting profiling information. + +### In-Order Graph Partitions + +On finalization graph partitions are checked to see if they are in-order, i.e. +the graph follows a single path where each node depends on the previous node. If +so a hint is provided to the backend that it may create the command-buffers in +an in-order fashion. Support for this is backend specific but it may provide +benefits through the removal of the need for synchronization primitives between +kernels. + +This optimization is only performed in this very limited case where it can be +safely assumed to be more performant. It is not likely we'll try to allow +in-order execution in more scenarios through a complicated (and imperfect) +heuristic but rather expose this as a hint the user can provide. + ## Backend Implementation Implementation of UR command-buffers for each of the supported SYCL 2020 diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 50fb244b48a48..c61c02f287d30 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1382,13 +1382,12 @@ ways: 2. `property::queue::enable_profiling` - This property must be set on the queue in recording mode if users want to profile the commands recorded to - the graph. This property must also be set on the queue the queue an - executable graph is submitted to. It allows profiling information to be - obtained from the event returned by a graph submission. - As it is not defined how a submitted graph will be split up for - scheduling at runtime, the `uint64_t` timestamp reported from a profiling - query on a graph execution event has the following semantics, which may be - pessimistic about execution time on device. + the graph. This property must also be set on the queue an executable graph is + submitted to. It allows profiling information to be obtained from the event + returned by a graph submission. As it is not defined how a submitted graph + will be split up for scheduling at runtime, the `uint64_t` timestamp reported + from a profiling query on a graph execution event has the following + semantics, which may be pessimistic about execution time on device. * `info::event_profiling::command_submit` - Timestamp when the graph is submitted to the queue. diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 9ec4b4944e2c0..e99a5ac6712e0 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -268,7 +268,7 @@ void exec_graph_impl::makePartitions() { } if (Partition->MRoots.size() > 0) { Partition->schedule(); - Partition->checkIfGraphIsSinglePath(); + Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath(); MPartitions.push_back(Partition); PartitionFinalNum++; } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 06f158ef72321..471ebf55b0518 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -815,22 +815,21 @@ class partition { } /// Checks if the graph is single path, i.e. each node has a single successor. - /// If so, the MIsInOrderGraph flag is set. - void checkIfGraphIsSinglePath() { - MIsInOrderGraph = true; + /// @return True if the graph is a single path + bool checkIfGraphIsSinglePath() { if (MRoots.size() > 1) { - MIsInOrderGraph = false; - return; + return false; } for (const auto &Node : MSchedule) { // In version 1.3.28454 of the L0 driver, 2D Copy ops cannot not // be enqueued in an in-order cmd-list (causing execution to stall). // The 2D Copy test should be removed from here when the bug is fixed. if ((Node->MSuccessors.size() > 1) || (Node->isNDCopyNode())) { - MIsInOrderGraph = false; - return; + return false; } } + + return true; } /// Add nodes to MSchedule. From 33f0f53a4632d3d60e8efdb2c682a435094d1462 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 29 Apr 2024 20:51:56 +0100 Subject: [PATCH 5/8] Use graph level property for enabling profiling --- .../sycl_ext_oneapi_graph.asciidoc | 69 +++++++++---------- .../sycl/ext/oneapi/experimental/graph.hpp | 37 ++++------ sycl/source/detail/graph_impl.cpp | 19 ++--- sycl/source/detail/graph_impl.hpp | 13 +--- sycl/source/handler.cpp | 2 +- .../Graph/Profiling/event_profiling_info.cpp | 27 ++++---- sycl/test/abi/sycl_symbols_linux.dump | 4 +- sycl/test/abi/sycl_symbols_windows.dump | 4 +- 8 files changed, 73 insertions(+), 102 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 55981f218257e..f2560c707649f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -592,31 +592,6 @@ The API for explicitly adding nodes to a `command_graph` includes a property to be passed here. `depends_on_all_leaves` provides a shortcut for adding all the current leaves of a graph as dependencies. -==== Enable-Profiling Property [[enable-profiling]] -[source,c++] ----- -namespace sycl::ext::oneapi::experimental::property::node { -class enable_profiling { - public: - enable_profiling() = default; -}; -} ----- - -The `property::node::enable_profiling` property can be passed to a -`command_graph::add()` function and enables profiling support -for the node in the `command_graph`. -Passing this property implies disabling certain optimizations. -This is why profiling is by default disabled on graphs, unless users -explicitly require it using either the `property::node::enable_profiling` -property when using the explicit graph creation API or the -`property::queue::enable_profiling` on the recorded queue in the Record & Replay -API. As a result, the execution time of a graph finalized with profiling enabled -is longer than that of a graph without profiling capability. An error will be -thrown when attempting to profile an event from a graph submission that was -created without this property. - - === Graph [source, c++] @@ -839,6 +814,27 @@ when passed on finalization of a modifiable `command_graph`. For further information see <>. +==== Enable-Profiling Property [[enable-profiling]] + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental::property::graph { +class enable_profiling { + public: + enable_profiling() = default; +}; +} +---- + +The `property::graph::enable_profiling` property enables profiling events +returned from submissions of the executable graph. Passing this property +implies disabling certain optimizations. This is why profiling is by default +disabled on graphs, unless users explicitly enable it. As a result, the +execution time of a graph finalized with profiling enabled is longer than that +of a graph without profiling capability. An error will be thrown when +attempting to profile an event from a graph submission that was created without +this property. + ==== Graph Member Functions Table {counter: tableNumber}. Constructor of the `command_graph` class. @@ -937,8 +933,6 @@ Parameters: * `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. The `property::node::depends_on` property can be passed here with a list of nodes to create dependency edges on. - The `enable_profiling` property enables the profiling of this node. - See <> for more details. Returns: The empty node which has been added to the graph. @@ -976,9 +970,6 @@ Parameters: * `propList` - Zero or more properties can be provided to the constructed node via an instance of `property_list`. The `property::node::depends_on` property can be passed here with a list of nodes to create dependency edges on. - The `enable_profiling` property enables the profiling of this node. - See <> for more details. - Returns: The command-group function object node which has been added to the graph. @@ -1372,14 +1363,16 @@ ways: an implicit dependency before and after the graph execution, as if the graph execution is one command-group submitted to the in-order queue. -2. `property::queue::enable_profiling` - This property must be set on the queue - in recording mode if users want to profile the commands recorded to - the graph. This property must also be set on the queue an executable graph is - submitted to. It allows profiling information to be obtained from the event - returned by a graph submission. As it is not defined how a submitted graph - will be split up for scheduling at runtime, the `uint64_t` timestamp reported - from a profiling query on a graph execution event has the following - semantics, which may be pessimistic about execution time on device. +2. `property::queue::enable_profiling` - This property has no effect on graph + recording. When set on the queue a graph is submitted to however, it allows + profiling information to be obtained from the event returned by a graph + submission. The executable graph used for this submission must have been + created with the `enable_profiling` property, see + <> for more details. As it is not + defined how a submitted graph will be split up for scheduling at runtime, + the `uint64_t` timestamp reported from a profiling query on a graph + execution event has the following semantics, which may be + pessimistic about execution time on device. * `info::event_profiling::command_submit` - Timestamp when the graph is submitted to the queue. diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 2d45457fa933f..4521da77c1839 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -167,6 +167,14 @@ class updatable public: updatable() = default; }; + +/// Property used to enable executable graph profiling. Enables profiling on +/// events returned by submissions of the executable graph +class enable_profiling : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphEnableProfiling> { +public: + enable_profiling() = default; +}; } // namespace graph namespace node { @@ -195,15 +203,6 @@ class depends_on_all_leaves : public ::sycl::detail::DataLessProperty< depends_on_all_leaves() = default; }; -/// Property used to enable node profiling. -/// Passing this property to the `command_graph::add()` function -/// ensures that profiling can be queried on this node. -class enable_profiling : public ::sycl::detail::DataLessProperty< - ::sycl::detail::GraphEnableProfiling> { -public: - enable_profiling() = default; -}; - } // namespace node } // namespace property @@ -230,17 +229,15 @@ class __SYCL_EXPORT modifiable_command_graph { /// @param PropList Property list used to pass [0..n] predecessor nodes. /// @return Constructed empty node which has been added to the graph. node add(const property_list &PropList = {}) { - bool EnableProfiling = - PropList.has_property(); if (PropList.has_property()) { auto Deps = PropList.get_property(); - node Node = addImpl(Deps.get_dependencies(), EnableProfiling); + node Node = addImpl(Deps.get_dependencies()); if (PropList.has_property()) { addGraphLeafDependencies(Node); } return Node; } - node Node = addImpl({}, EnableProfiling); + node Node = addImpl({}); if (PropList.has_property()) { addGraphLeafDependencies(Node); } @@ -252,17 +249,15 @@ class __SYCL_EXPORT modifiable_command_graph { /// @param PropList Property list used to pass [0..n] predecessor nodes. /// @return Constructed node which has been added to the graph. template node add(T CGF, const property_list &PropList = {}) { - bool EnableProfiling = - PropList.has_property(); if (PropList.has_property()) { auto Deps = PropList.get_property(); - node Node = addImpl(CGF, Deps.get_dependencies(), EnableProfiling); + node Node = addImpl(CGF, Deps.get_dependencies()); if (PropList.has_property()) { addGraphLeafDependencies(Node); } return Node; } - node Node = addImpl(CGF, {}, EnableProfiling); + node Node = addImpl(CGF, {}); if (PropList.has_property()) { addGraphLeafDependencies(Node); } @@ -331,16 +326,14 @@ class __SYCL_EXPORT modifiable_command_graph { /// Template-less implementation of add() for CGF nodes. /// @param CGF Command-group function to add. /// @param Dep List of predecessor nodes. - /// @param EnableProfiling Enable node profiling. /// @return Node added to the graph. - node addImpl(std::function CGF, const std::vector &Dep, - const bool EnableProfiling); + node addImpl(std::function CGF, + const std::vector &Dep); /// Template-less implementation of add() for empty nodes. /// @param Dep List of predecessor nodes. - /// @param EnableProfiling Enable node profiling. /// @return Node added to the graph. - node addImpl(const std::vector &Dep, const bool EnableProfiling); + node addImpl(const std::vector &Dep); /// Adds all graph leaves as dependencies /// @param Node Destination node to which the leaves of the graph will be diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4d9cc49bb2631..025fde4904a9a 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -188,13 +188,6 @@ void exec_graph_impl::makePartitions() { if (Node->MCGType == sycl::detail::CG::CodeplayHostTask) { HostTaskList.push_back(Node); } - // Next line is supposed to be temporary. - // Nodes are not profiled individually, but the profiling of the whole graph - // is enabled if at least one node has profiling enabled. This should be - // changed once the PR https://github.com/intel/llvm/pull/12592 on node - // profiling is merged. This also will involve updating all the UR enqueue - // cmd functions to add a new parameter containing the profiling status. - MEnableProfiling |= Node->MProfilingEnabled; } // Annotate nodes @@ -770,7 +763,9 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, : MSchedule(), MGraphImpl(GraphImpl), MPiSyncPoints(), MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(), MExecutionEvents(), - MIsUpdatable(PropList.has_property()) { + MIsUpdatable(PropList.has_property()), + MEnableProfiling( + PropList.has_property()) { // If the graph has been marked as updatable then check if the backend // actually supports that. Devices supporting aspect::ext_oneapi_graph must @@ -1469,8 +1464,7 @@ modifiable_command_graph::modifiable_command_graph( : impl(std::make_shared( SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {} -node modifiable_command_graph::addImpl(const std::vector &Deps, - const bool EnableProfiling) { +node modifiable_command_graph::addImpl(const std::vector &Deps) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { @@ -1479,13 +1473,11 @@ node modifiable_command_graph::addImpl(const std::vector &Deps, graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(impl, DepImpls); - NodeImpl->MProfilingEnabled = EnableProfiling; return sycl::detail::createSyclObjFromImpl(NodeImpl); } node modifiable_command_graph::addImpl(std::function CGF, - const std::vector &Deps, - const bool EnableProfiling) { + const std::vector &Deps) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { @@ -1495,7 +1487,6 @@ node modifiable_command_graph::addImpl(std::function CGF, graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(impl, CGF, {}, DepImpls); - NodeImpl->MProfilingEnabled = EnableProfiling; return sycl::detail::createSyclObjFromImpl(NodeImpl); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 24b18aace8551..d28f980551247 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -99,9 +99,6 @@ class node_impl { /// Used for tracking visited status during cycle checks. bool MVisited = false; - /// If true, the graph profiling is enabled for this node. - bool MProfilingEnabled = false; - /// Partition number needed to assign a Node to a a partition. /// Note : This number is only used during the partitionning process and /// cannot be used to find out the partion of a node outside of this process. @@ -163,8 +160,7 @@ class node_impl { node_impl(node_impl &Other) : MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), - MCommandGroup(Other.getCGCopy()), MSubGraphImpl(Other.MSubGraphImpl), - MProfilingEnabled(Other.MProfilingEnabled) {} + MCommandGroup(Other.getCGCopy()), MSubGraphImpl(Other.MSubGraphImpl) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -774,9 +770,6 @@ class node_impl { Stream << "Other \\n"; break; } - if (MProfilingEnabled) { - Stream << "Profiling Enabled \\n"; - } Stream << "\"];" << std::endl; } @@ -1475,14 +1468,14 @@ class exec_graph_impl { std::vector> MPartitions; /// Storage for copies of nodes from the original modifiable graph. std::vector> MNodeStorage; - /// If true, the graph profiling is enabled. - bool MEnableProfiling = false; /// Map of nodes to their associated PI command handles. std::unordered_map, sycl::detail::pi::PiExtCommandBufferCommand> MCommandMap; /// True if this graph can be updated (set with property::updatable) bool MIsUpdatable; + /// If true, the graph profiling is enabled. + bool MEnableProfiling; // Stores a cache of node ids from modifiable graph nodes to the companion // node(s) in this graph. Used for quick access when updating this graph. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 2daa2c2472bd8..7d7f094e8d4a2 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -588,7 +588,7 @@ event handler::finalize() { } else { NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); } - NodeImpl->MProfilingEnabled = MQueue->MIsProfilingEnabled; + // Associate an event with this new node and return the event. GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); diff --git a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp index 4b4daa7fa3255..af4bb747bb919 100644 --- a/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp +++ b/sycl/test-e2e/Graph/Profiling/event_profiling_info.cpp @@ -81,8 +81,6 @@ bool compareProfiling(event Event1, event Event2) { // event to complete execution. int main() { device Dev; - // The queue on which the graph is recorded must have the `enable_profiling` - // set to enable graph profiling. queue Queue{Dev, {sycl::property::queue::enable_profiling()}}; const size_t Size = 100000; @@ -109,16 +107,17 @@ int main() { Queue.get_context(), Queue.get_device(), {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + CopyGraph.begin_recording(Queue); - CopyGraph.add( - ([&](sycl::handler &Cgh) { - accessor - AccessorFrom(BufferFrom, Cgh, range<1>(Size)); - accessor - AccessorTo(BufferTo, Cgh, range<1>(Size)); - Cgh.copy(AccessorFrom, AccessorTo); - }), - {exp_ext::property::node::enable_profiling{}}); + Queue.submit([&](sycl::handler &Cgh) { + accessor AccessorFrom( + BufferFrom, Cgh, range<1>(Size)); + accessor AccessorTo( + BufferTo, Cgh, range<1>(Size)); + Cgh.copy(AccessorFrom, AccessorTo); + }); + + CopyGraph.end_recording(Queue); // kernel launch exp_ext::command_graph KernelGraph{ @@ -131,8 +130,10 @@ int main() { KernelGraph.end_recording(Queue); - auto CopyGraphExec = CopyGraph.finalize(); - auto KernelGraphExec = KernelGraph.finalize(); + auto CopyGraphExec = + CopyGraph.finalize({exp_ext::property::graph::enable_profiling{}}); + auto KernelGraphExec = + KernelGraph.finalize({exp_ext::property::graph::enable_profiling{}}); event CopyEvent, KernelEvent1, KernelEvent2; // Run graphs diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 76561608c1d19..a1985cf5d841e 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3081,8 +3081,8 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph13end_reco _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERKSt6vectorINS0_5queueESaIS7_EERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph24addGraphLeafDependenciesENS3_4nodeE -_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EEb -_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EEb +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9make_edgeERNS3_4nodeES7_ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 179ec2a9f0414..69473362c1985 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -3924,8 +3924,8 @@ ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z ?addHostUnsampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVUnsampledImageAccessorImplHost@123@@Z -?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@_N@Z -?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@_N@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z ?addInteropObject@buffer_impl@detail@_V1@sycl@@QEBAXAEAV?$vector@_KV?$allocator@_K@std@@@std@@@Z ?addOrReplaceAccessorProperties@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBVproperty_list@34@@Z ?addOrReplaceAccessorProperties@buffer_plain@detail@_V1@sycl@@IEAAXAEBVproperty_list@34@@Z From 6af4cc8372505faaa090d84b8f6014a94cfaa1ac Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 30 Apr 2024 13:22:48 +0100 Subject: [PATCH 6/8] Address PR feedback --- .../sycl_ext_oneapi_graph.asciidoc | 10 ++++----- sycl/source/detail/graph_impl.hpp | 21 ++++++++----------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index f2560c707649f..e28e7cdd51cad 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -828,12 +828,10 @@ class enable_profiling { The `property::graph::enable_profiling` property enables profiling events returned from submissions of the executable graph. Passing this property -implies disabling certain optimizations. This is why profiling is by default -disabled on graphs, unless users explicitly enable it. As a result, the -execution time of a graph finalized with profiling enabled is longer than that -of a graph without profiling capability. An error will be thrown when -attempting to profile an event from a graph submission that was created without -this property. +implies disabling certain optimizations. As a result, the execution time of a +graph finalized with profiling enabled is longer than that of a graph without +profiling capability. An error will be thrown when attempting to profile an +event from a graph submission that was created without this property. ==== Graph Member Functions diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index d28f980551247..fdd3dcbea2a85 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -386,19 +386,16 @@ class node_impl { /// Test if the node contains a N-D copy /// @return true if the op is a N-D copy bool isNDCopyNode() const { - if ((MCGType == sycl::detail::CG::CGTYPE::CopyAccToAcc) || - (MCGType == sycl::detail::CG::CGTYPE::CopyAccToPtr) || - (MCGType == sycl::detail::CG::CGTYPE::CopyPtrToAcc)) { - sycl::detail::CGCopy *Copy = (sycl::detail::CGCopy *)MCommandGroup.get(); - sycl::detail::Requirement *ReqSrc = - (sycl::detail::Requirement *)(Copy->getSrc()); - sycl::detail::Requirement *ReqDst = - (sycl::detail::Requirement *)(Copy->getDst()); - if ((ReqSrc->MDims > 1) || (ReqDst->MDims > 1)) { - return true; - } + if ((MCGType != sycl::detail::CG::CGTYPE::CopyAccToAcc) && + (MCGType != sycl::detail::CG::CGTYPE::CopyAccToPtr) && + (MCGType != sycl::detail::CG::CGTYPE::CopyPtrToAcc)) { + return false; } - return false; + + auto Copy = static_cast(MCommandGroup.get()); + auto ReqSrc = static_cast(Copy->getSrc()); + auto ReqDst = static_cast(Copy->getDst()); + return (ReqSrc->MDims > 1) || (ReqDst->MDims > 1); } /// Update the value of an accessor inside this node. Accessors must be From 3e4ee68a14b60a2a97120d93e77e03e27b5d25d6 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 30 Apr 2024 16:37:15 +0100 Subject: [PATCH 7/8] Update sycl/unittests/Extensions/CommandGraph/Exceptions.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Fábio --- sycl/unittests/Extensions/CommandGraph/Exceptions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index c4fc154ff6e6f..1477d267bf9a1 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -542,7 +542,7 @@ TEST_F(CommandGraphTest, ProfilingExceptionProperty) { Graph.end_recording(Queue); // Checks exception thrown if profiling is requested while profiling has - // not be enabled during the graph building. + // not been enabled during the graph building. auto GraphExecInOrder = Graph.finalize(); queue QueueProfile{Dev, {sycl::property::queue::enable_profiling()}}; auto EventInOrder = QueueProfile.submit( From 356b9cea6a4dc9143918ece88832aeb134a91302 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 2 May 2024 10:57:00 +0100 Subject: [PATCH 8/8] Update sycl/plugins/unified_runtime/pi2ur.hpp Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/pi2ur.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index d8c526e289855..a3040a61e9c69 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4496,8 +4496,8 @@ piextCommandBufferCreate(pi_context Context, pi_device Device, ur_device_handle_t UrDevice = reinterpret_cast(Device); ur_exp_command_buffer_desc_t UrDesc; UrDesc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; - UrDesc.isInOrder = Desc->is_in_order; - UrDesc.enableProfiling = Desc->enable_profiling; + UrDesc.isInOrder = ur_bool_t(Desc->is_in_order); + UrDesc.enableProfiling = ur_bool_t(Desc->enable_profiling); UrDesc.isUpdatable = Desc->is_updatable; ur_exp_command_buffer_handle_t *UrCommandBuffer = reinterpret_cast(RetCommandBuffer);