From e6dc9a21a6c0c8afe27472025c61276ac75dda21 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 13 Dec 2023 12:43:54 +0000 Subject: [PATCH 01/15] [SYCL][Graph] Add graph and node queries to spec --- .../sycl_ext_oneapi_graph.asciidoc | 95 ++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 4204515123ca4..4d09e9148de03 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -313,6 +313,19 @@ enum class graph_support_level { emulated }; +enum class node_type { + empty, + subgraph, + kernel, + memcpy, + memset, + memfill, + prefetch, + memadvise, + ext_oneapi_barrier, + host_task, +}; + namespace property { namespace graph { @@ -353,7 +366,18 @@ struct graphs_support; } // namespace device } // namespace info -class node {}; +class node { +public: + node() = delete; + + node_type get_type() const; + + std::vector get_dependencies() const; + + std::vector get_dependent_nodes() const; + + static node get_node_from_event(event nodeEvent); +}; // State of a graph enum class graph_state { @@ -389,6 +413,9 @@ public: void make_edge(node& src, node& dest); void print_graph(std::string path, bool verbose = false) const; + + std::vector get_nodes() const; + std::vector get_root_nodes() const; }; template<> @@ -459,12 +486,56 @@ edges. The `node` class provides the {crs}[common reference semantics]. +==== Node Member Functions + +Table {counter: tableNumber}. Member functions of the `node` class. +[cols="2a,a"] +|=== +|Member Function|Description + +| [source,c++] ---- -namespace sycl::ext::oneapi::experimental { - class node {}; -} +node_type get_type() const; +---- +|Returns a value representing the type of command this node represents. + +| +[source,c++] +---- +std::vector get_dependencies() const; +---- +|Returns a list of the predecessor nodes which this node depends on. + +| +[source,c++] +---- +std::vector get_dependent_nodes() const; ---- +|Returns a list of the successor nodes which depend on this node. + +| +[source,c++] +---- +static node get_node_from_event(event nodeEvent); +---- +|Finds the node associated with an event created from a submission to a queue + in the recording state. + +Parameters: + +* `nodeEvent` - Event returned from a submission to a queue in the recording + state. + +Returns: Graph node that was created when the command that returned +`nodeEvent` was submitted. + +Exceptions: + +* Throws with error code `invalid` if `nodeEvent` is not associated with a + graph node. + +|=== ==== Depends-On Property @@ -775,6 +846,22 @@ Exceptions: * Throws synchronously with error code `invalid` if the path is invalid or the file extension is not supported or if the write operation failed. +| +[source,c++] +---- +std::vector get_nodes() const; +---- +|Returns a list of all the nodes present in the graph in the order that they +were added. + +| +[source,c++] +---- +std::vector get_root_nodes() const; +---- +|Returns a list of all nodes in the graph which have no dependencies in the +order they were added to the graph. + |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. From 6ba1f6ea2ae449a3ee3b70f7e0f8b1de40777ed2 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Dec 2023 14:03:41 +0000 Subject: [PATCH 02/15] [SYCL][Graph] Implement Graph and node queries - Implement graph and node queries from spec - New node_type enum - Note, subgraph node type not yet implemented due to significant changes required - Explicit nodes now also have associated events (fixes mixed usage issue) - New tests for queries - Update linux ABI symbols --- .../sycl_ext_oneapi_graph.asciidoc | 8 +- .../sycl/ext/oneapi/experimental/graph.hpp | 35 ++++++ sycl/include/sycl/handler.hpp | 10 ++ sycl/source/detail/graph_impl.cpp | 98 +++++++++++++-- sycl/source/detail/graph_impl.hpp | 85 +++++++++++-- sycl/source/detail/handler_impl.hpp | 9 ++ sycl/source/handler.cpp | 27 ++-- sycl/test/abi/sycl_symbols_linux.dump | 7 ++ sycl/unittests/Extensions/CommandGraph.cpp | 117 +++++++++++++++++- 9 files changed, 357 insertions(+), 39 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 4d09e9148de03..1a6255baeca7f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -372,9 +372,9 @@ public: node_type get_type() const; - std::vector get_dependencies() const; + std::vector get_predecessors() const; - std::vector get_dependent_nodes() const; + std::vector get_successors() const; static node get_node_from_event(event nodeEvent); }; @@ -503,14 +503,14 @@ node_type get_type() const; | [source,c++] ---- -std::vector get_dependencies() const; +std::vector get_predecessors() const; ---- |Returns a list of the predecessor nodes which this node depends on. | [source,c++] ---- -std::vector get_dependent_nodes() const; +std::vector get_successors() const; ---- |Returns a list of the successor nodes which depend on this node. diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 15645d9884499..761910af9911a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -82,8 +82,37 @@ enum class graph_state { executable, ///< In executable state, the graph is ready to execute. }; +enum class node_type { + empty = 0, + subgraph, + kernel, + memcpy, + memset, + memfill, + prefetch, + memadvise, + ext_oneapi_barrier, + host_task, +}; + /// Class representing a node in the graph, returned by command_graph::add(). class __SYCL_EXPORT node { +public: + node() = delete; + + /// Get the type of command associated with this node. + node_type get_type() const; + + /// Get a list of all the node dependencies of this node. + std::vector get_predecessors() const; + + /// Get a list of all nodes which depend on this node. + std::vector get_successors() const; + + /// Get the node associated with a SYCL event returned from a queue recording + /// submission. + static node get_node_from_event(event nodeEvent); + private: node(const std::shared_ptr &Impl) : impl(Impl) {} @@ -253,6 +282,12 @@ class __SYCL_EXPORT modifiable_command_graph { /// as kernel args or memory access where applicable. void print_graph(const std::string path, bool verbose = false) const; + /// Get a list of all nodes contained in this graph. + std::vector get_nodes() const; + + /// Get a list of all root nodes (nodes without dependencies) in this graph. + std::vector get_root_nodes() const; + protected: /// Constructor used internally by the runtime. /// @param Impl Detail implementation class to construct object with. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 986c25035ac30..ff86283a23fae 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1779,6 +1779,14 @@ class __SYCL_EXPORT handler { std::shared_ptr getCommandGraph() const; + /// Sets the user facing node type of this operation, used for operations + /// which are recorded to a graph. Since some operations may actually be a + /// different type than the user submitted, e.g. a fill() which is performed + /// as a kernel submission. + /// @param Type The actual type based on what handler functions the user + /// called. + void setUserFacingNodeType(ext::oneapi::experimental::node_type Type); + public: handler(const handler &) = delete; handler(handler &&) = delete; @@ -2722,6 +2730,7 @@ class __SYCL_EXPORT handler { checkIfPlaceholderIsBoundToHandler(Dst); throwIfActionIsCreated(); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); // TODO add check:T must be an integral scalar value or a SYCL vector type static_assert(isValidTargetForExplicitOp(AccessTarget), "Invalid accessor target for the fill method."); @@ -2760,6 +2769,7 @@ class __SYCL_EXPORT handler { /// \param Count is the number of times to fill Pattern into Ptr. template void fill(void *Ptr, const T &Pattern, size_t Count) { throwIfActionIsCreated(); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); static_assert(is_device_copyable::value, "Pattern must be device copyable"); parallel_for<__usmfill>(range<1>(Count), [=](id<1> Index) { diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 2581fb1da345f..ce9f91775f91c 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -70,7 +70,7 @@ void duplicateNode(const std::shared_ptr Node, NodeCopy = std::make_shared(); NodeCopy->MCGType = sycl::detail::CG::None; } else { - NodeCopy = std::make_shared(Node->MCGType, Node->getCGCopy()); + NodeCopy = std::make_shared(Node->MNodeType, Node->getCGCopy()); } } @@ -156,6 +156,32 @@ bool isPartitionRoot(std::shared_ptr Node) { return true; } +/// Takes a vector of weak_ptrs to node_impls and returns a vector of node +/// objects created from those impls, in the same order. +std::vector createNodesFromImpls( + const std::vector> &Impls) { + std::vector Nodes{}; + + for (std::weak_ptr Impl : Impls) { + Nodes.push_back(sycl::detail::createSyclObjFromImpl(Impl.lock())); + } + + return Nodes; +} + +/// Takes a vector of shared_ptrs to node_impls and returns a vector of node +/// objects created from those impls, in the same order. +std::vector createNodesFromImpls( + const std::vector> &Impls) { + std::vector Nodes{}; + + for (std::shared_ptr Impl : Impls) { + Nodes.push_back(sycl::detail::createSyclObjFromImpl(Impl)); + } + + return Nodes; +} + } // anonymous namespace void partition::schedule() { @@ -277,6 +303,7 @@ graph_impl::~graph_impl() { } std::shared_ptr graph_impl::addNodesToExits( + const std::shared_ptr &Impl, const std::list> &NodeList) { // Find all input and output nodes from the node list std::vector> Inputs; @@ -303,12 +330,14 @@ std::shared_ptr graph_impl::addNodesToExits( // Add all the new nodes to the node storage for (auto &Node : NodeList) { MNodeStorage.push_back(Node); + addEventForNode(Impl, std::make_shared(), Node); } - return this->add(Outputs); + return this->add(Impl, Outputs); } std::shared_ptr graph_impl::addSubgraphNodes( + const std::shared_ptr &Impl, const std::shared_ptr &SubGraphExec) { std::map, std::shared_ptr> NodesMap; @@ -331,7 +360,7 @@ std::shared_ptr graph_impl::addSubgraphNodes( } } - return addNodesToExits(NewNodesList); + return addNodesToExits(Impl, NewNodesList); } void graph_impl::addRoot(const std::shared_ptr &Root) { @@ -343,7 +372,8 @@ void graph_impl::removeRoot(const std::shared_ptr &Root) { } std::shared_ptr -graph_impl::add(const std::vector> &Dep) { +graph_impl::add(const std::shared_ptr &Impl, + const std::vector> &Dep) { // Copy deps so we can modify them auto Deps = Dep; @@ -355,7 +385,8 @@ graph_impl::add(const std::vector> &Dep) { MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); - + // Add an event associated with this explicit node for mixed usage + addEventForNode(Impl, std::make_shared(), NodeImpl); return NodeImpl; } @@ -382,11 +413,23 @@ graph_impl::add(const std::shared_ptr &Impl, if (Handler.MSubgraphNode) { return Handler.MSubgraphNode; } - return this->add(Handler.MCGType, std::move(Handler.MGraphNodeCG), Dep); + + node_type NodeType; + if (auto UserFacingType = Handler.MImpl->MUserFacingNodeType; + UserFacingType != node_type::empty) { + NodeType = UserFacingType; + } else { + NodeType = getNodeTypeFromCG(Handler.MCGType); + } + auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep); + // Add an event associated with this explicit node for mixed usage + addEventForNode(Impl, std::make_shared(), NodeImpl); + return NodeImpl; } std::shared_ptr -graph_impl::add(const std::vector Events) { +graph_impl::add(const std::shared_ptr &Impl, + const std::vector Events) { std::vector> Deps; @@ -401,11 +444,11 @@ graph_impl::add(const std::vector Events) { } } - return this->add(Deps); + return this->add(Impl, Deps); } std::shared_ptr -graph_impl::add(sycl::detail::CG::CGTYPE CGType, +graph_impl::add(node_type NodeType, std::unique_ptr CommandGroup, const std::vector> &Dep) { // Copy deps so we can modify them @@ -465,13 +508,13 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end()); const std::shared_ptr &NodeImpl = - std::make_shared(CGType, std::move(CommandGroup)); + std::make_shared(NodeType, std::move(CommandGroup)); MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); // Set barrier nodes as prerequisites (new start points) for subsequent nodes - if (CGType == sycl::detail::CG::Barrier) { + if (NodeImpl->MCGType == sycl::detail::CG::Barrier) { MExtraDependencies.push_back(NodeImpl); } @@ -932,7 +975,7 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { } graph_impl::WriteLock Lock(impl->MMutex); - std::shared_ptr NodeImpl = impl->add(DepImpls); + std::shared_ptr NodeImpl = impl->add(impl, DepImpls); return sycl::detail::createSyclObjFromImpl(NodeImpl); } @@ -1081,6 +1124,17 @@ void modifiable_command_graph::print_graph(std::string path, } } +std::vector modifiable_command_graph::get_nodes() const { + return createNodesFromImpls(impl->MNodeStorage); +} +std::vector modifiable_command_graph::get_root_nodes() const { + auto &Roots = impl->MRoots; + std::vector> Impls{}; + + std::copy(Roots.begin(), Roots.end(), std::back_inserter(Impls)); + return createNodesFromImpls(Impls); +} + executable_command_graph::executable_command_graph( const std::shared_ptr &Graph, const sycl::context &Ctx) : impl(std::make_shared(Ctx, Graph)) { @@ -1116,8 +1170,26 @@ void executable_command_graph::update( throw sycl::exception(sycl::make_error_code(errc::invalid), "Method not yet implemented"); } - } // namespace detail + +node_type node::get_type() const { return impl->MNodeType; } + +std::vector node::get_predecessors() const { + return detail::createNodesFromImpls(impl->MPredecessors); +} + +std::vector node::get_successors() const { + return detail::createNodesFromImpls(impl->MSuccessors); +} + +node node::get_node_from_event(event nodeEvent) { + auto EventImpl = sycl::detail::getSyclObjImpl(nodeEvent); + auto GraphImpl = EventImpl->getCommandGraph(); + + return sycl::detail::createSyclObjFromImpl( + GraphImpl->getNodeForEvent(EventImpl)); +} + } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 46bc15f7b8022..5232f9769cbe4 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -37,6 +37,42 @@ namespace oneapi { namespace experimental { namespace detail { +inline node_type getNodeTypeFromCG(sycl::detail::CG::CGTYPE CGType) { + using sycl::detail::CG; + + // TODO: Handle subgraph case when internal representation has been changed to + // contain a single subgraph node. The current approach copies nodes into the + // parent graph which prevents this. + switch (CGType) { + case CG::None: + return node_type::empty; + case CG::Kernel: + return node_type::kernel; + case CG::CopyAccToPtr: + case CG::CopyPtrToAcc: + case CG::CopyAccToAcc: + case CG::CopyUSM: + return node_type::memcpy; + case CG::Memset2DUSM: + return node_type::memset; + case CG::Fill: + case CG::FillUSM: + return node_type::memfill; + case CG::PrefetchUSM: + return node_type::prefetch; + case CG::AdviseUSM: + return node_type::memadvise; + case CG::Barrier: + case CG::BarrierWaitlist: + return node_type::ext_oneapi_barrier; + case CG::CodeplayHostTask: + return node_type::host_task; + default: + assert(false && "Invalid Graph Node Type"); + return node_type::empty; + } +} + /// Implementation of node class from SYCL_EXT_ONEAPI_GRAPH. class node_impl { public: @@ -48,6 +84,8 @@ class node_impl { std::vector> MPredecessors; /// Type of the command-group for the node. sycl::detail::CG::CGTYPE MCGType = sycl::detail::CG::None; + /// User facing type of the node + node_type MNodeType = node_type::empty; /// Command group object which stores all args etc needed to enqueue the node std::unique_ptr MCommandGroup; @@ -96,9 +134,10 @@ class node_impl { /// @param CGType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. - node_impl(sycl::detail::CG::CGTYPE CGType, + node_impl(node_type NodeType, std::unique_ptr &&CommandGroup) - : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} + : MCGType(CommandGroup->getType()), MNodeType(NodeType), + MCommandGroup(std::move(CommandGroup)) {} /// Checks if this node has a given requirement. /// @param Requirement Requirement to lookup. @@ -551,13 +590,12 @@ class graph_impl { void removeRoot(const std::shared_ptr &Root); /// Create a kernel node in the graph. - /// @param CGType Type of the command-group. + /// @param NodeType User facing type of the node. /// @param CommandGroup The CG which stores all information for this node. /// @param Dep Dependencies of the created node. /// @return Created node in the graph. std::shared_ptr - add(sycl::detail::CG::CGTYPE CGType, - std::unique_ptr CommandGroup, + add(node_type NodeType, std::unique_ptr CommandGroup, const std::vector> &Dep = {}); /// Create a CGF node in the graph. @@ -573,16 +611,20 @@ class graph_impl { const std::vector> &Dep = {}); /// Create an empty node in the graph. + /// @param Impl Graph implementation pointer /// @param Dep List of predecessor nodes. /// @return Created node in the graph. std::shared_ptr - add(const std::vector> &Dep = {}); + add(const std::shared_ptr &Impl, + const std::vector> &Dep = {}); /// Create an empty node in the graph. + /// @param Impl Graph implementation pointer /// @param Events List of events associated to this node. /// @return Created node in the graph. std::shared_ptr - add(const std::vector Events); + add(const std::shared_ptr &Impl, + const std::vector Events); /// Add a queue to the set of queues which are currently recording to this /// graph. @@ -607,10 +649,15 @@ class graph_impl { bool clearQueues(); /// Associate a sycl event with a node in the graph. + /// @param GraphImpl shared_ptr to Graph impl associated with this event, aka + /// this /// @param EventImpl Event to associate with a node in map. /// @param NodeImpl Node to associate with event in map. - void addEventForNode(std::shared_ptr EventImpl, + void addEventForNode(std::shared_ptr GraphImpl, + std::shared_ptr EventImpl, std::shared_ptr NodeImpl) { + if (!EventImpl->getCommandGraph()) + EventImpl->setCommandGraph(GraphImpl); MEventsMap[EventImpl] = NodeImpl; } @@ -632,12 +679,28 @@ class graph_impl { "No event has been recorded for the specified graph node"); } + std::shared_ptr + getNodeForEvent(std::shared_ptr EventImpl) { + ReadLock Lock(MMutex); + + if (auto NodeFound = MEventsMap.find(EventImpl); + NodeFound != std::end(MEventsMap)) { + return NodeFound->second; + } + + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No node in this graph is associated with this event"); + } + /// Duplicates and Adds sub-graph nodes from an executable graph to this /// graph. + /// @param Impl Graph implementation pointer /// @param SubGraphExec sub-graph to add to the parent. /// @return An empty node is used to schedule dependencies on this sub-graph. std::shared_ptr - addSubgraphNodes(const std::shared_ptr &SubGraphExec); + addSubgraphNodes(const std::shared_ptr &Impl, + const std::shared_ptr &SubGraphExec); /// Query for the context tied to this graph. /// @return Context associated with graph. @@ -873,10 +936,12 @@ class graph_impl { void addRoot(const std::shared_ptr &Root); /// Adds nodes to the exit nodes of this graph. + /// @param Impl Graph implementation pointer /// @param NodeList List of nodes from sub-graph in schedule order. /// @return An empty node is used to schedule dependencies on this sub-graph. std::shared_ptr - addNodesToExits(const std::list> &NodeList); + addNodesToExits(const std::shared_ptr &Impl, + const std::list> &NodeList); /// Adds dependencies for a new node, if it has no deps it will be /// added as a root node. diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index d98602ab02e35..a6f4622587fcf 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -11,6 +11,7 @@ #include "sycl/handler.hpp" #include #include +#include namespace sycl { inline namespace _V1 { @@ -117,6 +118,14 @@ class handler_impl { // Extra information for semaphore interoperability sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle; + + // The user facing node type, used for operations which are recorded to a + // graph. Since some operations may actually be a different type than the user + // submitted, e.g. a fill() which is performed as a kernel submission. This is + // used to pass the type that the user expects to graph nodes when they are + // created for later query by users. + sycl::ext::oneapi::experimental::node_type MUserFacingNodeType = + sycl::ext::oneapi::experimental::node_type::empty; }; } // namespace detail diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index b140efb9200f9..640aaf06471f1 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -504,6 +504,13 @@ event handler::finalize() { ext::oneapi::experimental::detail::graph_impl::WriteLock Lock( GraphImpl->MMutex); + ext::oneapi::experimental::node_type NodeType; + if (auto UserFacingType = MImpl->MUserFacingNodeType; + UserFacingType != ext::oneapi::experimental::node_type::empty) { + NodeType = UserFacingType; + } else { + NodeType = ext::oneapi::experimental::detail::getNodeTypeFromCG(MCGType); + } // Create a new node in the graph representing this command-group if (MQueue->isInOrder()) { // In-order queues create implicit linear dependencies between nodes. @@ -512,22 +519,20 @@ event handler::finalize() { auto DependentNode = GraphImpl->getLastInorderNode(MQueue); NodeImpl = DependentNode - ? GraphImpl->add(MCGType, std::move(CommandGroup), + ? GraphImpl->add(NodeType, std::move(CommandGroup), {DependentNode}) - : GraphImpl->add(MCGType, std::move(CommandGroup)); + : GraphImpl->add(NodeType, std::move(CommandGroup)); // If we are recording an in-order queue remember the new node, so it // can be used as a dependency for any more nodes recorded from this // queue. GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { - NodeImpl = GraphImpl->add(MCGType, std::move(CommandGroup)); + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); } // Associate an event with this new node and return the event. - GraphImpl->addEventForNode(EventImpl, NodeImpl); - - EventImpl->setCommandGraph(GraphImpl); + GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); return detail::createSyclObjFromImpl(EventImpl); } @@ -891,6 +896,7 @@ void handler::memset(void *Dest, int Value, size_t Count) { MDstPtr = Dest; MPattern.push_back(static_cast(Value)); MLength = Count; + setUserFacingNodeType(ext::oneapi::experimental::node_type::memset); setType(detail::CG::FillUSM); } @@ -1412,7 +1418,7 @@ void handler::ext_oneapi_graph( // return it to the user later. // The nodes of the subgraph are duplicated when added to its parents. // This avoids changing properties of the graph added as a subgraph. - MSubgraphNode = ParentGraph->addSubgraphNodes(GraphImpl); + MSubgraphNode = ParentGraph->addSubgraphNodes(ParentGraph, GraphImpl); // If we are recording an in-order queue remember the subgraph node, so it // can be used as a dependency for any more nodes recorded from this queue. @@ -1421,8 +1427,7 @@ void handler::ext_oneapi_graph( } // Associate an event with the subgraph node. auto SubgraphEvent = std::make_shared(); - SubgraphEvent->setCommandGraph(ParentGraph); - ParentGraph->addEventForNode(SubgraphEvent, MSubgraphNode); + ParentGraph->addEventForNode(ParentGraph, SubgraphEvent, MSubgraphNode); } else { // Set the exec graph for execution during finalize. MExecGraph = GraphImpl; @@ -1437,6 +1442,10 @@ handler::getCommandGraph() const { return MQueue->getCommandGraph(); } +void handler::setUserFacingNodeType(ext::oneapi::experimental::node_type Type) { + MImpl->MUserFacingNodeType = Type; +} + std::optional> handler::getMaxWorkGroups() { auto Dev = detail::getSyclObjImpl(detail::getDeviceFromHandler(*this)); std::array PiResult = {}; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 6aabb8c3309f6..e05479efa8152 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3715,6 +3715,7 @@ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_semaphore_handleERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext6oneapi12experimental33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE +_ZN4sycl3_V13ext6oneapi12experimental4node19get_node_from_eventENS0_5eventE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev @@ -4123,6 +4124,7 @@ _ZN4sycl3_V17handler20associateWithHandlerEPNS0_6detail30UnsampledImageAccessorB _ZN4sycl3_V17handler20memcpyToDeviceGlobalEPKvS3_bmm _ZN4sycl3_V17handler20setKernelCacheConfigE23_pi_kernel_cache_config _ZN4sycl3_V17handler20setStateSpecConstSetEv +_ZN4sycl3_V17handler21setUserFacingNodeTypeENS0_3ext6oneapi12experimental9node_typeE _ZN4sycl3_V17handler22ext_oneapi_fill2d_implEPvmPKvmmm _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE @@ -4209,8 +4211,13 @@ _ZNK4sycl3_V114interop_handle16getNativeContextEv _ZNK4sycl3_V115device_selector13select_deviceEv _ZNK4sycl3_V116default_selectorclERKNS0_6deviceE _ZNK4sycl3_V120accelerator_selectorclERKNS0_6deviceE +_ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv +_ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv +_ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph11print_graphENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb +_ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph14get_root_nodesEv _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph8finalizeERKNS0_13property_listE +_ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9get_nodesEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem16get_channel_typeEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem16get_num_channelsEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem17get_channel_orderEv diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 24a55475121e6..ab3ffa4e3956c 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -804,7 +804,9 @@ TEST_F(CommandGraphTest, RecordSubGraph) { // The first and fourth nodes should have events associated with MainGraph but // not graph. The second and third nodes were added as a sub-graph and - // duplicated. They should not have events associated with Graph or MainGraph. + // duplicated. They should only have events associated with MainGraph, however + // these events are created internally in the graph and not present in user + // code. ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); ASSERT_EQ( @@ -812,13 +814,13 @@ TEST_F(CommandGraphTest, RecordSubGraph) { sycl::detail::getSyclObjImpl(Node1MainGraph)); ScheduleIt++; - ASSERT_ANY_THROW( + ASSERT_NO_THROW( sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt)); ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); ScheduleIt++; - ASSERT_ANY_THROW( + ASSERT_NO_THROW( sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt)); ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); @@ -1933,6 +1935,115 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) { ASSERT_FALSE(PartitionsList[4]->isHostTask()); } +TEST_F(CommandGraphTest, GetNodeQueries) { + // Tests graph and node queries for correctness + + // Add some nodes to the graph for testing and test after each addition. + auto RootA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 1lu); + ASSERT_EQ(GraphNodes.size(), 1lu); + } + auto RootB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 2lu); + } + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootA, RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 3lu); + } + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 4lu); + } + auto RootC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 3lu); + ASSERT_EQ(GraphNodes.size(), 5lu); + } + + ASSERT_EQ(RootA.get_predecessors().size(), 0lu); + ASSERT_EQ(RootA.get_successors().size(), 1lu); + ASSERT_EQ(RootB.get_predecessors().size(), 0lu); + ASSERT_EQ(RootB.get_successors().size(), 2lu); + ASSERT_EQ(RootC.get_predecessors().size(), 0lu); + ASSERT_EQ(RootC.get_successors().size(), 0lu); + ASSERT_EQ(NodeA.get_predecessors().size(), 2lu); + ASSERT_EQ(NodeA.get_successors().size(), 0lu); + ASSERT_EQ(NodeB.get_predecessors().size(), 1lu); + ASSERT_EQ(NodeB.get_successors().size(), 0lu); +} + +TEST_F(CommandGraphTest, NodeTypeQueries) { + + // Allocate some pointers for testing memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + auto NodeKernel = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_EQ(NodeKernel.get_type(), experimental::node_type::kernel); + + auto NodeMemcpy = Graph.add( + [&](sycl::handler &cgh) { cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemcpy.get_type(), experimental::node_type::memcpy); + + auto NodeMemset = Graph.add( + [&](sycl::handler &cgh) { cgh.memset(PtrB, 7, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memset); + + auto NodeMemfill = + Graph.add([&](sycl::handler &cgh) { cgh.fill(PtrB, 7, 16); }); + ASSERT_EQ(NodeMemfill.get_type(), experimental::node_type::memfill); + + auto NodePrefetch = Graph.add( + [&](sycl::handler &cgh) { cgh.prefetch(PtrA, 16 * sizeof(int)); }); + ASSERT_EQ(NodePrefetch.get_type(), experimental::node_type::prefetch); + + auto NodeMemadvise = Graph.add( + [&](sycl::handler &cgh) { cgh.mem_advise(PtrA, 16 * sizeof(int), 1); }); + ASSERT_EQ(NodeMemadvise.get_type(), experimental::node_type::memadvise); + + // Use queue recording for barrier since it is not supported in explicit API + Graph.begin_recording(Queue); + auto EventBarrier = + Queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + Graph.end_recording(); + + auto NodeBarrier = experimental::node::get_node_from_event(EventBarrier); + ASSERT_EQ(NodeBarrier.get_type(), + experimental::node_type::ext_oneapi_barrier); + + auto NodeHostTask = + Graph.add([&](sycl::handler &cgh) { cgh.host_task([]() {}); }); + ASSERT_EQ(NodeHostTask.get_type(), experimental::node_type::host_task); + + auto NodeEmpty = Graph.add(); + ASSERT_EQ(NodeEmpty.get_type(), experimental::node_type::empty); + + // TODO: Test subgraph case once changes have been implemented. +} + class MultiThreadGraphTest : public CommandGraphTest { public: MultiThreadGraphTest() From f82ce2022e310408ab5cba3610205a7d09bb092e Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 20 Dec 2023 15:00:08 +0000 Subject: [PATCH 03/15] Update windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 63 ++++++++++++++----------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 7e28aa4f96bbe..5252911902c1d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1262,32 +1262,38 @@ ?get_max_statement_size@stream@_V1@sycl@@QEBA_KXZ ?get_max_statement_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ ?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVdevice@45@AEBVcontext@45@@Z -?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z -?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z -?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ -?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ -?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z -?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ +?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z +?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z +?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ +?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z +?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ +?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z +?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ ?get_pitch@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$01@34@XZ ?get_platform@context@_V1@sycl@@QEBA?AVplatform@23@XZ ?get_platform@device@_V1@sycl@@QEBA?AVplatform@23@XZ ?get_platforms@platform@_V1@sycl@@SA?AV?$vector@Vplatform@_V1@sycl@@V?$allocator@Vplatform@_V1@sycl@@@std@@@std@@XZ -?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z -?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z -?get_precision@stream@_V1@sycl@@QEBA_KXZ -?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ -?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ -?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ -?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ -?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ -?get_size@stream@_V1@sycl@@QEBA_KXZ -?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ -?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z -?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ -?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ -?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ -?get_width@stream@_V1@sycl@@QEBA_KXZ -?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ +?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z +?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z +?get_precision@stream@_V1@sycl@@QEBA_KXZ +?get_predecessors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ +?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ +?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ +?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ +?get_root_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ +?get_size@stream@_V1@sycl@@QEBA_KXZ +?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ +?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z +?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ +?get_successors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ +?get_type@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4node_type@23456@XZ +?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ +?get_width@stream@_V1@sycl@@QEBA_KXZ +?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ ?get_work_item_buffer_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ ?gpu_selector_v@_V1@sycl@@YAHAEBVdevice@12@@Z ?handleHostData@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXPEAX@Z@std@@_K_N@Z @@ -1471,12 +1477,13 @@ ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ -?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ -?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ -?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z -?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z -?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z -?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ +?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ +?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ +?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z +?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z +?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z +?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z +?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXXZ ?set_final_data_internal@image_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z From aec9e04d2d789507dddf76bfb37977febfcebbd0 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 21 Dec 2023 12:44:34 +0000 Subject: [PATCH 04/15] Clarify predecessor/successor wording --- .../extensions/experimental/sycl_ext_oneapi_graph.asciidoc | 4 ++-- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 1a6255baeca7f..1618769e09d80 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -505,14 +505,14 @@ node_type get_type() const; ---- std::vector get_predecessors() const; ---- -|Returns a list of the predecessor nodes which this node depends on. +|Returns a list of the predecessor nodes which this node directly depends on. | [source,c++] ---- std::vector get_successors() const; ---- -|Returns a list of the successor nodes which depend on this node. +|Returns a list of the successor nodes which directly depend on this node. | [source,c++] diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 761910af9911a..ed13a3422c3a2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -92,7 +92,7 @@ enum class node_type { prefetch, memadvise, ext_oneapi_barrier, - host_task, + host_task }; /// Class representing a node in the graph, returned by command_graph::add(). From 94cb93aa073fac09552651b5c67ceaf09845345c Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 8 Jan 2024 15:57:54 +0000 Subject: [PATCH 05/15] [SYCL][Graph] Fix comments, expand testing --- sycl/source/detail/graph_impl.hpp | 2 +- sycl/source/handler.cpp | 13 ++++----- sycl/unittests/Extensions/CommandGraph.cpp | 34 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 5232f9769cbe4..019a9b2dc7018 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -131,7 +131,7 @@ class node_impl { node_impl() {} /// Construct a node representing a command-group. - /// @param CGType Type of the command-group. + /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. node_impl(node_type NodeType, diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 640aaf06471f1..97d45bd9cbe7c 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -504,13 +504,12 @@ event handler::finalize() { ext::oneapi::experimental::detail::graph_impl::WriteLock Lock( GraphImpl->MMutex); - ext::oneapi::experimental::node_type NodeType; - if (auto UserFacingType = MImpl->MUserFacingNodeType; - UserFacingType != ext::oneapi::experimental::node_type::empty) { - NodeType = UserFacingType; - } else { - NodeType = ext::oneapi::experimental::detail::getNodeTypeFromCG(MCGType); - } + ext::oneapi::experimental::node_type NodeType = + MImpl->MUserFacingNodeType != + ext::oneapi::experimental::node_type::empty + ? MImpl->MUserFacingNodeType + : ext::oneapi::experimental::detail::getNodeTypeFromCG(MCGType); + // Create a new node in the graph representing this command-group if (MQueue->isInOrder()) { // In-order queues create implicit linear dependencies between nodes. diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index ab3ffa4e3956c..afbe2e82ac851 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -1992,6 +1992,16 @@ TEST_F(CommandGraphTest, GetNodeQueries) { ASSERT_EQ(NodeA.get_successors().size(), 0lu); ASSERT_EQ(NodeB.get_predecessors().size(), 1lu); ASSERT_EQ(NodeB.get_successors().size(), 0lu); + + // List of nodesthat we've added in the order they were added. + std::vector NodeList{RootA, RootB, NodeA, NodeB, RootC}; + auto GraphNodes = Graph.get_nodes(); + + // Check all nodes + for (size_t i = 0; i < GraphNodes.size(); i++) { + ASSERT_EQ(sycl::detail::getSyclObjImpl(GraphNodes[i]), + sycl::detail::getSyclObjImpl(NodeList[i])); + } } TEST_F(CommandGraphTest, NodeTypeQueries) { @@ -2044,6 +2054,30 @@ TEST_F(CommandGraphTest, NodeTypeQueries) { // TODO: Test subgraph case once changes have been implemented. } +TEST_F(CommandGraphTest, GetNodeFromEvent) { + // Test getting a node from a recorded event and using that as a dependency + // for an explicit node + Graph.begin_recording(Queue); + auto EventKernel = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + experimental::node NodeKernelA = + experimental::node::get_node_from_event(EventKernel); + + // Add node as a dependency with the property + auto NodeKernelB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + experimental::property::node::depends_on(NodeKernelA)); + + // Test adding a dependency through make_edge + auto NodeKernelC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_NO_THROW(Graph.make_edge(NodeKernelA, NodeKernelC)); + + auto GraphExec = Graph.finalize(); +} + class MultiThreadGraphTest : public CommandGraphTest { public: MultiThreadGraphTest() From aecd2419708efdde9b0a1e51f8df5b7ebb8e21fb Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 8 Jan 2024 17:21:19 +0000 Subject: [PATCH 06/15] Replace additional ifs with ternary --- sycl/source/detail/graph_impl.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 6dd86c25a665e..4d46c104a70b0 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -414,13 +414,13 @@ graph_impl::add(const std::shared_ptr &Impl, return Handler.MSubgraphNode; } - node_type NodeType; - if (auto UserFacingType = Handler.MImpl->MUserFacingNodeType; - UserFacingType != node_type::empty) { - NodeType = UserFacingType; - } else { - NodeType = getNodeTypeFromCG(Handler.MCGType); - } + node_type NodeType = + Handler.MImpl->MUserFacingNodeType != + ext::oneapi::experimental::node_type::empty + ? Handler.MImpl->MUserFacingNodeType + : ext::oneapi::experimental::detail::getNodeTypeFromCG( + Handler.MCGType); + auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep); // Add an event associated with this explicit node for mixed usage addEventForNode(Impl, std::make_shared(), NodeImpl); From 79e7066cc6329f69255228f7c7d23f347c23177d Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 9 Jan 2024 09:49:01 +0000 Subject: [PATCH 07/15] Remove ordering wording from get_root_nodes, fix comments --- .../extensions/experimental/sycl_ext_oneapi_graph.asciidoc | 3 +-- sycl/unittests/Extensions/CommandGraph.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 6b55b06918286..eb9221cb194b7 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -860,8 +860,7 @@ were added. ---- std::vector get_root_nodes() const; ---- -|Returns a list of all nodes in the graph which have no dependencies in the -order they were added to the graph. +|Returns a list of all nodes in the graph which have no dependencies. |=== diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index e7f6f34c9123a..5065a55173283 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -2001,11 +2001,11 @@ TEST_F(CommandGraphTest, GetNodeQueries) { ASSERT_EQ(NodeB.get_predecessors().size(), 1lu); ASSERT_EQ(NodeB.get_successors().size(), 0lu); - // List of nodesthat we've added in the order they were added. + // List of nodes that we've added in the order they were added. std::vector NodeList{RootA, RootB, NodeA, NodeB, RootC}; auto GraphNodes = Graph.get_nodes(); - // Check all nodes + // Check ordering of all nodes is correct for (size_t i = 0; i < GraphNodes.size(); i++) { ASSERT_EQ(sycl::detail::getSyclObjImpl(GraphNodes[i]), sycl::detail::getSyclObjImpl(NodeList[i])); From d1effccd5179139358e8f43952498e9e53e5467a Mon Sep 17 00:00:00 2001 From: Greg Lueck Date: Wed, 10 Jan 2024 18:08:53 -0500 Subject: [PATCH 08/15] [SYCL][Doc] Fix formatting in extension template (#12346) Commit b32db9a4cdd7ffd9ca4874e495d06766e10c8889 added some new suggested text to the extension specification template, which was formatted as a quote. This formatting is inconsistent with the other suggested text, which is not formatted as a quote. Fix this. --- sycl/doc/extensions/template.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/template.asciidoc b/sycl/doc/extensions/template.asciidoc index 166ccd2174b43..31d430ed56049 100644 --- a/sycl/doc/extensions/template.asciidoc +++ b/sycl/doc/extensions/template.asciidoc @@ -135,16 +135,16 @@ _It is also appropriate to give an indication of who the target audience is for the extension. For example, if the extension is intended only for ninja programmers, we might say something like:_ -> The properties described in this extension are advanced features that most -> applications should not need to use. In most cases, applications get the -> best performance without using these properties. +The properties described in this extension are advanced features that most +applications should not need to use. In most cases, applications get the best +performance without using these properties. _Occasionally, we might add an extension as a stopgap measure for a limited audience. When this happens, it's best to discourage general usage with a statement like:_ -> This extension exists to solve a specific problem, and a general solution is -> still being evaluated. It is not recommended for general usage. +This extension exists to solve a specific problem, and a general solution is +still being evaluated. It is not recommended for general usage. _Note that text should be wrapped at 80 columns as shown in this template. Extensions use AsciiDoc markup language (like this template). If you need help From d22ae3e5a983edb112f1548fe1d096b6b2a2dcc2 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 10 Jan 2024 23:09:10 +0000 Subject: [PATCH 09/15] [SYCL][Doc] Linked extension process readme from dpcpp readme. (#12332) This small addition signposts interested users to instructions on adding DPC++ extensions. It was motivated by https://github.com/intel/llvm/issues/12251#issuecomment-1879736917 When looking into the logic of the docs, I found there was no signposting from the initial repo readme to tell contributors how to propose their own extensions. There have been some important third-party contributions to dpc++ and in the future it is possible such contributions could include proposing new oneapi extensions. Signed-off-by: JackAKirk --- sycl/doc/developer/ContributeToDPCPP.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md index c9e2f1ee42635..ee60eb5a59d70 100644 --- a/sycl/doc/developer/ContributeToDPCPP.md +++ b/sycl/doc/developer/ContributeToDPCPP.md @@ -10,6 +10,8 @@ All changes made to the DPC++ compiler and runtime library should generally preserve existing ABI/API and contributors should avoid making incompatible changes. One of the exceptions is experimental APIs, clearly marked so by namespace or related specification. +If you wish to propose a new experimental DPC++ extension then read +[README-process.md](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/README-process.md). Another exceptional case is the transition from SYCL 1.2.1 to SYCL 2020 standard. From 6b1021cda9800b962b01f4fb42e66a8c30def8c8 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Wed, 10 Jan 2024 15:09:25 -0800 Subject: [PATCH 10/15] [CLANG][NFC] Cleanup LIT test. (#12322) --- clang/test/CodeGen/fp-accuracy.c | 196 +++++++++++++++---------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index b6183f79c102d..a59fb82fbd898 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -168,47 +168,47 @@ double rsqrt(double); // CHECK-F2: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] // // CHECK-F3-LABEL: define dso_local void @f1 -// CHECK-F3: call double @llvm.fpbuiltin.acos.f64(double %conv) #[[ATTR_F3_HIGH:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.acosh.f64(double %conv2) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.asin.f64(double %conv4) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.asinh.f64(double %conv6) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atan.f64(double %conv8) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atan2.f64(double %conv10, double %conv11) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double %conv13) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double %conv15) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double %conv17) #[[ATTR_F3_HIGH]] -// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double %conv19) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double %conv21) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double %conv23) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double %conv25) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp2.f64(double %conv27) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.expm1.f64(double %conv29) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fadd.f64(double %conv31, double %conv32) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fdiv.f64(double %conv34, double %conv35) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fmul.f64(double %conv37, double %conv38) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.frem.f64(double %conv40, double %conv41) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fsub.f64(double %conv43, double %conv44) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.hypot.f64(double %conv46, double %conv47) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.ldexp.f64(double %conv49, i32 %conv50) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log.f64(double %conv52) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double %conv54) #[[ATTR_F3_MEDIUM:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.log1p.f64(double %conv56) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log2.f64(double %conv58) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.pow.f64(double %conv60, double %conv61) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.rsqrt.f64(double %conv63) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.sin.f64(double %conv65) #[[ATTR_F3_HIGH]] -// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double %conv67, ptr %p1, ptr %p2) #[[ATTR_F3_MEDIUM]] -// CHECK-F3: call double @llvm.fpbuiltin.sinh.f64(double %conv68) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.sqrt.f64(double %conv70) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double %conv72) #[[ATTR_F3_LOW:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double %conv74) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F3_HIGH:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp2.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fdiv.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fmul.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.frem.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fsub.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log2.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" // // CHECK-LABEL-F4: define dso_local void @f1 -// CHECK-F4: call double @llvm.fpbuiltin.acos.f64(double %conv) #[[ATTR_F4_MEDIUM:[0-9]+]] +// CHECK-F4: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM:[0-9]+]] // CHECK-F4: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -237,50 +237,50 @@ double rsqrt(double); // CHECK-F4: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] -// CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr %p1, ptr %p2) +// CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) // CHECK-F4: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // -// CHECK-F5-LABEL: define dso_local void @f1( -// CHECK-F5: call double @acos(double noundef {{.*}}) -// CHECK-F5: call double @acosh(double noundef {{.*}}) -// CHECK-F5: call double @asin(double noundef {{.*}}) -// CHECK-F5: call double @asinh(double noundef {{.*}}) -// CHECK-F5: call double @atan(double noundef {{.*}}) -// CHECK-F5: call double @atan2(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @atanh(double noundef {{.*}}) +// CHECK-F5-LABEL: define dso_local void @f1 +// CHECK-F5: call double @acos(double {{.*}}) +// CHECK-F5: call double @acosh(double {{.*}}) +// CHECK-F5: call double @asin(double {{.*}}) +// CHECK-F5: call double @asinh(double {{.*}}) +// CHECK-F5: call double @atan(double {{.*}}) +// CHECK-F5: call double @atan2(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @atanh(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM:[0-9]+]] -// CHECK-F5: call double @cosh(double noundef {{.*}}) -// CHECK-F5: call double @erf(double noundef {{.*}}) -// CHECK-F5: call double @erfc(double noundef {{.*}}) +// CHECK-F5: call double @cosh(double {{.*}}) +// CHECK-F5: call double @erf(double {{.*}}) +// CHECK-F5: call double @erfc(double {{.*}}) // CHECK-F5: call double @llvm.exp.f64(double {{.*}}) -// CHECK-F5: call i32 (double, ...) @exp10(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ...) @exp10(double {{.*}}) // CHECK-F5: call double @llvm.exp2.f64(double {{.*}}) -// CHECK-F5: call double @expm1(double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fadd(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fdiv(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fmul(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @frem(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fsub(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @hypot(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @ldexp(double noundef {{.*}}, i32 noundef {{.*}}) +// CHECK-F5: call double @expm1(double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fadd(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fdiv(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fmul(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @frem(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fsub(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @hypot(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @ldexp(double {{.*}}, i32 {{.*}}) // CHECK-F5: call double @llvm.log.f64(double {{.*}}) // CHECK-F5: call double @llvm.log10.f64(double {{.*}}) -// CHECK-F5: call double @log1p(double noundef {{.*}}) +// CHECK-F5: call double @log1p(double {{.*}}) // CHECK-F5: call double @llvm.log2.f64(double {{.*}}) // CHECK-F5: call double @llvm.pow.f64(double {{.*}}, double {{.*}}) -// CHECK-F5: call i32 (double, ...) @rsqrt(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ...) @rsqrt(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F5_HIGH:[0-9]+]] -// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-F5: call double @sinh(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-F5: call double @sinh(double {{.*}}) // CHECK-F5: call double @llvm.sqrt.f64(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F5_HIGH]] -// CHECK-F5: call double @tanh(double noundef {{.*}}) +// CHECK-F5: call double @tanh(double {{.*}}) // // -// CHECK-F6-LABEL: define dso_local void @f1( +// CHECK-F6-LABEL: define dso_local void @f1 // CHECK-F6: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM:[0-9]+]] // CHECK-F6: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] @@ -404,7 +404,7 @@ void f1(float a, float b) { // CHECK-F1: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F1_LOW]] // CHECK-F1: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F1_MEDIUM]] // CHECK-F1: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F1_MEDIUM]] -// CHECK-F1: call float @tanf(float noundef {{.*}}) +// CHECK-F1: call float @tanf(float {{.*}}) // // CHECK-F2-LABEL: define dso_local void @f2 // CHECK-F2: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F2_MEDIUM]] @@ -412,7 +412,7 @@ void f1(float a, float b) { // CHECK-F2: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F2_HIGH]] // CHECK-F2: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] -// CHECK-F2: call float @tanf(float noundef {{.*}}) +// CHECK-F2: call float @tanf(float {{.*}}) // // CHECK-LABEL-F4: define dso_local void @f2 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -422,24 +422,24 @@ void f1(float a, float b) { // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @tanf(float {{.*}}) // -// CHECK-F5-LABEL: define dso_local void @f2( +// CHECK-F5-LABEL: define dso_local void @f2 // CHECK-F5: call float @llvm.cos.f32(float {{.*}}) // CHECK-F5: call float @llvm.sin.f32(float {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F5_HIGH]] // CHECK-F5: call double @llvm.log10.f64(double {{.*}}) -// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-F5: call float @tanf(float noundef {{.*}}) +// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-F5: call float @tanf(float {{.*}}) // // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // -// CHECK-F6-LABEL: define dso_local void @f2( +// CHECK-F6-LABEL: define dso_local void @f2 // CHECK-F6: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]] -// CHECK-F6: call float @tanf(float noundef {{.*}}) #[[ATTR8:[0-9]+]] +// CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]] // // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" @@ -450,7 +450,7 @@ void f1(float a, float b) { // CHECK-SPIR: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_SYCL2]] // CHECK-SPIR: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_SYCL5]] // CHECK-SPIR: call void @llvm.fpbuiltin.sincos.f32(float {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_SYCL1]] -// CHECK-SPIR: call spir_func float @tanf(float noundef {{.*}}) +// CHECK-SPIR: call spir_func float @tanf(float {{.*}}) // CHECK-LABEL: define dso_local void @f3 // CHECK: call float @fake_exp10(float {{.*}}) @@ -480,48 +480,48 @@ void f1(float a, float b) { // CHECK-SPIR: attributes #[[ATTR_SYCL8]] = {{.*}}"fpbuiltin-max-error"="2.0" // CHECK-DEFAULT-LABEL: define dso_local void @f1 -// CHECK-DEFAULT: call double @acos(double noundef {{.*}}) -// CHECK-DEFAULT: call double @acosh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @asin(double noundef {{.*}}) -// CHECK-DEFAULT: call double @asinh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @atan(double noundef {{.*}}) -// CHECK-DEFAULT: call double @atan2(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @atanh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @acos(double {{.*}}) +// CHECK-DEFAULT: call double @acosh(double {{.*}}) +// CHECK-DEFAULT: call double @asin(double {{.*}}) +// CHECK-DEFAULT: call double @asinh(double {{.*}}) +// CHECK-DEFAULT: call double @atan(double {{.*}}) +// CHECK-DEFAULT: call double @atan2(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @atanh(double {{.*}}) // CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) -// CHECK-DEFAULT: call double @cosh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @erf(double noundef {{.*}}) -// CHECK-DEFAULT: call double @erfc(double noundef {{.*}}) +// CHECK-DEFAULT: call double @cosh(double {{.*}}) +// CHECK-DEFAULT: call double @erf(double {{.*}}) +// CHECK-DEFAULT: call double @erfc(double {{.*}}) // CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ...) @exp10(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @exp10(double {{.*}}) // CHECK-DEFAULT: call double @llvm.exp2.f64(double {{.*}}) -// CHECK-DEFAULT: call double @expm1(double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fadd(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fdiv(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fmul(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @frem(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fsub(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @hypot(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @ldexp(double noundef {{.*}}, i32 noundef {{.*}}) +// CHECK-DEFAULT: call double @expm1(double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fadd(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fdiv(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fmul(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @frem(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fsub(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @hypot(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @ldexp(double {{.*}}, i32 {{.*}}) // CHECK-DEFAULT: call double @llvm.log.f64(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) -// CHECK-DEFAULT: call double @log1p(double noundef {{.*}}) +// CHECK-DEFAULT: call double @log1p(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log2.f64(double {{.*}}) // CHECK-DEFAULT: call double @llvm.pow.f64(double {{.*}}, double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double {{.*}}) // CHECK-DEFAULT: call double @llvm.sin.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-DEFAULT: call double @sinh(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-DEFAULT: call double @sinh(double {{.*}}) // CHECK-DEFAULT: call double @llvm.sqrt.f64(double {{.*}}) -// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) -// CHECK-DEFAULT: call double @tanh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @tan(double {{.*}}) +// CHECK-DEFAULT: call double @tanh(double {{.*}}) // // CHECK-DEFAULT-LABEL: define dso_local void @f2 // CHECK-DEFAULT: call float @llvm.cos.f32(float {{.*}}) // CHECK-DEFAULT: call float @llvm.sin.f32(float {{.*}}) -// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) +// CHECK-DEFAULT: call double @tan(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-DEFAULT: call float @tanf(float noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-DEFAULT: call float @tanf(float {{.*}}) // CHECK-DEFAULT-LABEL: define dso_local void @f3 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) From 5245c75898771cd42480f0e3e09beddf04a703d0 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Wed, 10 Jan 2024 18:49:11 -0600 Subject: [PATCH 11/15] [SYCL][Matrix tests]make multi_ptr access direct inside the kernel (#12351) This change explicitly defines multi_ptr inside the kernel instead of outside the kernel. Before, we rely on the runtime to correctly treat the lambda capture clause and create correct private copies for each of these variables. With this change, we are making sure that these variables are correctly captured (private copies) inside the kernel. --- .../joint_matrix_annotated_ptr_impl.hpp | 15 +++++++------ .../joint_matrix_bf16_fill_k_cache_impl.hpp | 17 ++++++++------- .../joint_matrix_colA_rowB_colC_impl.hpp | 17 ++++++++------- .../Matrix/joint_matrix_out_bounds_impl.hpp | 17 ++++++++------- .../Matrix/joint_matrix_transposeC_impl.hpp | 21 ++++++++++--------- 5 files changed, 48 insertions(+), 39 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp index 619cac27a4d28..d90d46a90f801 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp @@ -6,18 +6,21 @@ template (A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 4349b32745751..0c458f67658d5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -76,14 +76,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { assert(rowsA % tM == 0); assert(colsA % tK == 0); assert(colsB % tN == 0); - - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - // submit main kernel std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); @@ -94,6 +86,15 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { // loop global // loop localrange [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); auto m2 = it.get_group(0); auto n2 = it.get_group(1); auto m1 = it.get_local_id(0); diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp index c2716c94a359c..f11ef0eadd7df 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -19,19 +19,22 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp index 3607eab14fbc0..82116e6ea2b2a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp @@ -18,20 +18,21 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q, unsigned int vnniFactor) { // Add one iteration for the out of bounds dpas instruction size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0); size_t NDRangeN = N / TN; - - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 3564472c5d958..9c435d1ee4337 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -13,20 +13,21 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - auto p_input = address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); From 218d9fe17de171ffb5c1b48c204b54e8a0675053 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 10 Jan 2024 23:43:20 -0800 Subject: [PATCH 12/15] [SYCL] Limit the directories that are searched when loading dependencies of the plugins (#12336) Currently the default search order is used when loading dependencies of the plugins (these dependencies include the Level Zero loader and the ICD loader for opencl and level zero plugins respectively) and that list includes current directory and some other directories which are not considered safe. This patch limits the list of directories when loading the dependencies of the plugins. See: https://learn.microsoft.com/en-us/windows/win32/api/libloaderapi/nf-libloaderapi-loadlibraryexa for reference. --------- Co-authored-by: aelovikov-intel --- .../pi_win_proxy_loader.cpp | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp b/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp index f687e811cfe09..53f59b1b18f80 100644 --- a/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp +++ b/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp @@ -133,31 +133,23 @@ void preloadLibraries() { MapT &dllMap = getDllMap(); - auto ocl_path = LibSYCLDir / __SYCL_OPENCL_PLUGIN_NAME; - dllMap.emplace(ocl_path, - LoadLibraryEx(ocl_path.wstring().c_str(), NULL, NULL)); - - auto l0_path = LibSYCLDir / __SYCL_LEVEL_ZERO_PLUGIN_NAME; - dllMap.emplace(l0_path, LoadLibraryEx(l0_path.wstring().c_str(), NULL, NULL)); - - auto cuda_path = LibSYCLDir / __SYCL_CUDA_PLUGIN_NAME; - dllMap.emplace(cuda_path, - LoadLibraryEx(cuda_path.wstring().c_str(), NULL, NULL)); - - auto esimd_path = LibSYCLDir / __SYCL_ESIMD_EMULATOR_PLUGIN_NAME; - dllMap.emplace(esimd_path, - LoadLibraryEx(esimd_path.wstring().c_str(), NULL, NULL)); - - auto hip_path = LibSYCLDir / __SYCL_HIP_PLUGIN_NAME; - dllMap.emplace(hip_path, - LoadLibraryEx(hip_path.wstring().c_str(), NULL, NULL)); - - auto ur_path = LibSYCLDir / __SYCL_UNIFIED_RUNTIME_PLUGIN_NAME; - dllMap.emplace(ur_path, LoadLibraryEx(ur_path.wstring().c_str(), NULL, NULL)); - - auto nativecpu_path = LibSYCLDir / __SYCL_NATIVE_CPU_PLUGIN_NAME; - dllMap.emplace(nativecpu_path, - LoadLibraryEx(nativecpu_path.wstring().c_str(), NULL, NULL)); + // When searching for dependencies of the plugins limit the + // list of directories to %windows%\system32 and the directory that contains + // the loaded DLL (the plugin). This is necessary to avoid loading dlls from + // current directory and some other directories which are considered unsafe. + auto loadPlugin = [&](auto pluginName, + DWORD flags = LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR | + LOAD_LIBRARY_SEARCH_SYSTEM32) { + auto path = LibSYCLDir / pluginName; + dllMap.emplace(path, LoadLibraryEx(path.wstring().c_str(), NULL, flags)); + }; + loadPlugin(__SYCL_OPENCL_PLUGIN_NAME); + loadPlugin(__SYCL_LEVEL_ZERO_PLUGIN_NAME); + loadPlugin(__SYCL_CUDA_PLUGIN_NAME); + loadPlugin(__SYCL_ESIMD_EMULATOR_PLUGIN_NAME); + loadPlugin(__SYCL_HIP_PLUGIN_NAME); + loadPlugin(__SYCL_UNIFIED_RUNTIME_PLUGIN_NAME); + loadPlugin(__SYCL_NATIVE_CPU_PLUGIN_NAME); // Restore system error handling. (void)SetErrorMode(SavedMode); From 8ea022954ec109fd243d5f9c0d99c1933b1793e5 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 11 Jan 2024 12:00:11 +0000 Subject: [PATCH 13/15] [SYCL][Graph] Add support for fill and memset nodes in graphs (#11472) - Adds support for fill and memset nodes in graphs. - Supported on Level Zero only for now. - Adds E2E and unit tests for these new node types. - Minor modifications due to renaming of some UR functions. --------- Co-authored-by: Maxime France-Pillois Co-authored-by: Ewan Crawford --- sycl/doc/design/CommandGraph.md | 4 + sycl/include/sycl/detail/pi.def | 2 + sycl/include/sycl/detail/pi.h | 45 +++++++++- sycl/plugins/cuda/pi_cuda.cpp | 21 +++++ sycl/plugins/hip/pi_hip.cpp | 21 +++++ sycl/plugins/level_zero/pi_level_zero.cpp | 21 +++++ sycl/plugins/opencl/pi_opencl.cpp | 21 +++++ sycl/plugins/unified_runtime/pi2ur.hpp | 31 +++++++ .../unified_runtime/pi_unified_runtime.cpp | 21 +++++ sycl/source/detail/memory_manager.cpp | 44 ++++++++++ sycl/source/detail/memory_manager.hpp | 18 ++++ sycl/source/detail/scheduler/commands.cpp | 22 +++++ sycl/test-e2e/Graph/Explicit/buffer_fill.cpp | 11 +++ sycl/test-e2e/Graph/Explicit/usm_memset.cpp | 11 +++ sycl/test-e2e/Graph/Inputs/buffer_fill.cpp | 88 +++++++++++++++++++ sycl/test-e2e/Graph/Inputs/usm_memset.cpp | 34 +++++++ .../Graph/RecordReplay/buffer_fill.cpp | 11 +++ .../Graph/RecordReplay/usm_memset.cpp | 11 +++ sycl/test/abi/pi_cuda_symbol_check.dump | 2 + sycl/test/abi/pi_hip_symbol_check.dump | 2 + sycl/test/abi/pi_level_zero_symbol_check.dump | 2 + sycl/test/abi/pi_opencl_symbol_check.dump | 2 + sycl/test/abi/sycl_symbols_linux.dump | 2 + sycl/test/abi/sycl_symbols_windows.dump | 2 + sycl/unittests/Extensions/CommandGraph.cpp | 62 +++++++++++++ sycl/unittests/helpers/PiMockPlugin.hpp | 17 ++++ 26 files changed, 524 insertions(+), 4 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/buffer_fill.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/usm_memset.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/buffer_fill.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/usm_memset.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 7c405d5ca791b..fed89532ff7ce 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -37,12 +37,14 @@ with the following entry-points: | `urCommandBufferFinalizeExp` | No more commands can be appended, makes command-buffer ready to enqueue on a command-queue. | | `urCommandBufferAppendKernelLaunchExp` | Append a kernel execution command to command-buffer. | | `urCommandBufferAppendUSMMemcpyExp` | Append a USM memcpy command to the command-buffer. | +| `urCommandBufferAppendUSMFillExp` | Append a USM fill command to the command-buffer. | | `urCommandBufferAppendMemBufferCopyExp` | Append a mem buffer copy command to the command-buffer. | | `urCommandBufferAppendMemBufferWriteExp` | Append a memory write command to a command-buffer object. | | `urCommandBufferAppendMemBufferReadExp` | Append a memory read command to a command-buffer object. | | `urCommandBufferAppendMemBufferCopyRectExp` | Append a rectangular memory copy command to a command-buffer object. | | `urCommandBufferAppendMemBufferWriteRectExp` | Append a rectangular memory write command to a command-buffer object. | | `urCommandBufferAppendMemBufferReadRectExp` | Append a rectangular memory read command to a command-buffer object. | +| `urCommandBufferAppendMemBufferFillExp` | Append a memory fill command to a command-buffer object. | | `urCommandBufferEnqueueExp` | Submit command-buffer to a command-queue for execution. | See the [UR EXP-COMMAND-BUFFER](https://oneapi-src.github.io/unified-runtime/core/EXP-COMMAND-BUFFER.html) @@ -347,6 +349,8 @@ The types of commands which are unsupported, and lead to this exception are: This corresponds to a memory buffer write command. * `handler::copy(src, dest)` or `handler::memcpy(dest, src)` - Where both `src` and `dest` are USM pointers. This corresponds to a USM copy command. +* `handler::memset(ptr, value, numBytes)` - This corresponds to a USM memory + fill command. Note that `handler::copy(src, dest)` where both `src` and `dest` are an accessor is supported, as a memory buffer copy command exists in the OpenCL extension. diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 69513335bf191..d963cfb860f4e 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -176,6 +176,8 @@ _PI_API(piextCommandBufferMemBufferWrite) _PI_API(piextCommandBufferMemBufferWriteRect) _PI_API(piextCommandBufferMemBufferRead) _PI_API(piextCommandBufferMemBufferReadRect) +_PI_API(piextCommandBufferMemBufferFill) +_PI_API(piextCommandBufferFillUSM) _PI_API(piextEnqueueCommandBuffer) _PI_API(piextUSMPitchedAlloc) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 010c59dd3c9d6..9860906e0f847 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -147,9 +147,10 @@ // 14.38 Change PI_MEM_ADVICE_* values to flags for use in bitwise operations. // 14.39 Added PI_EXT_INTEL_DEVICE_INFO_ESIMD_SUPPORT device info query. // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones. +// 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM #define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 40 +#define _PI_H_VERSION_MINOR 41 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -2441,7 +2442,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect( pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, + void *ptr, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); @@ -2458,7 +2459,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect( /// \param sync_point The sync_point associated with this memory operation. __SYCL_EXPORT pi_result piextCommandBufferMemBufferWrite( pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, - size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, + size_t size, const void *ptr, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); @@ -2483,7 +2484,43 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferWriteRect( pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, + const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a mem buffer fill command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the location to fill the data. +/// \param pattern pointer to the pattern to fill the buffer with. +/// \param pattern_size size of the pattern in bytes. +/// \param offset Offset into the buffer to fill from. +/// \param size fill size in bytes. +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer command_buffer, pi_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a USM fill command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param ptr pointer to the USM allocation to fill. +/// \param pattern pointer to the pattern to fill ptr with. +/// \param pattern_size size of the pattern in bytes. +/// \param size fill size in bytes. +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferFillUSM( + pi_ext_command_buffer command_buffer, void *ptr, const void *pattern, + size_t pattern_size, size_t size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 94a39137ec4f7..5eb06f37b2237 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1137,6 +1137,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 7095526dc1d34..775183d82d239 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1145,6 +1145,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index eb5ab8a42259f..6cb5322fa778f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1303,6 +1303,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 6cc6a325af923..7512d411144ab 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1081,6 +1081,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index d3051c47bd93b..75d1bd598e80a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4580,6 +4580,37 @@ inline pi_result piextCommandBufferMemBufferWrite( return PI_SUCCESS; } +inline pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + + HANDLE_ERRORS(urCommandBufferAppendMemBufferFillExp( + UrCommandBuffer, UrBuffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferFillUSM( + pi_ext_command_buffer CommandBuffer, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferAppendUSMFillExp( + UrCommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint)); + return PI_SUCCESS; +} + inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index ab5b801c3fda3..7cb9fdbb9b554 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1102,6 +1102,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index ae357a8f4fe5b..d0071dbabd15a 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1666,6 +1666,50 @@ void MemoryManager::ext_oneapi_copy_usm_cmd_buffer( } } +void MemoryManager::ext_oneapi_fill_usm_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, + size_t Len, int Pattern, std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + + if (!DstMem) + throw runtime_error("NULL pointer argument in memory fill operation.", + PI_ERROR_INVALID_VALUE); + + const PluginPtr &Plugin = Context->getPlugin(); + // Pattern is interpreted as an unsigned char so pattern size is always 1. + size_t PatternSize = 1; + Plugin->call( + CommandBuffer, DstMem, &Pattern, PatternSize, Len, Deps.size(), + Deps.data(), OutSyncPoint); +} + +void MemoryManager::ext_oneapi_fill_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + void *Mem, size_t PatternSize, const char *Pattern, unsigned int Dim, + sycl::range<3> Size, sycl::range<3> AccessRange, sycl::id<3> AccessOffset, + unsigned int ElementSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + + const PluginPtr &Plugin = Context->getPlugin(); + if (SYCLMemObj->getType() != detail::SYCLMemObjI::MemObjType::Buffer) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Images are not supported in Graphs"); + } + if (Dim <= 1) { + Plugin->call( + CommandBuffer, pi::cast(Mem), Pattern, + PatternSize, AccessOffset[0] * ElementSize, + AccessRange[0] * ElementSize, Deps.size(), Deps.data(), OutSyncPoint); + return; + } + throw runtime_error("Not supported configuration of fill requested", + PI_ERROR_INVALID_OPERATION); +} + void MemoryManager::copy_image_bindless( void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index a1b68b1418c69..6169c99392f66 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -316,6 +316,24 @@ class __SYCL_EXPORT MemoryManager { void *DstMem, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + static void ext_oneapi_fill_usm_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, + size_t Len, int Pattern, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + + static void + ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, void *Mem, + size_t PatternSize, const char *Pattern, + unsigned int Dim, sycl::range<3> Size, + sycl::range<3> AccessRange, + sycl::id<3> AccessOffset, unsigned int ElementSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + static void copy_image_bindless(void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index db14a10943ce3..2ffc0ebd54a38 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2764,6 +2764,28 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return PI_SUCCESS; } + case CG::CGTYPE::Fill: { + CGFill *Fill = (CGFill *)MCommandGroup.get(); + Requirement *Req = (Requirement *)(Fill->getReqToFill()); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::ext_oneapi_fill_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(), + AllocaCmd->getMemAllocation(), Fill->MPattern.size(), + Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange, + Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } + case CG::CGTYPE::FillUSM: { + CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get(); + MemoryManager::ext_oneapi_fill_usm_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(), + Fill->getLength(), Fill->getFill(), std::move(MSyncPointDeps), + &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } default: throw runtime_error("CG type not implemented for command buffers.", PI_ERROR_INVALID_OPERATION); diff --git a/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp b/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp new file mode 100644 index 0000000000000..73b961994a72b --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} +// +// TODO enable cuda once buffer issue investigated and fixed +// UNSUPPORTED: cuda + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_fill.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp new file mode 100644 index 0000000000000..a8a42abc1acd0 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} + +// USM memset command not supported for OpenCL +// UNSUPPORTED: opencl + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_memset.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp b/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp new file mode 100644 index 0000000000000..351194dadda0f --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp @@ -0,0 +1,88 @@ +// Tests adding a Buffer fill operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 10; + const float Pattern = 3.14f; + std::vector Data(N); + buffer Buffer{Data}; + + const uint64_t PatternI64 = 0x3333333355555555; + std::vector DataI64(N); + buffer BufferI64{DataI64}; + + const uint32_t PatternI32 = 888; + std::vector DataI32(N); + buffer BufferI32{DataI32}; + + const uint16_t PatternI16 = 777; + std::vector DataI16(N); + buffer BufferI16{DataI16}; + + const uint8_t PatternI8 = 33; + std::vector DataI8(N); + buffer BufferI8{DataI8}; + + Buffer.set_write_back(false); + BufferI64.set_write_back(false); + BufferI32.set_write_back(false); + BufferI16.set_write_back(false); + BufferI8.set_write_back(false); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Pattern); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI64.get_access(CGH); + CGH.fill(Acc, PatternI64); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI32.get_access(CGH); + CGH.fill(Acc, PatternI32); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI16.get_access(CGH); + CGH.fill(Acc, PatternI16); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI8.get_access(CGH); + CGH.fill(Acc, PatternI8); + }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + } + host_accessor HostData(Buffer); + host_accessor HostDataI64(BufferI64); + host_accessor HostDataI32(BufferI32); + host_accessor HostDataI16(BufferI16); + host_accessor HostDataI8(BufferI8); + for (int i = 0; i < N; i++) { + assert(HostData[i] == Pattern); + assert(HostDataI64[i] == PatternI64); + assert(HostDataI32[i] == PatternI32); + assert(HostDataI16[i] == PatternI16); + assert(HostDataI8[i] == PatternI8); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_memset.cpp b/sycl/test-e2e/Graph/Inputs/usm_memset.cpp new file mode 100644 index 0000000000000..f357b9b3a5adf --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_memset.cpp @@ -0,0 +1,34 @@ +// Tests adding a USM memset operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + unsigned char *Arr = malloc_device(N, Queue); + + int Value = 77; + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.memset(Arr, Value, N); }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == Value); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp new file mode 100644 index 0000000000000..91729ace49742 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} +// +// TODO enable cuda once buffer issue investigated and fixed +// UNSUPPORTED: cuda + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_fill.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp new file mode 100644 index 0000000000000..acbb0a502c67f --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} + +// USM memset command not supported for OpenCL +// UNSUPPORTED: opencl + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_memset.cpp" diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index abf73cce97bd0..fcdf008702292 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -84,9 +84,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 3940b6d80677a..c0dacf2632e9b 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -84,9 +84,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 38b3a420b2e71..a2bd23cbf26ce 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -83,9 +83,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 11ee74902849b..8bece2c54db32 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -83,9 +83,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 62336dad2ed28..6a3bbbab13a5e 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3926,10 +3926,12 @@ _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1 _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvRKSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvRKSt6vectorIP9_pi_eventSaISB_EEPSB_RKS5_INS1_10event_implEE _ZN4sycl3_V16detail13MemoryManager24allocateInteropMemObjectESt10shared_ptrINS1_12context_implEEPvRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager26ext_oneapi_fill_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvmPKcjNS0_5rangeILi3EEESE_NS0_2idILi3EEEjSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSA_jSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2H_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjPcjSC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyH2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPcjNS0_5rangeILi3EEENS0_2idILi3EEEjPvjSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager30ext_oneapi_copy_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEPKvP22_pi_ext_command_buffermPvSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager30ext_oneapi_fill_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPvmiSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager3mapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEENS0_6access4modeEjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ _ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_ _ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_RKS6_INS1_10event_implEE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index c3e000b4f9553..f20eb9cede900 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1041,6 +1041,8 @@ ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ ?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z +?ext_oneapi_fill_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAX_KPEBDIV?$range@$02@34@6V?$id@$02@34@IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_fill_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KHV?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ ?ext_oneapi_get_kernel@kernel_bundle_plain@detail@_V1@sycl@@QEAA?AVkernel@34@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z ?ext_oneapi_get_state@queue@_V1@sycl@@QEBA?AW4queue_state@experimental@oneapi@ext@23@XZ diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 8ad08257a1046..a9d5dd4f63081 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -2178,3 +2178,65 @@ TEST_F(MultiThreadGraphTest, Finalize) { ASSERT_EQ(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl), true); } } + +// Test adding fill and memset nodes to a graph +TEST_F(CommandGraphTest, FillMemsetNodes) { + const int Value = 7; + // Buffer fill + buffer Buffer{range<1>{1}}; + Buffer.set_write_back(false); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto NodeA = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + auto NodeB = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + + auto NodeAImpl = sycl::detail::getSyclObjImpl(NodeA); + auto NodeBImpl = sycl::detail::getSyclObjImpl(NodeB); + + // Check Operator== + EXPECT_EQ(NodeAImpl, NodeAImpl); + EXPECT_NE(NodeAImpl, NodeBImpl); + } + + // USM + { + int *USMPtr = malloc_device(1, Queue); + + // We need to create some differences between nodes because unlike buffer + // fills they are not differentiated on accessor ptr value. + auto FillNodeA = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value, 1); }); + auto FillNodeB = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value + 1, 1); }); + auto MemsetNodeA = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 1); }); + auto MemsetNodeB = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 2); }); + + auto FillNodeAImpl = sycl::detail::getSyclObjImpl(FillNodeA); + auto FillNodeBImpl = sycl::detail::getSyclObjImpl(FillNodeB); + auto MemsetNodeAImpl = sycl::detail::getSyclObjImpl(MemsetNodeA); + auto MemsetNodeBImpl = sycl::detail::getSyclObjImpl(MemsetNodeB); + + // Check Operator== + EXPECT_EQ(FillNodeAImpl, FillNodeAImpl); + EXPECT_EQ(FillNodeBImpl, FillNodeBImpl); + EXPECT_NE(FillNodeAImpl, FillNodeBImpl); + + EXPECT_EQ(MemsetNodeAImpl, MemsetNodeAImpl); + EXPECT_EQ(MemsetNodeBImpl, MemsetNodeBImpl); + EXPECT_NE(MemsetNodeAImpl, MemsetNodeBImpl); + sycl::free(USMPtr, Queue); + } +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index f12917b1e70b9..decc1a7e309ee 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1376,6 +1376,23 @@ inline pi_result mock_piextCommandBufferMemBufferCopyRect( return PI_SUCCESS; } +inline pi_result mock_piextCommandBufferMemBufferFill( + pi_ext_command_buffer command_buffer, pi_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferFillUSM( + pi_ext_command_buffer command_buffer, void *ptr, const void *pattern, + size_t pattern_size, size_t size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + inline pi_result mock_piTearDown(void *PluginParameter) { return PI_SUCCESS; } inline pi_result mock_piPluginGetLastError(char **message) { From f303cf35eaf173a59ee63e2d47d9201f8c45fed9 Mon Sep 17 00:00:00 2001 From: Hugh Delaney <46290137+hdelan@users.noreply.github.com> Date: Thu, 11 Jan 2024 12:13:20 +0000 Subject: [PATCH 14/15] [SYCL] Add missing fabs (#12218) `fabs` was missing for Nvidia compilation --- libdevice/cmath_wrapper.cpp | 3 +++ libdevice/cmath_wrapper_fp64.cpp | 4 ++++ libdevice/device_math.h | 6 ++++++ libdevice/fallback-cmath-fp64.cpp | 4 ++++ libdevice/fallback-cmath.cpp | 3 +++ sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp | 9 +++++---- sycl/test-e2e/DeviceLib/cmath_test.cpp | 11 ++++++----- 7 files changed, 31 insertions(+), 9 deletions(-) diff --git a/libdevice/cmath_wrapper.cpp b/libdevice/cmath_wrapper.cpp index 79d8fbf12ce25..5d9c8f0a77d13 100644 --- a/libdevice/cmath_wrapper.cpp +++ b/libdevice/cmath_wrapper.cpp @@ -19,6 +19,9 @@ long int labs(long int x) { return __devicelib_labs(x); } DEVICE_EXTERN_C_INLINE long long int llabs(long long int x) { return __devicelib_llabs(x); } +DEVICE_EXTERN_C_INLINE +float fabsf(float x) { return __devicelib_fabsf(x); } + DEVICE_EXTERN_C_INLINE div_t div(int x, int y) { return __devicelib_div(x, y); } diff --git a/libdevice/cmath_wrapper_fp64.cpp b/libdevice/cmath_wrapper_fp64.cpp index bb5b1986a5a8c..e7b0815ae6526 100644 --- a/libdevice/cmath_wrapper_fp64.cpp +++ b/libdevice/cmath_wrapper_fp64.cpp @@ -15,6 +15,10 @@ // reference. If users provide their own math or complex functions(with // the prototype), functions in device libraries will be ignored and // overrided by users' version. + +DEVICE_EXTERN_C_INLINE +double fabs(double x) { return __devicelib_fabs(x); } + DEVICE_EXTERN_C_INLINE double log(double x) { return __devicelib_log(x); } diff --git a/libdevice/device_math.h b/libdevice/device_math.h index 930bcae7d7967..a402c748299d2 100644 --- a/libdevice/device_math.h +++ b/libdevice/device_math.h @@ -40,6 +40,12 @@ long int __devicelib_labs(long int x); DEVICE_EXTERN_C long long int __devicelib_llabs(long long int x); +DEVICE_EXTERN_C +float __devicelib_fabsf(float x); + +DEVICE_EXTERN_C +double __devicelib_fabs(double x); + DEVICE_EXTERN_C div_t __devicelib_div(int x, int y); diff --git a/libdevice/fallback-cmath-fp64.cpp b/libdevice/fallback-cmath-fp64.cpp index c42855699811a..9656f229c4fd1 100644 --- a/libdevice/fallback-cmath-fp64.cpp +++ b/libdevice/fallback-cmath-fp64.cpp @@ -14,6 +14,10 @@ // To support fallback device libraries on-demand loading, please update the // DeviceLibFuncMap in llvm/tools/sycl-post-link/sycl-post-link.cpp if you add // or remove any item in this file. + +DEVICE_EXTERN_C_INLINE +double __devicelib_fabs(double x) { return x < 0 ? -x : x; } + DEVICE_EXTERN_C_INLINE double __devicelib_log(double x) { return __spirv_ocl_log(x); } diff --git a/libdevice/fallback-cmath.cpp b/libdevice/fallback-cmath.cpp index 28a1463489b17..dc9e2806111f5 100644 --- a/libdevice/fallback-cmath.cpp +++ b/libdevice/fallback-cmath.cpp @@ -25,6 +25,9 @@ long int __devicelib_labs(long int x) { return x < 0 ? -x : x; } DEVICE_EXTERN_C_INLINE long long int __devicelib_llabs(long long int x) { return x < 0 ? -x : x; } +DEVICE_EXTERN_C_INLINE +float __devicelib_fabsf(float x) { return x < 0 ? -x : x; } + DEVICE_EXTERN_C_INLINE div_t __devicelib_div(int x, int y) { return {x / y, x % y}; } diff --git a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp index 994b61485801e..747ce4ed77465 100644 --- a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp +++ b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp @@ -19,12 +19,12 @@ namespace s = sycl; constexpr s::access::mode sycl_read = s::access::mode::read; constexpr s::access::mode sycl_write = s::access::mode::write; -#define TEST_NUM 63 +#define TEST_NUM 64 double ref[TEST_NUM] = { - 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0, 1, 0, 2, 0, 0, - 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, 0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0, 1, 0, 2, 0, 0, + 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, 0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; double refIptr = 1; @@ -59,6 +59,7 @@ template void device_cmath_test(s::queue &deviceQueue) { T minus_infinity = -INFINITY; double subnormal; *((uint64_t *)&subnormal) = 0xFFFFFFFFFFFFFULL; + res_access[i++] = std::fabs(-1.0); res_access[i++] = std::cos(0.0); res_access[i++] = std::sin(0.0); res_access[i++] = std::round(1.0); diff --git a/sycl/test-e2e/DeviceLib/cmath_test.cpp b/sycl/test-e2e/DeviceLib/cmath_test.cpp index 3cc359f79fb94..77aeb312571d2 100644 --- a/sycl/test-e2e/DeviceLib/cmath_test.cpp +++ b/sycl/test-e2e/DeviceLib/cmath_test.cpp @@ -143,15 +143,15 @@ template void device_cmath_test_1(s::queue &deviceQueue) { assert(quo == 0); } -// MSVC implements std::ldexp and std::frexp by invoking the -// 'double' version of corresponding C math functions(ldexp and frexp). Those -// 2 functions can only work on Windows with fp64 extension support from -// underlying device. +// MSVC implements std::ldexp, std::fabs and std::frexp by +// invoking the 'double' version of corresponding C math functions(ldexp, fabs +// and frexp). Those functions can only work on Windows with fp64 extension +// support from underlying device. #ifndef _WIN32 template void device_cmath_test_2(s::queue &deviceQueue) { s::range<1> numOfItems{2}; T result[2] = {-1}; - T ref[2] = {0, 2}; + T ref[3] = {0, 2, 1}; // Variable exponent is an integer value to store the exponent in frexp // function int exponent = -1; @@ -166,6 +166,7 @@ template void device_cmath_test_2(s::queue &deviceQueue) { int i = 0; res_access[i++] = std::frexp(0.0f, &exp_access[0]); res_access[i++] = std::ldexp(1.0f, 1); + res_access[i++] = std::fabs(-1.0f); }); }); } From 8aa2b613d8d94816f48c343f15784b5ef1cdba6c Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 11 Jan 2024 16:08:05 +0100 Subject: [PATCH 15/15] [UR] Document how to customize HIP directories (#12127) Co-authored-by: Kenneth Benzie (Benie) Co-authored-by: ldrumm --- sycl/doc/GetStartedGuide.md | 11 +++++++++++ sycl/plugins/hip/CMakeLists.txt | 8 ++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 2dc2abf704642..0f36f0d38eeb4 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -261,6 +261,17 @@ variable `SYCL_BUILD_PI_HIP_ROCM_DIR` which can be passed using the python $DPCPP_HOME/llvm/buildbot/configure.py --hip \ --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/usr/local/rocm ``` +If further customization is required — for instance when the layout of +individual directories can not be inferred from `SYCL_BUILD_PI_HIP_ROCM_DIR` — +it is possible to specify the location of HIP include, HSA include and HIP +library directories, using the following CMake variables: +* `SYCL_BUILD_PI_HIP_INCLUDE_DIR`, +* `SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR`, +* `SYCL_BUILD_PI_HIP_LIB_DIR`. +Please note that a similar customization would also be required for Unified +Runtime, see [the list of options provided by its +CMake](https://github.com/oneapi-src/unified-runtime#cmake-standard-options) +for details. [LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMDGPU compilation chain. The AMDGPU backend generates a standard ELF relocatable code diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 0f949af2c109c..5d5e8e4782066 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -44,6 +44,14 @@ else() set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_LIB_DIR}") endif() +# Set up defaults for UR +set(UR_HIP_INCLUDE_DIR "${PI_HIP_INCLUDE_DIR}" CACHE PATH + "Custom ROCm HIP include dir") +set(UR_HIP_HSA_INCLUDE_DIRS "${PI_HIP_HSA_INCLUDE_DIR}" CACHE PATH + "Custom ROCm HSA include dir") +set(UR_HIP_LIB_DIR "${PI_HIP_LIB_DIR}" CACHE PATH + "Custom ROCm HIP library dir") + # Mark override options for advanced usage mark_as_advanced(SYCL_BUILD_PI_HIP_INCLUDE_DIR SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR SYCL_BUILD_PI_HIP_LIB_DIR) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c3273c9affb9e..f81efbc880999 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit c2d78257ba7e7bbc230333f291282d16145aaac7 - # Merge: 8bb539c5 b3a1d52d + # commit c53953ae492587698d5adbab8ffee254d97b6a4e + # Merge: 9f88cf88 66d52ace # Author: Kenneth Benzie (Benie) - # Date: Wed Jan 10 11:24:12 2024 +0000 - # Merge pull request #1129 from sarnex/adapters - # [UR][L0] Make urPlatformGetBackendOption return -ze-opt-level=2 for -O1 and -O2 - set(UNIFIED_RUNTIME_TAG c2d78257ba7e7bbc230333f291282d16145aaac7) + # Date: Wed Jan 10 14:50:23 2024 +0000 + # Merge pull request #1170 from jchlanda/jakub/hip_custom_dirs + # [HIP] Allow custom location of ROCm components + set(UNIFIED_RUNTIME_TAG c53953ae492587698d5adbab8ffee254d97b6a4e) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")