diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h
index 9935bc275b8f0..f68afda13076d 100644
--- a/include/onnxruntime/core/graph/graph_viewer.h
+++ b/include/onnxruntime/core/graph/graph_viewer.h
@@ -125,6 +125,18 @@ class GraphViewer {
   /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
   const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
 
+#if !defined(ORT_MINIMAL_BUILD)
+  /** Get the consumer nodes of a node arg */
+  std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
+    return graph_->GetConsumerNodes(node_arg_name);
+  }
+
+  /** Get the producer node of a node arg */
+  const Node* GetProducerNode(const std::string& node_arg_name) const {
+    return graph_->GetProducerNode(node_arg_name);
+  }
+#endif
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
new file mode 100644
index 0000000000000..cb0a175554117
--- /dev/null
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -0,0 +1,153 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/graph/graph_viewer.h"
+#include "onnx/defs/data_type_utils.h"
+#include <queue>
+
+using namespace ONNX_NAMESPACE::Utils;
+
+namespace onnxruntime {
+
+namespace {
+const int64_t Small_Initializer_Threshold = 100;
+
+bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
+  const ONNX_NAMESPACE::TensorProto* initializer_tensor;
+  if (!graph.GetInitializedTensor(arg->Name(), initializer_tensor))
+    return false;
+  int64_t size = 1;
+  for (auto& dim : initializer_tensor->dims()) {
+    size *= dim;
+  }
+  return size <= Small_Initializer_Threshold &&
+         graph.GetConsumerNodes(arg->Name()).size() == 1;
+}
+}  // namespace
+
+/**
+  Returns a list of nodes that are preferred on CPU.
+  They are commonly shape-related computation subgraphs.
+  @param graph Graph viewer
+  @param provider_type The target execution provider type
+  @param kernel_registries Kernel registries for the target EP
+  @param tentative_nodes Nodes that are tentatively placed on the target EP
+*/
+std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
+                                                  const std::string& provider_type,
+                                                  const std::vector<const KernelRegistry*>& kernel_registries,
+                                                  const std::vector<NodeIndex>& tentative_nodes) {
+  const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
+  std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
+  for (size_t id = 0; id < ordered_nodes.size(); ++id) {
+    const NodeIndex& node_id = ordered_nodes[id];
+    node_id_to_order_map[node_id] = id;
+  }
+
+  // If this returns false, n1 will be output first; if it returns true, n2 will be output first
+  auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
+    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
+  };
+
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
+  std::unordered_set<NodeIndex> visited;
+
+  std::unordered_set<const NodeArg*> cpu_output_args;
+  std::unordered_set<NodeIndex> provider_nodes;
+  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
+
+  for (auto& node_id : tentative_nodes) {
+    provider_nodes.insert(node_id);
+    const Node* node = graph.GetNode(node_id);
+
+    const KernelCreateInfo* kernel_info = nullptr;
+    for (auto registry : kernel_registries) {
+      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
+      if (st.IsOK())
+        break;
+    }
+    // at least one registry has a target provider's kernel for this node
+    ORT_ENFORCE(kernel_info != nullptr);
+    node_to_kernel.insert({node_id, kernel_info});
+
+    // first, find all the direct consumers of cpu tensors.
+    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
+        node->OutputDefs(),
+        [&](const NodeArg& node_arg, size_t out_index) {
+          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
+            cpu_output_args.insert(&node_arg);
+            auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
+            for (auto& consumer_node : consumer_nodes) {
+              candidates.push(consumer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
+            }
+          }
+          return Status::OK();
+        }));
+  }
+
+  const std::vector<const NodeArg*>& graph_inputs = graph.GetInputs();
+  std::unordered_set<NodeIndex> cpu_nodes;
+  // The algorithm below tries to identify a subgraph that only depends on cpu tensors.
+  // Usually it is a subgraph that does shape calculation based on a GPU tensor, then reshapes it back.
+  // In detail: for each candidate, if one of its inputs is a cpu tensor and the non-CPU kernel
+  // doesn't mark it as a cpu input, force the node to CPU to avoid a memory copy, and add its
+  // outputs to the set of cpu tensors.
+  while (!candidates.empty()) {
+    NodeIndex cur = candidates.top();
+    candidates.pop();
+    if (visited.count(cur) != 0)
+      continue;
+    visited.insert(cur);
+
+    if (provider_nodes.find(cur) == provider_nodes.end())
+      continue;
+
+    auto* node = graph.GetNode(cur);
+    bool place_in_cpu = true;
+    for (size_t i = 0; i < node->InputDefs().size(); ++i) {
+      auto* input = node->InputDefs()[i];
+
+      // skip placing on CPU if the data type is float16 or bfloat16
+      if (input->Type() == DataTypeUtils::ToType("float16") ||
+          input->Type() == DataTypeUtils::ToType("bfloat16")) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // allow placing on CPU if it's a small initializer or graph input
+      if (IsSmallInitializerWithSingleConsumer(graph, input) ||
+          std::find(graph_inputs.begin(), graph_inputs.end(), input) != graph_inputs.end()) {
+        continue;
+      }
+
+      // the input is not a CPU tensor
+      if (cpu_output_args.find(input) == cpu_output_args.end()) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // the input is a CPU tensor, but it's intended to be consumed as a CPU input by the target EP
+      if (node_to_kernel[cur]->kernel_def->IsInputOnCpu(i)) {
+        place_in_cpu = false;
+        break;
+      }
+    }
+
+    if (place_in_cpu) {
+      cpu_nodes.insert(cur);
+      LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
+      for (auto* output : node->OutputDefs()) {
+        cpu_output_args.insert(output);
+      }
+      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
+        candidates.push((*it).Index());
+      }
+    }
+  }
+
+  return cpu_nodes;
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index cabd267a181d5..a77cbbbe03630 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -7,6 +7,7 @@
 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/fallback_cpu_capability.h"
 #include "core/framework/memcpy.h"
 #include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
@@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
 std::vector<std::unique_ptr<ComputeCapability>>
 CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                      const std::vector<const KernelRegistry*>& kernel_registries) const {
-  std::vector<std::unique_ptr<ComputeCapability>> result;
-  std::unordered_set<const NodeArg*> defs_outside_cuda;
-
+  std::vector<NodeIndex> candidates;
   for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
     const auto* p_node = graph.GetNode(node_index);
     if (p_node == nullptr)
@@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     const auto& node = *p_node;
     const KernelCreateInfo* cuda_kernel_def = nullptr;
     if (!node.GetExecutionProviderType().empty()) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
@@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     // none of the provided registries has a CUDA kernel for this node
     if (cuda_kernel_def == nullptr) {
-      // node is not in cuda exeuction provider if no kernel def found,
-      // or if other execution provider already assigned to it
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
     bool not_supported = false;
-    bool force_outside = false;
     bool force_inside = false;  // for some compute heavy ops, we'll force it to run inside CUDA
     if ("LSTM" == node.OpType()) {
       // the supported activations covers the bidirectional mode
@@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // cast is not compute heavy, and may be placed outside
     }
 
-//Below rule only works for inference, for training, we can't do constant folding.
-//We need find a better solution.
-//Temporary disable the check here, the cost is all the cast will be on GPU now.
-#ifndef ENABLE_TRAINING
-    if (!not_supported && !force_inside) {
-      // Note that nodes with only inputs from initializer would not be place on CUDA
-      // Ideally, those nodes should be eliminated in constant folding
-      bool should_force_outside = true;
-      bool all_inputs_are_initializers = true;
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
-                                               [&](const NodeArg& def, size_t index) {
-                                                 // The input is not a initializer and the input is from CPU
-                                                 // or the input declared as CPU memory and is from CPU
-                                                 // in that case we should still keep the node on CUDA
-                                                 bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
-                                                 bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-                                                 if ((!initializer_input && !input_is_on_cpu) ||
-                                                     (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
-                                                   should_force_outside = false;
-                                                 }
-
-                                                 if (!initializer_input) {
-                                                   all_inputs_are_initializers = false;
-                                                 }
-                                                 return Status::OK();
-                                               }));
-
-      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_inputs_are_initializers) {
-        force_outside = true;
-      }
-    }
-#endif
-    if (!force_inside && (not_supported || force_outside)) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
+    if (!force_inside && not_supported) {
       if (not_supported) {
         LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
-      } else if (force_outside) {
-        LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
       }
     } else {
-      // for nodes placed on CUDA, check if its output is on CPU
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(
-          node.OutputDefs(),
-          [&](const NodeArg& def, size_t out_index) {
-            if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
-              defs_outside_cuda.insert(&def);
-            return Status::OK();
-          }));
-      std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
-      sub_graph->nodes.push_back(node.Index());
-      result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+      candidates.push_back(node.Index());
     }
   }
+
+  // For CUDA EP, exclude the subgraphs that are preferred to be placed on CPU.
+  // These are usually shape-related computation subgraphs.
+  // The following logic can be extended to other EPs.
+  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
+
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+  for (auto& node_index : candidates) {
+    if (cpu_nodes.count(node_index) > 0)
+      continue;
+
+    std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
+    sub_graph->nodes.push_back(node_index);
+    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+  }
   return result;
 }
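
Note (not part of the diff): the comment in `GetCapability` above says the fallback logic can be extended to other EPs. The sketch below illustrates how a hypothetical provider could reuse `GetCpuPreferedNodes` in the same way the CUDA EP now does. `MyExecutionProvider` and its kernel lookup are placeholders; the helper call and the `ComputeCapability` construction simply mirror the changes in this patch.

```cpp
// Sketch only: "MyExecutionProvider" is a hypothetical IExecutionProvider subclass
// used for illustration; the flow mirrors the CUDA EP changes in this diff.
#include "core/framework/compute_capability.h"
#include "core/framework/fallback_cpu_capability.h"
#include "core/framework/kernel_registry.h"

namespace onnxruntime {

std::vector<std::unique_ptr<ComputeCapability>>
MyExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   const std::vector<const KernelRegistry*>& kernel_registries) const {
  // 1. Collect nodes this EP could tentatively take: unassigned nodes with a registered kernel.
  std::vector<NodeIndex> candidates;
  for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
    const auto* p_node = graph.GetNode(node_index);
    if (p_node == nullptr || !p_node->GetExecutionProviderType().empty())
      continue;

    const KernelCreateInfo* kernel_info = nullptr;
    for (auto registry : kernel_registries) {
      if (registry->TryFindKernel(*p_node, Type(), &kernel_info).IsOK())
        break;
    }
    if (kernel_info != nullptr)
      candidates.push_back(node_index);
  }

  // 2. Let the shared heuristic pick the shape-computation subgraphs that should stay on CPU.
  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);

  // 3. Claim everything else, one node per ComputeCapability, as the CUDA EP does.
  std::vector<std::unique_ptr<ComputeCapability>> result;
  for (auto& node_index : candidates) {
    if (cpu_nodes.count(node_index) > 0)
      continue;
    auto sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
    sub_graph->nodes.push_back(node_index);
    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
  }
  return result;
}

}  // namespace onnxruntime
```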