diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h
index 9935bc275b8f0..f68afda13076d 100644
--- a/include/onnxruntime/core/graph/graph_viewer.h
+++ b/include/onnxruntime/core/graph/graph_viewer.h
@@ -125,6 +125,18 @@ class GraphViewer {
   /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
   const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
 
+#if !defined(ORT_MINIMAL_BUILD)
+  /** Get the consumer nodes of a node arg */
+  std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
+    return graph_->GetConsumerNodes(node_arg_name);
+  }
+
+  /** Get the producer node of a node arg */
+  const Node* GetProducerNode(const std::string& node_arg_name) const {
+    return graph_->GetProducerNode(node_arg_name);
+  }
+#endif
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
new file mode 100644
index 0000000000000..cb0a175554117
--- /dev/null
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -0,0 +1,153 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/graph/graph_viewer.h"
+#include "onnx/defs/data_type_utils.h"
+#include <queue>
+
+using namespace ONNX_NAMESPACE::Utils;
+
+namespace onnxruntime {
+
+namespace {
+const int64_t Small_Initializer_Threshold = 100;
+
+bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
+  const ONNX_NAMESPACE::TensorProto* initializer_tensor;
+  if (!graph.GetInitializedTensor(arg->Name(), initializer_tensor))
+    return false;
+  int64_t size = 1;
+  for (auto& dim : initializer_tensor->dims()) {
+    size *= dim;
+  }
+  return size <= Small_Initializer_Threshold &&
+         graph.GetConsumerNodes(arg->Name()).size() == 1;
+}
+}  // namespace
+
+/**
+  Returns a list of nodes that are preferred on CPU.
+  They are commonly shape-related computation subgraphs.
+  @param graph Graph viewer
+  @param provider_type The target execution provider type
+  @param kernel_registries Kernel registries for the target EP
+  @param tentative_nodes Nodes that are tentatively placed on the target EP
+*/
+std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
+                                                  const std::string& provider_type,
+                                                  const std::vector<const KernelRegistry*>& kernel_registries,
+                                                  const std::vector<NodeIndex>& tentative_nodes) {
+  const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
+  std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
+  for (size_t id = 0; id < ordered_nodes.size(); ++id) {
+    const NodeIndex& node_id = ordered_nodes[id];
+    node_id_to_order_map[node_id] = id;
+  }
+
+  // If this returns false, n1 will be output first; if it returns true, n2 will be output first
+  auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
+    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
+  };
+
+  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
+  std::unordered_set<NodeIndex> visited;
+
+  std::unordered_set<const NodeArg*> cpu_output_args;
+  std::unordered_set<NodeIndex> provider_nodes;
+  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;
+
+  for (auto& node_id : tentative_nodes) {
+    provider_nodes.insert(node_id);
+    const Node* node = graph.GetNode(node_id);
+
+    const KernelCreateInfo* kernel_info = nullptr;
+    for (auto registry : kernel_registries) {
+      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
+      if (st.IsOK())
+        break;
+    }
+    // at least one registry has a target provider's kernel for this node
+    ORT_ENFORCE(kernel_info != nullptr);
+    node_to_kernel.insert({node_id, kernel_info});
+
+    // first, find all the direct consumers of cpu tensors.
+    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
+        node->OutputDefs(),
+        [&](const NodeArg& node_arg, size_t out_index) {
+          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
+            cpu_output_args.insert(&node_arg);
+            auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
+            for (auto& consumer_node : consumer_nodes) {
+              candidates.push(consumer_node->Index());
+              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
+            }
+          }
+          return Status::OK();
+        }));
+  }
+
+  const std::vector<const NodeArg*>& graph_inputs = graph.GetInputs();
+  std::unordered_set<NodeIndex> cpu_nodes;
+  // The algorithm below tries to identify a subgraph that only depends on cpu tensors.
+  // Usually it is a subgraph that does shape calculation based on a GPU tensor, then reshapes it back.
+  // In detail: for each candidate, if one of its inputs is a cpu tensor and the non-CPU kernel
+  // doesn't mark it as a cpu input, force the node to CPU to avoid a memory copy, and add its
+  // outputs to the set of cpu tensors.
+  while (!candidates.empty()) {
+    NodeIndex cur = candidates.top();
+    candidates.pop();
+    if (visited.count(cur) != 0)
+      continue;
+    visited.insert(cur);
+
+    if (provider_nodes.find(cur) == provider_nodes.end())
+      continue;
+
+    auto* node = graph.GetNode(cur);
+    bool place_in_cpu = true;
+    for (size_t i = 0; i < node->InputDefs().size(); ++i) {
+      auto* input = node->InputDefs()[i];
+
+      // skip placing on CPU if the data type is float16 or bfloat16
+      if (input->Type() == DataTypeUtils::ToType("float16") ||
+          input->Type() == DataTypeUtils::ToType("bfloat16")) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // allow placing on CPU if it's a small initializer or graph input
+      if (IsSmallInitializerWithSingleConsumer(graph, input) ||
+          std::find(graph_inputs.begin(), graph_inputs.end(), input) != graph_inputs.end()) {
+        continue;
+      }
+
+      // the input is not a CPU tensor
+      if (cpu_output_args.find(input) == cpu_output_args.end()) {
+        place_in_cpu = false;
+        break;
+      }
+
+      // the input is a CPU tensor, but it's intended to be consumed as a CPU input by the target EP
+      if (node_to_kernel[cur]->kernel_def->IsInputOnCpu(i)) {
+        place_in_cpu = false;
+        break;
+      }
+    }
+
+    if (place_in_cpu) {
+      cpu_nodes.insert(cur);
+      LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
+      for (auto* output : node->OutputDefs()) {
+        cpu_output_args.insert(output);
+      }
+      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
+        candidates.push((*it).Index());
+      }
+    }
+  }
+
+  return cpu_nodes;
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index cabd267a181d5..a77cbbbe03630 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -7,6 +7,7 @@
 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/fallback_cpu_capability.h"
 #include "core/framework/memcpy.h"
 #include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
@@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
 std::vector<std::unique_ptr<ComputeCapability>>
 CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                      const std::vector<const KernelRegistry*>& kernel_registries) const {
-  std::vector<std::unique_ptr<ComputeCapability>> result;
-  std::unordered_set<const NodeArg*> defs_outside_cuda;
-
+  std::vector<NodeIndex> candidates;
   for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
     const auto* p_node = graph.GetNode(node_index);
     if (p_node == nullptr)
@@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     const auto& node = *p_node;
     const KernelCreateInfo* cuda_kernel_def = nullptr;
     if (!node.GetExecutionProviderType().empty()) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
@@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     // none of the provided registries has a CUDA kernel for this node
     if (cuda_kernel_def == nullptr) {
-      // node is not in cuda exeuction provider if no kernel def found,
-      // or if other execution provider already assigned to it
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
     bool not_supported = false;
-    bool force_outside = false;
     bool force_inside = false;  // for some compute heavy ops, we'll force it to run inside CUDA
     if ("LSTM" == node.OpType()) {
       // the supported activations covers the bidirectional mode
@@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // cast is not compute heavy, and may be placed outside
     }
 
-//Below rule only works for inference, for training, we can't do constant folding.
-//We need find a better solution.
-//Temporary disable the check here, the cost is all the cast will be on GPU now.
-#ifndef ENABLE_TRAINING
-    if (!not_supported && !force_inside) {
-      // Note that nodes with only inputs from initializer would not be place on CUDA
-      // Ideally, those nodes should be eliminated in constant folding
-      bool should_force_outside = true;
-      bool all_inputs_are_initializers = true;
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
-                                               [&](const NodeArg& def, size_t index) {
-                                                 // The input is not a initializer and the input is from CPU
-                                                 // or the input declared as CPU memory and is from CPU
-                                                 // in that case we should still keep the node on CUDA
-                                                 bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
-                                                 bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-                                                 if ((!initializer_input && !input_is_on_cpu) ||
-                                                     (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
-                                                   should_force_outside = false;
-                                                 }
-
-                                                 if (!initializer_input) {
-                                                   all_inputs_are_initializers = false;
-                                                 }
-                                                 return Status::OK();
-                                               }));
-
-      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_inputs_are_initializers) {
-        force_outside = true;
-      }
-    }
-#endif
-    if (!force_inside && (not_supported || force_outside)) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
+    if (!force_inside && not_supported) {
       if (not_supported) {
         LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
-      } else if (force_outside) {
-        LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
       }
     } else {
-      // for nodes placed on CUDA, check if its output is on CPU
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(
-          node.OutputDefs(),
-          [&](const NodeArg& def, size_t out_index) {
-            if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
-              defs_outside_cuda.insert(&def);
-            return Status::OK();
-          }));
-      std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
-      sub_graph->nodes.push_back(node.Index());
-      result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+      candidates.push_back(node.Index());
     }
   }
+
+  // For CUDA EP, exclude the subgraphs that are preferred to be placed on CPU.
+  // These are usually shape-related computation subgraphs.
+  // The following logic can be extended to other EPs.
+  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
+
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+  for (auto& node_index : candidates) {
+    if (cpu_nodes.count(node_index) > 0)
+      continue;
+
+    std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
+    sub_graph->nodes.push_back(node_index);
+    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+  }
   return result;
 }
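
Note (not part of the diff): the comment in `GetCapability` above says the fallback logic can be extended to other EPs. The sketch below illustrates how a hypothetical provider could reuse `GetCpuPreferedNodes` in the same way the CUDA EP now does. `MyExecutionProvider` and its kernel lookup are placeholders; the helper call and the `ComputeCapability` construction simply mirror the changes in this patch.

```cpp
// Sketch only: "MyExecutionProvider" is a hypothetical IExecutionProvider subclass
// used for illustration; the flow mirrors the CUDA EP changes in this diff.
#include "core/framework/compute_capability.h"
#include "core/framework/fallback_cpu_capability.h"
#include "core/framework/kernel_registry.h"

namespace onnxruntime {

std::vector<std::unique_ptr<ComputeCapability>>
MyExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   const std::vector<const KernelRegistry*>& kernel_registries) const {
  // 1. Collect nodes this EP could tentatively take: unassigned nodes with a registered kernel.
  std::vector<NodeIndex> candidates;
  for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
    const auto* p_node = graph.GetNode(node_index);
    if (p_node == nullptr || !p_node->GetExecutionProviderType().empty())
      continue;

    const KernelCreateInfo* kernel_info = nullptr;
    for (auto registry : kernel_registries) {
      if (registry->TryFindKernel(*p_node, Type(), &kernel_info).IsOK())
        break;
    }
    if (kernel_info != nullptr)
      candidates.push_back(node_index);
  }

  // 2. Let the shared heuristic pick the shape-computation subgraphs that should stay on CPU.
  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);

  // 3. Claim everything else, one node per ComputeCapability, as the CUDA EP does.
  std::vector<std::unique_ptr<ComputeCapability>> result;
  for (auto& node_index : candidates) {
    if (cpu_nodes.count(node_index) > 0)
      continue;
    auto sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
    sub_graph->nodes.push_back(node_index);
    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
  }
  return result;
}

}  // namespace onnxruntime
```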