From cee693c52427e42b9dd26fb9f690b7ac52bb1d4c Mon Sep 17 00:00:00 2001 From: HailoRT-Automation <98901220+HailoRT-Automation@users.noreply.github.com> Date: Wed, 28 Sep 2022 20:15:52 +0300 Subject: [PATCH] v4.10.0 v4.10.0 --- .github/workflows/publish-python-apidocs.yml | 3 +- .gitignore | 1 + CODEOWNERS | 27 +- cgmanifests/cgmanifest.json | 2 +- cmake/external/tvm.cmake | 2 +- cmake/onnxruntime.cmake | 9 +- cmake/onnxruntime_common.cmake | 2 +- cmake/onnxruntime_flatbuffers.cmake | 13 - cmake/onnxruntime_providers.cmake | 2 +- .../abseil/Fix_Nvidia_Build_Break.patch | 41 ++ cmake/store_toolchain.cmake | 5 - cmake/target_delayload.cmake | 7 +- cmake/winml_unittests.cmake | 2 +- .../runtest.sh | 2 +- docs/python/inference/api_summary.rst | 192 ++++--- hailo/README.md | 19 +- include/onnxruntime/core/graph/graph.h | 63 +-- .../core/providers/tvm/tvm_provider_factory.h | 2 +- onnxruntime/contrib_ops/cpu/signal/dft.cc | 156 +++-- onnxruntime/contrib_ops/cpu/signal/dft.h | 4 + onnxruntime/core/framework/kernel_registry.cc | 1 + onnxruntime/core/graph/graph.cc | 114 +--- onnxruntime/core/graph/node_attr_utils.cc | 81 +++ onnxruntime/core/graph/node_attr_utils.h | 51 ++ .../core/graph/signal_ops/signal_defs.cc | 290 +++++++--- .../core/optimizer/bias_dropout_fusion.cc | 4 +- .../core/optimizer/conv_activation_fusion.cc | 25 +- .../core/optimizer/embed_layer_norm_fusion.cc | 8 +- .../core/optimizer/gemm_activation_fusion.cc | 2 +- .../core/optimizer/layer_norm_fusion.cc | 4 +- .../core/optimizer/skip_layer_norm_fusion.cc | 2 +- .../transpose_optimizer/optimizer_api_impl.cc | 2 +- onnxruntime/core/platform/posix/ort_mutex.cc | 1 - onnxruntime/core/platform/windows/env.cc | 1 - .../core/providers/cuda/tensor/transpose.cc | 2 +- .../providers/cuda/tensor/transpose_impl.cu | 102 ++-- .../providers/cuda/tensor/transpose_impl.h | 2 +- .../src/GraphTransformer.cpp | 2 +- .../providers/hailo/hailo_global_vdevice.cc | 49 -- .../providers/hailo/hailo_global_vdevice.h | 32 -- onnxruntime/core/providers/hailo/hailo_op.cc | 13 +- .../core/providers/tvm/custom_logging.cc | 30 +- .../core/providers/tvm/tvm_allocator.cc | 4 +- .../core/providers/tvm/tvm_allocator.h | 18 +- onnxruntime/core/providers/tvm/tvm_api.cc | 47 +- onnxruntime/core/providers/tvm/tvm_api.h | 40 +- onnxruntime/core/providers/tvm/tvm_common.h | 12 +- .../core/providers/tvm/tvm_compiler.cc | 38 ++ onnxruntime/core/providers/tvm/tvm_compiler.h | 40 ++ onnxruntime/core/providers/tvm/tvm_defaults.h | 2 +- .../core/providers/tvm/tvm_ep_options.cc | 261 +++++++++ ...ution_provider_info.h => tvm_ep_options.h} | 39 +- .../providers/tvm/tvm_execution_provider.cc | 533 +++++------------- .../providers/tvm/tvm_execution_provider.h | 53 +- .../tvm/tvm_execution_provider_info.cc | 111 ---- .../providers/tvm/tvm_provider_factory.cc | 25 +- onnxruntime/core/providers/tvm/tvm_runner.cc | 26 + onnxruntime/core/providers/tvm/tvm_runner.h | 35 ++ .../core/providers/tvm/tvm_runner_impl.cc | 165 ++++++ .../core/providers/tvm/tvm_runner_impl.h | 104 ++++ onnxruntime/core/providers/tvm/tvm_utils.h | 5 +- .../core/providers/tvm/xpu_data_transfer.cc | 20 +- .../core/providers/tvm/xpu_data_transfer.h | 10 +- .../python/tools/microbench/attention.py | 57 ++ .../python/tools/microbench/benchmark.py | 81 ++- onnxruntime/python/tools/microbench/cast.py | 75 +++ .../python/tools/microbench/fast_gelu.py | 5 + onnxruntime/python/tools/microbench/matmul.py | 5 + .../tools/microbench/skip_layer_norm.py | 59 ++ .../python/tools/quantization/calibrate.py | 2 +- 
.../python/tools/symbolic_shape_infer.py | 27 +- .../python/tools/transformers/benchmark.py | 24 +- .../tools/transformers/bert_perf_test.py | 68 ++- .../python/tools/transformers/float16.py | 6 + .../transformers/gpt2_beamsearch_helper.py | 7 +- .../python/tools/transformers/gpt2_helper.py | 3 +- .../longformer/convert_longformer_to_onnx.py | 7 +- .../transformers/models/t5/past_helper.py | 1 - .../transformers/models/t5/t5_decoder.py | 11 +- .../transformers/models/t5/t5_encoder.py | 12 +- .../models/t5/t5_encoder_decoder_init.py | 10 +- .../tools/transformers/models/t5/t5_helper.py | 1 - .../tools/transformers/onnx_exporter.py | 21 +- .../transformers/torch_onnx_export_helper.py | 68 +++ .../test/framework/inference_session_test.cc | 18 +- .../test/framework/shape_inference_test.cc | 13 +- onnxruntime/test/ir/graph_test.cc | 14 +- onnxruntime/test/optimizer/qdq_test_utils.cc | 31 + onnxruntime/test/optimizer/qdq_test_utils.h | 9 +- .../test/perftest/performance_runner.cc | 5 +- .../test/providers/cpu/controlflow/if_test.cc | 4 +- .../providers/cpu/controlflow/loop_test.cc | 2 +- .../test/providers/nnapi/nnapi_basic_test.cc | 91 ++- .../test/providers/provider_test_utils.h | 36 +- .../providers/tensorrt/tensorrt_basic_test.cc | 24 +- .../test/python/onnxruntime_test_python.py | 27 + ...untime_test_python_symbolic_shape_infer.py | 10 +- .../test_data/models/gpt2_megatron_opt.onnx | Bin 13404 -> 8901 bytes onnxruntime/test/util/include/test_utils.h | 12 +- onnxruntime/test/util/test_utils.cc | 5 +- .../orttraining/eager/opgen/opgen/atenops.py | 13 +- .../orttraining/eager/ort_eager_common.h | 3 +- .../orttraining/eager/test/ort_eps_test.py | 13 +- .../mnist_reader/mnist_reader_common.hpp | 3 +- .../ortmodule/_custom_op_symbolic_registry.py | 20 +- .../ortmodule/_graph_execution_manager.py | 7 + .../training/ortmodule/_training_manager.py | 5 - .../python/training/ortmodule/_utils.py | 18 + .../torch_interop_utils.cc | 21 +- .../test/gradient/allreduce_op_test.cc | 47 +- .../test/gradient/gradient_checker.cc | 2 +- .../orttraining/test/python/_test_helpers.py | 98 ++-- .../python/orttraining_test_ortmodule_api.py | 53 +- .../orttraining_test_ortmodule_autograd.py | 55 ++ .../training_ops/function_op_test_utils.cc | 2 +- ...ndroid-x86_64-crosscompile-ci-pipeline.yml | 280 ++++++++- .../azure-pipelines/linux-ci-pipeline.yml | 75 +-- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- .../orttraining-linux-ci-pipeline.yml | 30 +- .../orttraining-linux-external-custom-ops.yml | 2 +- .../py-packaging-selectable-stage.yml | 2 +- .../templates/py-packaging-stage.yml | 4 +- .../azure-pipelines/templates/win-cpu-ci.yml | 4 +- .../azure-pipelines/templates/win-gpu-ci.yml | 4 +- .../docker/scripts/manylinux/requirements.txt | 1 - tools/python/PythonTools.md | 2 +- tools/python/onnx_test_data_utils.py | 25 +- .../test/api/LearningModelSessionAPITest.cpp | 150 +++-- 128 files changed, 3134 insertions(+), 1618 deletions(-) delete mode 100644 cmake/store_toolchain.cmake create mode 100644 onnxruntime/core/graph/node_attr_utils.cc create mode 100644 onnxruntime/core/graph/node_attr_utils.h delete mode 100644 onnxruntime/core/providers/hailo/hailo_global_vdevice.cc delete mode 100644 onnxruntime/core/providers/hailo/hailo_global_vdevice.h create mode 100644 onnxruntime/core/providers/tvm/tvm_compiler.cc create mode 100644 onnxruntime/core/providers/tvm/tvm_compiler.h create mode 100644 onnxruntime/core/providers/tvm/tvm_ep_options.cc rename onnxruntime/core/providers/tvm/{tvm_execution_provider_info.h => 
tvm_ep_options.h} (52%) delete mode 100644 onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc create mode 100644 onnxruntime/core/providers/tvm/tvm_runner.cc create mode 100644 onnxruntime/core/providers/tvm/tvm_runner.h create mode 100644 onnxruntime/core/providers/tvm/tvm_runner_impl.cc create mode 100644 onnxruntime/core/providers/tvm/tvm_runner_impl.h create mode 100644 onnxruntime/python/tools/microbench/attention.py create mode 100644 onnxruntime/python/tools/microbench/cast.py create mode 100644 onnxruntime/python/tools/microbench/skip_layer_norm.py create mode 100644 onnxruntime/python/tools/transformers/torch_onnx_export_helper.py diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index b2da6ed5f19e1..ba7c4eaf58b9e 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -27,14 +27,13 @@ jobs: - name: Set vars id: vars run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - - name: Check outputs - run: echo ${{ steps.vars.outputs.sha_short }} - uses: actions/checkout@v2 with: ref: gh-pages clean: false - name: Move API docs into target area run: | + ls docs/api rm -rf docs/api/python mv build/docs/inference/html docs/api/python - name: Create Pull Request diff --git a/.gitignore b/.gitignore index 7c3f4b8ecf5ff..d27dedbc2b7ce 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ onnxruntime_profile*.json /csharp/**/bin/ /csharp/Directory.Build.props docs/python/inference/*.onnx +*.onnx onnxprofile_profile_test_*.json /csharp/packages /csharp/src/Microsoft.ML.OnnxRuntime/targets/**/*.targets diff --git a/CODEOWNERS b/CODEOWNERS index 1b331ce95a66c..43a1eda4115d5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,20 +1,21 @@ # Python frontend owners -orttraining/*.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/orttraining/python/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/orttraining/test/python/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/pytorch_frontend_examples/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/python/training/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_ort_trainer.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_training_unittest_utils.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -samples/python/training/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre +orttraining/*.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/orttraining/python/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/orttraining/test/python/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/pytorch_frontend_examples/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/python/training/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_ort_trainer.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre 
+onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_training_unittest_utils.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +samples/python/training/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre # Mobile -/onnxruntime/test/testdata/kernel_def_hashes/ @skottmckay @gwang-msft @YUNQIUGUO @edgchen1 -/onnxruntime/core/framework/kernel_def_hash_helpers.* @skottmckay @gwang-msft @YUNQIUGUO @edgchen1 +/onnxruntime/test/testdata/kernel_def_hashes/ @skottmckay @YUNQIUGUO @edgchen1 +/onnxruntime/core/framework/kernel_def_hash_helpers.* @skottmckay @YUNQIUGUO @edgchen1 # Contrib Ops onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn onnxruntime/core/graph/contrib_ops/nchwc_schema_defs.cc @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn -onnxruntime/core/graph/contrib_ops/quantization_defs.* @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn \ No newline at end of file +onnxruntime/core/graph/contrib_ops/quantization_defs.* @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn +onnxruntime/core/mlas/** @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn \ No newline at end of file diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index e043e07ea13b4..379ff6921c568 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -46,7 +46,7 @@ "component": { "type": "git", "git": { - "commitHash": "d721d320bd2f66d342d24b71600fe1f5e222e952", + "commitHash": "ffd5f70370642c909222f9a4cae8400023dacbdc", "repositoryUrl": "https://github.com/apache/tvm.git" }, "comments": "needed for TVM EP" diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake index c1ee5fdde51b7..3f425a0938e2f 100644 --- a/cmake/external/tvm.cmake +++ b/cmake/external/tvm.cmake @@ -4,7 +4,7 @@ if (onnxruntime_USE_TVM) FetchContent_Declare( tvm GIT_REPOSITORY https://github.com/apache/tvm.git - GIT_TAG d721d320bd2f66d342d24b71600fe1f5e222e952 + GIT_TAG ffd5f70370642c909222f9a4cae8400023dacbdc ) FetchContent_GetProperties(tvm) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 64068a03a09f1..2331b21de7480 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -213,8 +213,13 @@ install(TARGETS onnxruntime set_target_properties(onnxruntime PROPERTIES FOLDER "ONNXRuntime") -if (WINDOWS_STORE) - target_link_options(onnxruntime PRIVATE /DELAYLOAD:api-ms-win-core-libraryloader-l1-2-1.dll) +if (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) + # Workaround STL bug https://github.com/microsoft/STL/issues/434#issuecomment-921321254 + # Note that the workaround makes std::system_error crash before Windows 10 + + # The linker warns "LNK4199: /DELAYLOAD:api-ms-win-core-heapl2-1-0.dll ignored; no imports found from api-ms-win-core-heapl2-1-0.dll" + # when you're not using imports directly, even though the import exists in the STL and the DLL would have been linked without DELAYLOAD + target_link_options(onnxruntime PRIVATE /DELAYLOAD:api-ms-win-core-heapl2-1-0.dll /ignore:4199) endif() if (winml_is_inbox) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 9590050dfe9d2..d32db62ca56d0 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -206,7 +206,7 @@ endif() if (ARM64 OR ARM OR X86 OR X64 OR X86_64) - 
if(WINDOWS_STORE OR (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC)) + if((WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC)) # msvc compiler report syntax error with cpuinfo arm source files # and cpuinfo does not have code for getting arm uarch info under windows else() diff --git a/cmake/onnxruntime_flatbuffers.cmake b/cmake/onnxruntime_flatbuffers.cmake index bcb196bcd8cd9..49302e92f5a66 100644 --- a/cmake/onnxruntime_flatbuffers.cmake +++ b/cmake/onnxruntime_flatbuffers.cmake @@ -21,16 +21,3 @@ set_target_properties(onnxruntime_flatbuffers PROPERTIES FOLDER "ONNXRuntime") if (FLATBUFFERS_BUILD_FLATC) add_dependencies(onnxruntime_flatbuffers flatc) endif() - -if (WINDOWS_STORE) - function(target_force_include target scope file) - if (MSVC) - target_compile_options(${target} ${scope} "/FI${file}") - else() - target_compile_options(${target} ${scope} -include "${file}") - endif() - endfunction() - - target_force_include(flatbuffers PRIVATE uwp_stubs.h) - target_force_include(flatc PRIVATE uwp_stubs.h) -endif() diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index b6553de172d24..0ff23ad9507fa 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1318,7 +1318,7 @@ if (onnxruntime_USE_HAILO) "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - find_package(HailoRT 4.8.1 EXACT REQUIRED) + find_package(HailoRT 4.10.0 EXACT REQUIRED) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_hailo_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_hailo ${onnxruntime_providers_hailo_cc_srcs}) diff --git a/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch b/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch index d481a14a5544f..f8e4750cbccff 100644 --- a/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch +++ b/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch @@ -20,3 +20,44 @@ index 1d7d6cd..f6a7a78 100644 {allocated_storage_view.data, allocated_storage_view.capacity}); } +diff --git a/absl/copts/GENERATED_AbseilCopts.cmake b/absl/copts/GENERATED_AbseilCopts.cmake +index a4ab1aa..dfd13fd 100644 +--- a/absl/copts/GENERATED_AbseilCopts.cmake ++++ b/absl/copts/GENERATED_AbseilCopts.cmake +@@ -129,8 +129,6 @@ list(APPEND ABSL_MSVC_FLAGS + "/wd4005" + "/wd4068" + "/wd4180" +- "/wd4244" +- "/wd4267" + "/wd4503" + "/wd4800" + ) +diff --git a/absl/copts/GENERATED_copts.bzl b/absl/copts/GENERATED_copts.bzl +index a6efc98..8c4de8e 100644 +--- a/absl/copts/GENERATED_copts.bzl ++++ b/absl/copts/GENERATED_copts.bzl +@@ -130,8 +130,6 @@ ABSL_MSVC_FLAGS = [ + "/wd4005", + "/wd4068", + "/wd4180", +- "/wd4244", +- "/wd4267", + "/wd4503", + "/wd4800", + ] +diff --git a/absl/copts/copts.py b/absl/copts/copts.py +index 0d6c1ec..75fd935 100644 +--- a/absl/copts/copts.py ++++ b/absl/copts/copts.py +@@ -132,10 +132,6 @@ COPT_VARS = { + "/wd4068", # unknown pragma + # qualifier applied to function type has no meaning; ignored + "/wd4180", +- # conversion from 'type1' to 'type2', possible loss of data +- "/wd4244", +- # conversion from 'size_t' to 'type', possible loss of data +- "/wd4267", + # The decorated name was longer than the compiler limit + "/wd4503", + # forcing value to bool 'true' or 'false' (performance warning) diff --git a/cmake/store_toolchain.cmake b/cmake/store_toolchain.cmake deleted file mode 100644 index ebdb88da7752c..0000000000000 --- a/cmake/store_toolchain.cmake +++ /dev/null @@ -1,5 +0,0 @@ 
-set(CMAKE_SYSTEM_NAME WindowsStore) -set(CMAKE_SYSTEM_VERSION 10.0) -if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR) - set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}) -endif() diff --git a/cmake/target_delayload.cmake b/cmake/target_delayload.cmake index c776b2529a2b9..53f252a3e71ac 100644 --- a/cmake/target_delayload.cmake +++ b/cmake/target_delayload.cmake @@ -9,9 +9,6 @@ function(target_delayload target_name) foreach(lib ${ARGN}) target_link_options(${target_name} PRIVATE /DELAYLOAD:"${lib}") endforeach() - if (WINDOWS_STORE) - target_link_libraries(${target_name} PRIVATE dloadhelper.lib) - else() - target_link_libraries(${target_name} PRIVATE delayimp.lib) - endif() + + target_link_libraries(${target_name} PRIVATE delayimp.lib) endfunction() diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index f7da46124b879..6e14591224886 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -190,7 +190,7 @@ set_winml_target_properties(winml_google_test_lib) set_winml_target_properties(winml_test_common) get_winml_test_api_src(${WINML_TEST_SRC_DIR} winml_test_api_src) -if (NOT WINDOWS_STORE AND NOT ${winml_is_inbox}) +if (NOT ${winml_is_inbox}) get_winml_test_api_redist_only_src(${WINML_TEST_SRC_DIR} winml_test_api_redist_only_src) endif() diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh index 586f8986086c8..28d92b73530c4 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh @@ -19,7 +19,7 @@ echo "Current NuGet package version is $CurrentOnnxRuntimeVersion" if [ $RunTestCsharp = "true" ]; then if [[ $IsMacOS == "True" || $IsMacOS == "true" ]]; then mkdir -p $BUILD_BINARIESDIRECTORY/models - ln -s $BUILD_SOURCESDIRECTORY/cmake/external/onnx/onnx/backend/test/data/node $BUILD_BINARIESDIRECTORY/models/opset14 + ln -s $BUILD_SOURCESDIRECTORY/cmake/external/onnx/onnx/backend/test/data/node $BUILD_BINARIESDIRECTORY/models/opset16 fi # Run C# tests dotnet restore $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj -s $LocalNuGetRepo -s https://api.nuget.org/v3/index.json diff --git a/docs/python/inference/api_summary.rst b/docs/python/inference/api_summary.rst index 0325cdd47fa97..12d17bafd3ee9 100644 --- a/docs/python/inference/api_summary.rst +++ b/docs/python/inference/api_summary.rst @@ -1,65 +1,107 @@ -=========== -API Summary -=========== - -Summary of public functions and classes exposed -in *ONNX Runtime*. +=== +API +=== .. contents:: :local: -OrtValue -========= +API Overview +============ -*ONNX Runtime* works with native Python data structures which are mapped into ONNX data formats : -Numpy arrays (tensors), dictionaries (maps), and a list of Numpy arrays (sequences). -The data backing these are on CPU. +*ONNX Runtime* loads and runs inference on a model in ONNX graph format, or ORT format (for memory and disk constrained environments). -*ONNX Runtime* supports a custom data structure that supports all ONNX data formats that allows users -to place the data backing these on a device, for example, on a CUDA supported device. This allows for -interesting *IOBinding* scenarios (discussed below). In addition, *ONNX Runtime* supports directly -working with *OrtValue* (s) while inferencing a model if provided as part of the input feed. 
+The data consumed and produced by the model can be specified and accessed in the way that best matches your scenario. + +Load and run a model +-------------------- -Below is an example showing creation of an *OrtValue* from a Numpy array while placing its backing memory -on a CUDA device: +InferenceSession is the main class of ONNX Runtime. It is used to load and run an ONNX model, +as well as specify environment and application configuration options. .. code-block:: python - # X is numpy array on cpu, create an OrtValue and place it on cuda device id = 0 - ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - ortvalue.device_name() # 'cuda' - ortvalue.shape() # shape of the numpy array X - ortvalue.data_type() # 'tensor(float)' - ortvalue.is_tensor() # 'True' + session = onnxruntime.InferenceSession('model.onnx') + + outputs = session.run([output names], inputs) + +ONNX and ORT format models consist of a graph of computations, modeled as operators, +and implemented as optimized operator kernels for different hardware targets. +ONNX Runtime orchestrates the execution of operator kernels via `execution providers`. +An execution provider contains the set of kernels for a specific execution target (CPU, GPU, IoT etc). +Execution provides are configured using the `providers` parameter. Kernels from different execution +providers are chosen in the priority order given in the list of providers. In the example below +if there is a kernel in the CUDA execution provider ONNX Runtime executes that on GPU. If not +the kernel is executed on CPU. + +.. code-block:: python + + session = onnxruntime.InferenceSession(model, + providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + +The list of available execution providers can be found here: `Execution Providers `_. + +Since ONNX Runtime 1.10, you must explicitly specify the execution provider for your target. +Running on CPU is the only time the API allows no explicit setting of the `provider` parameter. +In the examples that follow, the `CUDAExecutionProvider` and `CPUExecutionProvider` are used, assuming the application is running on NVIDIA GPUs. +Replace these with the execution provider specific to your environment. + +You can supply other session configurations via the `session options` parameter. For example, to enable +profiling on the session: + +.. code-block:: python + + options = onnxruntime.SessionOptions() + options.enable_profiling=True + session = onnxruntime.InferenceSession('model.onnx', sess_options=options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) + + +Data inputs and outputs +----------------------- + +The ONNX Runtime Inference Session consumes and produces data using its OrtValue class. + +Data on CPU +^^^^^^^^^^^ + +On CPU (the default), OrtValues can be mapped to and from native Python data structures: numpy arrays, dictionaries and lists of +numpy arrays. + +.. 
code-block:: python + + # X is numpy array on cpu + ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X) + ortvalue.device_name() # 'cpu' + ortvalue.shape() # shape of the numpy array X + ortvalue.data_type() # 'tensor(float)' + ortvalue.is_tensor() # 'True' np.array_equal(ortvalue.numpy(), X) # 'True' # ortvalue can be provided as part of the input feed to a model - ses = onnxruntime.InferenceSession('model.onnx') - res = sess.run(["Y"], {"X": ortvalue}) + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + results = session.run(["Y"], {"X": ortvalue}) -IOBinding -========= +By default, *ONNX Runtime* always places input(s) and output(s) on CPU. Having the data on CPU +may not be optimal if the input or output is consumed and produced on a device +other than CPU because it introduces data copy between CPU and the device. -By default, *ONNX Runtime* always places input(s) and output(s) on CPU, which -is not optimal if the input or output is consumed and produced on a device -other than CPU because it introduces data copy between CPU and the device. -*ONNX Runtime* provides a feature, *IO Binding*, which addresses this issue by -enabling users to specify which device to place input(s) and output(s) on. -Here are scenarios to use this feature. -(In the following code snippets, *model.onnx* is the model to execute, -*X* is the input data to feed, and *Y* is the output data.) +Data on device +^^^^^^^^^^^^^^ + +*ONNX Runtime* supports a custom data structure that supports all ONNX data formats that allows users +to place the data backing these on a device, for example, on a CUDA supported device. In ONNX Runtime, +this is called `IOBinding`. -Scenario 1: +To use the `IOBinding` feature, replace `InferenceSession.run()` with `InferenceSession.run_with_iobinding()`. A graph is executed on a device other than CPU, for instance CUDA. Users can -use IOBinding to put input on CUDA as the follows. +use IOBinding to copy the data onto the GPU. .. code-block:: python # X is numpy array on cpu - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) io_binding = session.io_binding() # OnnxRuntime will copy the data over to the CUDA device if 'input' is consumed by nodes on the CUDA device io_binding.bind_cpu_input('input', X) @@ -67,37 +109,32 @@ use IOBinding to put input on CUDA as the follows. session.run_with_iobinding(io_binding) Y = io_binding.copy_outputs_to_cpu()[0] -Scenario 2: - The input data is on a device, users directly use the input. The output data is on CPU. .. code-block:: python # X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) io_binding.bind_output('output') session.run_with_iobinding(io_binding) Y = io_binding.copy_outputs_to_cpu()[0] -Scenario 3: - -The input data and output data are both on a device, users directly use the input and also place output on the device. +The input data and output data are both on a device, users directly use the input and also place output on the device. ..
code-block:: python #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) Y_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, 'cuda', 0) # Change the shape to the actual shape of the output being bound - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) io_binding.bind_output(name='output', device_type=Y_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=Y_ortvalue.shape(), buffer_ptr=Y_ortvalue.data_ptr()) session.run_with_iobinding(io_binding) -Scenario 4: Users can request *ONNX Runtime* to allocate an output on a device. This is particularly useful for dynamic shaped outputs. Users can use the *get_outputs()* API to get access to the *OrtValue* (s) corresponding to the allocated output(s). @@ -107,7 +144,7 @@ Users can thus consume the *ONNX Runtime* allocated memory for the output as an #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) #Request ONNX Runtime to bind and allocate memory on CUDA for 'output' @@ -117,7 +154,7 @@ Users can thus consume the *ONNX Runtime* allocated memory for the output as an ort_output = io_binding.get_outputs()[0] -Scenario 5: +In addition, *ONNX Runtime* supports directly working with *OrtValue* (s) while inferencing a model if provided as part of the input feed. Users can bind *OrtValue* (s) directly. @@ -127,39 +164,52 @@ Users can bind *OrtValue* (s) directly. #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) Y_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, 'cuda', 0) # Change the shape to the actual shape of the output being bound - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_ortvalue_input('input', X_ortvalue) io_binding.bind_ortvalue_output('output', Y_ortvalue) session.run_with_iobinding(io_binding) -Device -====== - -The package is compiled for a specific device, GPU or CPU. -The CPU implementation includes optimizations -such as MKL (Math Kernel Libary). The following function -indicates the chosen option: -.. autofunction:: onnxruntime.get_device +You can also bind inputs and outputs directly to a PyTorch tensor. -Examples and datasets -===================== - -The package contains a few models stored in ONNX format -used in the documentation. These don't need to be downloaded -as they are installed with the package. - -.. autofunction:: onnxruntime.datasets.get_example +.. 
code-block:: python -Load and run a model -==================== + # X is a PyTorch tensor on device + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) + binding = session.io_binding() + + X_tensor = X.contiguous() + + binding.bind_input( + name='X', + device_type='cuda', + device_id=0, + element_type=np.float32, + shape=tuple(x_tensor.shape), + buffer_ptr=x_tensor.data_ptr(), + ) + + ## Allocate the PyTorch tensor for the model output + Y_shape = ... # You need to specify the output PyTorch tensor shape + Y_tensor = torch.empty(Y_shape, dtype=torch.float32, device='cuda:0').contiguous() + binding.bind_output( + name='Y', + device_type='cuda', + device_id=0, + element_type=np.float32, + shape=tuple(Y_tensor.shape), + buffer_ptr=Y_tensor.data_ptr(), + ) + + session.run_with_iobinding(binding) + + +API Details +=========== -*ONNX Runtime* reads a model saved in ONNX format. -The main class *InferenceSession* wraps these functionalities -in a single place. -Main class +InferenceSession ---------- .. autoclass:: onnxruntime.InferenceSession diff --git a/hailo/README.md b/hailo/README.md index 2dd300985886c..e12b9c28e6b5a 100644 --- a/hailo/README.md +++ b/hailo/README.md @@ -5,11 +5,11 @@ Hailo ONNX Runtime integrates ONNX Runtime with HailoRT to enable Hailo-EP, prov * ONNX Runtime version 1.11.1 with Python 3.7 and above # Prerequisites -* HailoRT v4.8.1 +* HailoRT v4.10.0 # Build Instructions To build ONNXRuntime with HailoRT please follow the following steps: -* Clone ONNXRuntime-Hailo from github. +* Clone ONNXRuntime-Hailo from github * Compile ONNXRuntime with Hailo using the following command: ``` ./build.sh --use_hailo --parallel --skip_tests --enable_pybind --build_wheel --config Release @@ -17,8 +17,8 @@ To build ONNXRuntime with HailoRT please follow the following steps: # Run ONNX Runtime with HailoRT To run your ONNX model on ONNXRuntime with Hailo execution provider, follow the following steps: -1. Convert your ONNX model with DFC tool - see [Model Compilation](https://hailo.ai/developer-zone/documentation/dataflow-compiler/latest/?sp_referrer=compilation.html#for-inference-using-onnx-runtime). -2. Create the ONNXRuntime session with `"HailoExecutionProvider"` in the execution providers list, and run the ONNX model. +1. Convert your ONNX model with DFC tool - see [Model Compilation](https://hailo.ai/developer-zone/documentation/dataflow-compiler/latest/?sp_referrer=compilation.html#for-inference-using-onnx-runtime) +2. Create the ONNXRuntime session with `"HailoExecutionProvider"` in the execution providers list, and run the ONNX model ## Examples: * C++ @@ -27,9 +27,14 @@ To run your ONNX model on ONNXRuntime with Hailo execution provider, follow the The ONNX models used in these tests are located in [testdata/hailo directory](./../onnxruntime/test/testdata/hailo/). To run the tests, do the following: - 1. Compile onnxruntime with Hailo. - 2. Go to `build/Linux/Release/`. - 3. Run a test with the name `Test_Name`: `./onnxruntime_test_all --gtest_filter=HailoCustomOpTest.Test_Name`. + 1. Compile onnxruntime with Hailo + 2. Go to `build/Linux/Release/` + 3. Run a test with the name `Test_Name`: `./onnxruntime_test_all --gtest_filter=HailoCustomOpTest.Test_Name` * Python The example `hailo/examples/hailo_example.py` contains a basic inference example using onnxruntime with Hailo-EP. + The ONNX model used in this example is located in [hailo/examples/](./../examples/). + To run the example, do the following: + 1. 
Compile onnxruntime with Hailo + 2. Go to `build/Linux/Release/dist/` and install the Python wheel (for example: `pip install onnxruntime-1.11.0-cp36-cp36m-linux_x86_64.whl`) + 3. Go to `hailo/examples` and run `python hailo_example.py` diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index a389c407fc264..bf4f99571f674 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -342,52 +342,49 @@ class Node { /** Gets the number of output edges from this Node */ size_t GetOutputEdgesCount() const noexcept { return relationships_.output_edges.size(); } - /** Add an attribute to this Node with specified attribute name and value. */ - void AddAttribute(std::string attr_name, const ONNX_NAMESPACE::AttributeProto& value); - void AddAttribute(std::string attr_name, ONNX_NAMESPACE::AttributeProto&& value); + /** Adds an AttributeProto to this Node. + @remarks The attribute name is used as the key in the attribute map. */ + void AddAttributeProto(ONNX_NAMESPACE::AttributeProto value); -#define ADD_ATTR_INTERFACES(TypeName) \ - void AddAttribute(std::string attr_name, const TypeName& value); \ - void AddAttribute(std::string attr_name, \ - gsl::span values); + // keep this signature in sync with ADD_ATTR_SINGLE_INTERFACE below + /** Adds an attribute to this Node with the specified attribute name and value. */ + void AddAttribute(std::string attr_name, int64_t value); -#define ADD_ATTR_MOVE_INTERFACE(TypeName) \ - void AddAttribute(std::string attr_name, TypeName&& value); + // keep this signature in sync with ADD_ATTR_LIST_INTERFACE below + /** Adds an attribute to this Node with the specified attribute name and values. */ + void AddAttribute(std::string attr_name, gsl::span values); - void AddAttribute(std::string attr_name, std::string value); - void AddAttribute(std::string attr_name, gsl::span values); +#define ADD_ATTR_SINGLE_INTERFACE(Type) \ + void AddAttribute(std::string attr_name, Type value) - ADD_ATTR_INTERFACES(int64_t) - ADD_ATTR_INTERFACES(float) - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TensorProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::TensorProto) +#define ADD_ATTR_LIST_INTERFACE(Type) \ + void AddAttribute(std::string attr_name, gsl::span values) + +#define ADD_ATTR_INTERFACES(Type) \ + ADD_ATTR_SINGLE_INTERFACE(Type); \ + ADD_ATTR_LIST_INTERFACE(Type) + + ADD_ATTR_INTERFACES(float); + ADD_ATTR_INTERFACES(std::string); + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TensorProto); #if !defined(DISABLE_SPARSE_TENSORS) - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::SparseTensorProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::SparseTensorProto) + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::SparseTensorProto); #endif - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TypeProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::TypeProto) + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TypeProto); - void AddAttribute(std::string attr_name, const ONNX_NAMESPACE::GraphProto& value); - void AddAttribute(std::string attr_name, ONNX_NAMESPACE::GraphProto&& value); + ADD_ATTR_SINGLE_INTERFACE(ONNX_NAMESPACE::GraphProto); - // The below overloads are made so the compiler does not attempt to resolve - // C-strings with a gsl::span overloads +#undef ADD_ATTR_SINGLE_INTERFACE +#undef ADD_ATTR_LIST_INTERFACE +#undef ADD_ATTR_INTERFACES + + // The below overload is made so the compiler does not attempt to resolve + // string literals with the gsl::span overload template void AddAttribute(std::string attr_name, const char (&value)[N]) { 
this->AddAttribute(std::move(attr_name), std::string(value, N - 1)); } - template - void AddAttribute(const char (&attr_name)[M], const char (&value)[N]) { - this->AddAttribute(std::string(attr_name, M - 1), std::string(value, N - 1)); - } - - template - void AddAttribute(const char (&attr_name)[M], T&& value) { - this->AddAttribute(std::string(attr_name, M - 1), std::forward(value)); - } - /** Gets the Node's attributes. */ const NodeAttributes& GetAttributes() const noexcept { return attributes_; } diff --git a/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h b/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h index 6d2478a94881d..3dbd270cde9b6 100644 --- a/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h +++ b/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h @@ -10,7 +10,7 @@ extern "C" { #endif -ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, _In_ const char* settings); +ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, _In_ const char* opt_str); #ifdef __cplusplus } diff --git a/onnxruntime/contrib_ops/cpu/signal/dft.cc b/onnxruntime/contrib_ops/cpu/signal/dft.cc index d08852b84a124..4a90243da66df 100644 --- a/onnxruntime/contrib_ops/cpu/signal/dft.cc +++ b/onnxruntime/contrib_ops/cpu/signal/dft.cc @@ -39,20 +39,16 @@ ONNX_OPERATOR_KERNEL_EX( kMSExperimentalDomain, 1, kCpuExecutionProvider, - KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), + KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", BuildKernelDefConstraints()), STFT); static bool is_real_valued_signal(const onnxruntime::TensorShape & shape) { - // The first dimention is the batch size - // The second dimention is the signal value - return shape.NumDimensions() == 2; + return shape.NumDimensions() == 2 || shape[shape.NumDimensions() - 1] == 1; } static bool is_complex_valued_signal(const onnxruntime::TensorShape& shape) { - // The first dimention is the batch size - // The second dimention is the signal length - // The third dimention is set to 2 and represents the real and imaginary parts of the complex sample - return shape.NumDimensions() == 3 && shape[2] == 2; + return shape.NumDimensions() > 2 && shape[shape.NumDimensions() - 1] == 2; } static bool is_power_of_2(size_t size) { @@ -143,24 +139,27 @@ static T compute_angular_velocity(size_t number_of_samples, bool inverse) { } template -static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, - const Tensor* X, Tensor* Y, const Tensor* window, bool is_onesided, bool inverse, +static Status fft_radix2(OpKernelContext* /*ctx*/, + const Tensor* X, Tensor* Y, + size_t X_offset, size_t X_stride, size_t Y_offset, size_t Y_stride, int64_t axis, + const Tensor* window, bool is_onesided, bool inverse, std::vector>& V, std::vector>& temp_output) { // Get shape and significant bits const auto& X_shape = X->Shape(); - size_t number_of_samples = static_cast(X_shape[1]); + size_t number_of_samples = static_cast(X_shape[axis]); unsigned significant_bits = static_cast(log2(number_of_samples)); // Get data - auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + (batch_idx * number_of_samples); + auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + X_offset; // Get window U* window_data = nullptr; if (window) { window_data = const_cast(reinterpret_cast(window->DataRaw())); } + size_t Y_data_stride = 1; std::complex* Y_data; if 
(is_onesided) { if (temp_output.size() != number_of_samples) { @@ -168,7 +167,8 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, } Y_data = temp_output.data(); } else { - Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * number_of_samples); + Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; + Y_data_stride = Y_stride; } auto angular_velocity = compute_angular_velocity(number_of_samples, inverse); @@ -184,9 +184,9 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, for (size_t i = 0; i < number_of_samples; i++) { size_t bit_reversed_index = bit_reverse(i, significant_bits); - auto x = *(X_data + bit_reversed_index); + auto x = *(X_data + bit_reversed_index*X_stride); auto window_element = window_data ? *(window_data + bit_reversed_index) : 1; - *(Y_data + i) = std::complex(1, 0) * x * window_element; + *(Y_data + i*Y_data_stride) = std::complex(1, 0) * x * window_element; } // Run fft_radix2 @@ -199,8 +199,8 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, auto first_idx = bit_reverse(k, current_significant_bits); auto second_idx = bit_reverse(midpoint + k, current_significant_bits); for (size_t j = 0; j < number_of_samples; j += i) { - std::complex* even = (Y_data + j) + k; - std::complex* odd = (Y_data + j) + (midpoint + k); + std::complex* even = (Y_data + j*Y_data_stride) + k; + std::complex* odd = (Y_data + j*Y_data_stride) + (midpoint + k); std::complex first = *even + (V[first_idx] * *odd); std::complex second = *even + (V[second_idx] * *odd); *even = first; @@ -212,32 +212,34 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, // Scale the output if inverse if (inverse) { for (size_t i = 0; i < number_of_samples; i++) { - std::complex& val = *(Y_data + i); + std::complex& val = *(Y_data + i * Y_data_stride); val /= static_cast(number_of_samples); } } if (is_onesided) { - const auto& Y_shape = Y->Shape(); - size_t fft_output_size = static_cast(Y_shape[1]); - auto destination = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * fft_output_size); - memcpy(destination, Y_data, sizeof(std::complex) * fft_output_size); + auto destination = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; + for (size_t i = 0; i < number_of_samples; i++) { + *(destination + Y_stride * i) = *(Y_data + i); + } } return Status::OK(); } template -static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tensor* window, bool inverse) { +static Status dft_naive(const Tensor* X, Tensor* Y, + size_t X_offset, size_t X_stride, size_t Y_offset, size_t Y_stride, int64_t axis, + const Tensor* window, bool inverse) { // Get shape and significant bits const auto& X_shape = X->Shape(); - size_t number_of_samples = static_cast(X_shape[1]); + size_t number_of_samples = static_cast(X_shape[axis]); const auto& Y_shape = Y->Shape(); - size_t dft_output_size = static_cast(Y_shape[1]); + size_t dft_output_size = static_cast(Y_shape[axis]); // Get data - auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + (batch_idx * number_of_samples); - auto* Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * dft_output_size); + auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + X_offset; + auto* Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; U* window_data = nullptr; if (window) { @@ -247,14 +249,14 @@ static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tens auto angular_velocity = compute_angular_velocity(number_of_samples, inverse); for 
(size_t i = 0; i < dft_output_size; i++) { - std::complex& out = *(Y_data + i); + std::complex& out = *(Y_data + i*Y_stride); out.real(0); out.imag(0); for (size_t j = 0; j < number_of_samples; j++) { // vectorize over this loop auto exponential = std::complex(cos(i * j * angular_velocity), sin(i * j * angular_velocity)); auto window_element = window_data ? * (window_data + j) : 1; - auto element = *(X_data + j) * window_element; + auto element = *(X_data + j*X_stride) * window_element; out += exponential * element; } @@ -267,26 +269,70 @@ static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tens } template -static Status discrete_fourier_transform(OpKernelContext* ctx, const Tensor* X, Tensor* Y, const Tensor* window, bool is_onesided, bool inverse, +static Status discrete_fourier_transform(OpKernelContext* ctx, const Tensor* X, Tensor* Y, int64_t axis, const Tensor* window, bool is_onesided, bool inverse, std::vector>& V, std::vector>& temp_output) { // Get shape const auto& X_shape = X->Shape(); - size_t number_of_batches = static_cast(X_shape[0]); - size_t number_of_samples = static_cast(X_shape[1]); - - // radix 2 fft - for (size_t i = 0; i < number_of_batches; i++) { + const auto& Y_shape = Y->Shape(); + size_t number_of_samples = static_cast(X_shape[axis]); + + auto batch_and_signal_rank = X->Shape().NumDimensions(); + auto total_dfts = static_cast(X->Shape().Size() / X->Shape()[axis]); + + auto is_input_real = X->Shape().NumDimensions() == 2 || X->Shape()[X->Shape().NumDimensions() - 1] == 1; + auto compex_input_factor = is_input_real ? 1 : 2; + if (X->Shape().NumDimensions() > 2) + { + total_dfts /= X->Shape()[X->Shape().NumDimensions() - 1]; + batch_and_signal_rank -= 1; + } + + + + // Calculate x/y offsets/strides + for (size_t i = 0; i < total_dfts; i++) + { + size_t X_offset = 0; + size_t X_stride = X_shape.SizeFromDimension(axis+1) / compex_input_factor; + size_t cumulative_packed_stride = total_dfts; + size_t temp = i; + for (size_t r = 0; r < batch_and_signal_rank; r++) { + if (r == static_cast(axis)) + { + continue; + } + cumulative_packed_stride /= X_shape[r]; + auto index = temp / cumulative_packed_stride; + temp -= (index * cumulative_packed_stride); + X_offset += index * X_shape.SizeFromDimension(r + 1) / compex_input_factor; + } + + size_t Y_offset = 0; + size_t Y_stride = Y_shape.SizeFromDimension(axis + 1) / 2; + cumulative_packed_stride = total_dfts; + temp = i; + for (size_t r = 0; r < batch_and_signal_rank; r++) { + if (r == static_cast(axis)) + { + continue; + } + cumulative_packed_stride /= X_shape[r]; + auto index = temp / cumulative_packed_stride; + temp -= (index * cumulative_packed_stride); + Y_offset += index * Y_shape.SizeFromDimension(r + 1) / 2; + } + if (is_power_of_2(number_of_samples)) { - ORT_RETURN_IF_ERROR((fft_radix2(ctx, i, X, Y, window, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((fft_radix2(ctx, X, Y, X_offset, X_stride, Y_offset, Y_stride, axis, window, is_onesided, inverse, V, temp_output))); } else { - ORT_RETURN_IF_ERROR((dft_naive(i, X, Y, window, inverse))); + ORT_RETURN_IF_ERROR((dft_naive(X, Y, X_offset, X_stride, Y_offset, Y_stride, axis, window, inverse))); } - } + } return Status::OK(); } -static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, bool inverse) { +static Status discrete_fourier_transform(OpKernelContext* ctx, int64_t axis, bool is_onesided, bool inverse) { // Get input shape const auto* X = ctx->Input(0); const auto& X_shape = X->Shape(); @@ -295,13 
+341,21 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, // Get the DFT output size. Onesided will return only the unique values! // note: x >> 1 === std::floor(x / 2.f) - int64_t number_of_samples = static_cast(X_shape[1]); + int64_t number_of_samples = static_cast(X_shape[axis]); auto dft_output_size = is_onesided ? ((number_of_samples >> 1) + 1) : number_of_samples; // Get output shape - auto Y_shape = onnxruntime::TensorShape({X_shape[0], dft_output_size, 2}); + auto Y_shape = onnxruntime::TensorShape(X_shape); + if (X_shape.NumDimensions() == 2) + { + Y_shape = onnxruntime::TensorShape({X_shape[0], dft_output_size, 2}); + } else + { + Y_shape[Y_shape.NumDimensions() - 1] = 2; + } + Y_shape[axis] = dft_output_size; auto Y = ctx->Output(0, Y_shape); // Get data type @@ -312,9 +366,9 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, std::vector> V; std::vector> temp_output; if (is_real_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else if (is_complex_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else { ORT_THROW("Unsupported input signal shape. The signal's first dimenstion must be the batch dimension and its second dimension must be the signal length dimension. It may optionally include a 3rd dimension of size 2 for complex inputs.", data_type); } @@ -322,9 +376,9 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, std::vector> V; std::vector> temp_output; if (is_real_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else if (is_complex_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else { ORT_THROW("Unsupported input signal shape. The signal's first dimenstion must be the batch dimension and its second dimension must be the signal length dimension. 
It may optionally include a 3rd dimension of size 2 for complex inputs.", data_type); } @@ -336,12 +390,12 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, } Status DFT::Compute(OpKernelContext* ctx) const { - ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, is_onesided_, false)); + ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, axis_ + 1, is_onesided_, false)); return Status::OK(); } Status IDFT::Compute(OpKernelContext* ctx) const { - ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, false, true)); + ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, axis_ + 1, false, true)); return Status::OK(); } @@ -376,9 +430,9 @@ static Status short_time_fourier_transform(OpKernelContext* ctx, bool is_oneside // Get signal const auto* signal = ctx->Input(0); - const auto* window = ctx->Input(1); - const auto* frame_length_tensor = ctx->Input(2); - const auto frame_step = get_scalar_value_from_tensor(ctx->Input(3)); + const auto frame_step = get_scalar_value_from_tensor(ctx->Input(1)); + const auto* window = ctx->Input(2); + const auto* frame_length_tensor = ctx->Input(3); // Get input signal shape const auto& signal_shape = signal->Shape(); @@ -468,7 +522,7 @@ static Status short_time_fourier_transform(OpKernelContext* ctx, bool is_oneside 0); // Run individual dft - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, &input, &output, window, is_onesided, false, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, &input, &output, 1, window, is_onesided, false, V, temp_output))); } } diff --git a/onnxruntime/contrib_ops/cpu/signal/dft.h b/onnxruntime/contrib_ops/cpu/signal/dft.h index 2b04781c70f59..fc90d48fab25c 100644 --- a/onnxruntime/contrib_ops/cpu/signal/dft.h +++ b/onnxruntime/contrib_ops/cpu/signal/dft.h @@ -8,16 +8,20 @@ namespace contrib { class DFT final : public OpKernel { bool is_onesided_ = true; + int64_t axis_ = 0; public: explicit DFT(const OpKernelInfo& info) : OpKernel(info) { is_onesided_ = static_cast(info.GetAttrOrDefault("onesided", 0)); + axis_ = info.GetAttrOrDefault("axis", 0); } Status Compute(OpKernelContext* ctx) const override; }; class IDFT final : public OpKernel { + int64_t axis_ = 0; public: explicit IDFT(const OpKernelInfo& info) : OpKernel(info) { + axis_ = info.GetAttrOrDefault("axis", 0); } Status Compute(OpKernelContext* ctx) const override; }; diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 01820c34741f7..e77eadc010cfa 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -309,6 +309,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, << " kernel is not supported in " << expected_provider << "." 
<< " Encountered following errors: (" << ToString(verify_kernel_def_error_strs) << ")"; + VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 81c44b91f5e55..ef9d66f117998 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -22,6 +22,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -762,7 +763,7 @@ Status Node::LoadFromOrtFormat(const onnxruntime::fbs::Node& fbs_node, const log subgraphs_.push_back(std::move(subgraph)); } - AddAttribute(attr_proto.name(), std::move(attr_proto)); + AddAttributeProto(std::move(attr_proto)); } } @@ -872,82 +873,46 @@ void Node::CreateSubgraph(const std::string& attr_name) { #endif // !defined(ORT_MINIMAL_BUILD) -void Node::AddAttribute(std::string attr_name, const ONNX_NAMESPACE::AttributeProto& value) { - graph_->SetGraphResolveNeeded(); - graph_->SetGraphProtoSyncNeeded(); - attributes_[std::move(attr_name)] = value; -} +void Node::AddAttributeProto(AttributeProto value) { + utils::SetNodeAttribute(std::move(value), attributes_); -void Node::AddAttribute(std::string attr_name, ONNX_NAMESPACE::AttributeProto&& value) { graph_->SetGraphResolveNeeded(); graph_->SetGraphProtoSyncNeeded(); - attributes_[std::move(attr_name)] = std::move(value); } -static void AddAttributeHelper(Node& node, std::string attr_name, - AttributeProto_AttributeType attr_type, AttributeProto&& a) { - a.set_name(attr_name); - a.set_type(attr_type); - node.AddAttribute(std::move(attr_name), std::move(a)); -} - -void Node::AddAttribute(std::string attr_name, std::string value) { - AttributeProto a; - *(a.mutable_s()) = std::move(value); - AddAttributeHelper(*this, std::move(attr_name), - AttributeProto_AttributeType::AttributeProto_AttributeType_STRING, - std::move(a)); -}; - -#define ADD_BASIC_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, const type& value) { \ - AttributeProto a; \ - a.set_##field(value); \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ - }; - -#define ADD_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, const type& value) { \ - AttributeProto a; \ - *(a.mutable_##field()) = value; \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ +#define ADD_ATTR_SINGLE_IMPL(Type) \ + void Node::AddAttribute(std::string attr_name, Type value) { \ + AttributeProto a = utils::MakeAttribute(std::move(attr_name), std::move(value)); \ + AddAttributeProto(std::move(a)); \ } -#define ADD_ATTR_MOVE_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, type&& value) { \ - AttributeProto a; \ - *(a.mutable_##field()) = std::move(value); \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ +#define ADD_ATTR_LIST_IMPL(Type) \ + void Node::AddAttribute(std::string attr_name, gsl::span values) { \ + AttributeProto a = utils::MakeAttribute(std::move(attr_name), values); \ + AddAttributeProto(std::move(a)); \ } -#define ADD_LIST_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, \ - gsl::span values) { \ - AttributeProto a; \ - auto* mutable_field = a.mutable_##field(); \ - for (const auto& val : 
values) { \ - *(mutable_field->Add()) = val; \ - } \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ - } +#define ADD_ATTR_IMPLS(Type) \ + ADD_ATTR_SINGLE_IMPL(Type) \ + ADD_ATTR_LIST_IMPL(Type) -void Node::AddAttribute(std::string attr_name, const GraphProto& value) { - AttributeProto a; - *a.mutable_g() = value; - // Do not move attr_name as it is needed below - AddAttributeHelper(*this, attr_name, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, std::move(a)); - -#if !defined(ORT_MINIMAL_BUILD) - // subgraph is created via deserialization and not here in a minimal build - CreateSubgraph(attr_name); +ADD_ATTR_IMPLS(int64_t) +ADD_ATTR_IMPLS(float) +ADD_ATTR_IMPLS(std::string) +ADD_ATTR_IMPLS(TensorProto) +#if !defined(DISABLE_SPARSE_TENSORS) +ADD_ATTR_IMPLS(SparseTensorProto) #endif -}; +ADD_ATTR_IMPLS(TypeProto) + +#undef ADD_ATTR_SINGLE_IMPL +#undef ADD_ATTR_LIST_IMPL +#undef ADD_ATTR_IMPLS -void Node::AddAttribute(std::string attr_name, GraphProto&& value) { - AttributeProto a; - *a.mutable_g() = std::move(value); +void Node::AddAttribute(std::string attr_name, GraphProto value) { // Do not move attr_name as it is needed below - AddAttributeHelper(*this, attr_name, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, std::move(a)); + AttributeProto a = utils::MakeAttribute(attr_name, std::move(value)); + AddAttributeProto(std::move(a)); #if !defined(ORT_MINIMAL_BUILD) // subgraph is created via deserialization and not here in a minimal build @@ -955,24 +920,6 @@ void Node::AddAttribute(std::string attr_name, GraphProto&& value) { #endif }; -ADD_BASIC_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT, f) -ADD_BASIC_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INT, i) -ADD_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) -ADD_ATTR_MOVE_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) -ADD_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) -ADD_ATTR_MOVE_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) - -ADD_LIST_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOATS, floats) -ADD_LIST_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INTS, ints) -ADD_LIST_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS, strings) -ADD_LIST_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSORS, tensors) -ADD_LIST_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTOS, type_protos) -#if !defined(DISABLE_SPARSE_TENSORS) -ADD_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, sparse_tensor) -ADD_ATTR_MOVE_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, sparse_tensor) -ADD_LIST_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSORS, sparse_tensors) -#endif - #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) bool Node::ClearAttribute(const std::string& attr_name) { graph_->SetGraphResolveNeeded(); @@ -2588,8 +2535,9 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { // The attribute was not specified in the node. 
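// --- Illustrative sketch (not part of the patch): the refactor above routes the typed
// Node::AddAttribute overloads through utils::MakeAttribute and the new
// Node::AddAttributeProto. A hedged example of the resulting call pattern; the function
// name and attribute values are hypothetical, only APIs shown in this patch are assumed.
#include <utility>

#include "core/graph/graph.h"
#include "core/graph/node_attr_utils.h"

static void ExampleSetAttributes(onnxruntime::Node& node) {
  // Typed overload: builds an INT AttributeProto named "to" and stores it on the node.
  node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32});
  // Equivalent long form: build the AttributeProto explicitly (name and type are set by
  // MakeAttribute), then attach it; an existing attribute with the same name is overwritten.
  ONNX_NAMESPACE::AttributeProto a = onnxruntime::utils::MakeAttribute("alpha", 0.5f);
  node.AddAttributeProto(std::move(a));
}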
if (!attr_def.second.required) { if (utils::HasName(attr_def.second.default_value)) { + assert(attr_def.first == attr_def.second.default_value.name()); // Set default value to the node attributes. - node.AddAttribute(attr_def.first, attr_def.second.default_value); + node.AddAttributeProto(attr_def.second.default_value); } // TODO: Handle optional attribute but no default value specified in op definition. } else { diff --git a/onnxruntime/core/graph/node_attr_utils.cc b/onnxruntime/core/graph/node_attr_utils.cc new file mode 100644 index 0000000000000..120df9e5d43f3 --- /dev/null +++ b/onnxruntime/core/graph/node_attr_utils.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/graph/node_attr_utils.h" + +#include "core/common/common.h" +#include "core/framework/tensorprotoutils.h" + +using namespace ONNX_NAMESPACE; + +namespace onnxruntime::utils { + +static void SetNameAndType(std::string attr_name, AttributeProto_AttributeType attr_type, AttributeProto& a) { + a.set_name(std::move(attr_name)); + a.set_type(attr_type); +} + +#define MAKE_BASIC_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, type value) { \ + AttributeProto a; \ + a.set_##field(std::move(value)); \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +#define MAKE_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, type value) { \ + AttributeProto a; \ + *(a.mutable_##field()) = std::move(value); \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +#define MAKE_LIST_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, gsl::span values) { \ + AttributeProto a; \ + auto* mutable_field = a.mutable_##field(); \ + for (const auto& val : values) { \ + *(mutable_field->Add()) = val; \ + } \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +MAKE_BASIC_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INT, i) +MAKE_LIST_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INTS, ints) + +MAKE_BASIC_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT, f) +MAKE_LIST_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOATS, floats) + +MAKE_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRING, s) +MAKE_LIST_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS, strings) + +MAKE_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) +MAKE_LIST_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSORS, tensors) + +#if !defined(DISABLE_SPARSE_TENSORS) +MAKE_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, + sparse_tensor) +MAKE_LIST_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSORS, + sparse_tensors) +#endif + +MAKE_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) +MAKE_LIST_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTOS, type_protos) + +MAKE_ATTR_IMPL(GraphProto, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, g) +MAKE_LIST_ATTR_IMPL(GraphProto, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPHS, 
graphs) + +#undef MAKE_BASIC_ATTR_IMPL +#undef MAKE_ATTR_IMPL +#undef MAKE_LIST_ATTR_IMPL + +std::pair SetNodeAttribute(AttributeProto attribute, + NodeAttributes& node_attributes) { + ORT_ENFORCE(utils::HasName(attribute), "AttributeProto must have a name."); + std::string name = attribute.name(); + return node_attributes.insert_or_assign(std::move(name), std::move(attribute)); +} + +} // namespace onnxruntime::utils diff --git a/onnxruntime/core/graph/node_attr_utils.h b/onnxruntime/core/graph/node_attr_utils.h new file mode 100644 index 0000000000000..94242e8d26404 --- /dev/null +++ b/onnxruntime/core/graph/node_attr_utils.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include + +#include "onnx/onnx_pb.h" + +#include "core/graph/basic_types.h" + +namespace onnxruntime::utils { + +// keep these signatures in sync with DECLARE_MAKE_ATTRIBUTE_FNS below +/** Creates an AttributeProto with the specified name and value. */ +ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, int64_t value); +/** Creates an AttributeProto with the specified name and values. */ +ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, gsl::span values); + +#define DECLARE_MAKE_ATTRIBUTE_FNS(type) \ + ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, type value); \ + ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, gsl::span values) + +DECLARE_MAKE_ATTRIBUTE_FNS(float); +DECLARE_MAKE_ATTRIBUTE_FNS(std::string); +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::TensorProto); +#if !defined(DISABLE_SPARSE_TENSORS) +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::SparseTensorProto); +#endif +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::TypeProto); +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::GraphProto); + +#undef DECLARE_MAKE_ATTRIBUTE_FNS + +// The below overload is made so the compiler does not attempt to resolve +// string literals with the gsl::span overload +inline ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, const char* value) { + return MakeAttribute(std::move(attr_name), std::string{value}); +} + +/** + * Sets an attribute in `node_attributes` with key `attribute.name()` and value `attribute`. + * If an attribute with the same name exists, it will be overwritten. + * @return Pair of (iterator to attribute, whether attribute was added (true) or updated (false)). 
+ */ +std::pair SetNodeAttribute(ONNX_NAMESPACE::AttributeProto attribute, + NodeAttributes& node_attributes); + +} // namespace onnxruntime::utils diff --git a/onnxruntime/core/graph/signal_ops/signal_defs.cc b/onnxruntime/core/graph/signal_ops/signal_defs.cc index 6b78bf075af5d..70720b8f85ed0 100644 --- a/onnxruntime/core/graph/signal_ops/signal_defs.cc +++ b/onnxruntime/core/graph/signal_ops/signal_defs.cc @@ -42,6 +42,24 @@ static T get_scalar_value_from_tensor(const ONNX_NAMESPACE::TensorProto* t) { } } +inline const ONNX_NAMESPACE::TensorShapeProto* getOptionalInputShape(ONNX_NAMESPACE::InferenceContext& ctx, size_t n) { + const auto* input_type = ctx.getInputType(n); + + if (input_type == nullptr) { + return nullptr; + } + + const auto value_case = input_type->value_case(); + if (value_case != ONNX_NAMESPACE::TypeProto::kTensorType && value_case != ONNX_NAMESPACE::TypeProto::kSparseTensorType) { + fail_type_inference("Attribute expected to have tensor or sparse tensor type"); + } + if (value_case == ONNX_NAMESPACE::TypeProto::kTensorType) { + return &input_type->tensor_type().shape(); + } else { + return &input_type->sparse_tensor_type().shape(); + } +} + void RegisterSignalSchemas() { MS_SIGNAL_OPERATOR_SCHEMA(DFT) .SetDomain(kMSExperimentalDomain) @@ -53,132 +71,242 @@ void RegisterSignalSchemas() { "Values can be 0 or 1.", AttributeProto::AttributeType::AttributeProto_AttributeType_INT, static_cast(0)) + .Attr("axis", + "The axis on which to perform the DFT. By default this value is set to 0, which corresponds to the first dimension after the batch index." + "This value must be less than signal_dimN, where signal_dimN is the number of dimensions in the signal.", + AttributeProto::AttributeType::AttributeProto_AttributeType_INT, + static_cast(0)) .Input(0, - "input", - "For complex input, the following shape is expected: [batch_idx][n_fft][2]" - "The final dimension represents the real and imaginary parts of the value." - "For real input, the following shape is expected: [batch_idx][n_fft]" - "The first dimension is the batch dimension.", - "T") + "input", + "For real input, the following shape is expected: [batch_idx][n_fft]." + "For complex input, the following shape is expected: [batch_idx][n_fft][2]." + "The final dimension represents the real and imaginary parts of the value." + "For real multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]." + "For complex multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]." + "The first dimension is the batch dimension.", + "T") .Output(0, "output", "The Fourier Transform of the input vector." 
- "If onesided is 1, [batch_idx][floor(n_fft/2)+1][2]" - "If onesided is 0, [batch_idx][n_fft][2]", + "If signal_dimN = 1, and onesided is 0, [batch_idx][n_fft][2]" + "If signal_dimN = 1, and onesided is 1, [batch_idx][floor(n_fft/2)+1][2]" + "If signal_dimN = 2, and onesided is 0 and axis = 0, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 0 and axis = 1, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 1 and axis = 0, [batch_idx][floor(signal_dim1/2)+1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 1 and axis = 1, [batch_idx][signal_dim1][floor(signal_dim2/2)+1][2]", "T") - .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - int64_t ndim = 1; - - bool is_onesided = true; - auto attr_proto = ctx.getAttribute("onesided"); - if (attr_proto && attr_proto->has_i()) { - is_onesided = static_cast(attr_proto->i()); - } + propagateElemTypeFromInputToOutput(ctx, 0, 0); + const int64_t batch_ndim = 1; - if (ctx.getInputType(0)->tensor_type().has_shape()) { auto& input_shape = getInputShape(ctx, 0); - ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + auto dim_size = static_cast(input_shape.dim_size()); + auto has_component_dimension = dim_size > 2; + ONNX_NAMESPACE::TensorShapeProto result_shape_proto = input_shape; + + bool is_onesided = static_cast(getAttribute(ctx, "onesided", 0)); if (is_onesided) { - auto n_fft = input_shape.dim(1).dim_value(); - result_shape.mutable_dim(1)->set_dim_value((n_fft >> 1) + 1); + // Since signal_ndim = 1, and multidimensional DFT is not supported, + // only the single signal dim (1) needs to be updated + auto n_fft = input_shape.dim(1).dim_value(); + result_shape_proto.mutable_dim(1)->set_dim_value((n_fft >> 1) + 1); } - - auto dim_size = static_cast(input_shape.dim_size()); - if (dim_size == ndim + 1) { // real input - result_shape.add_dim()->set_dim_value(2); // output is same shape, but with extra dim for 2 values (real/imaginary) - } else if (dim_size == ndim + 2) { // complex input, do nothing + + if (has_component_dimension) { + result_shape_proto.mutable_dim(static_cast(dim_size - 1))->set_dim_value(2); } else { - fail_shape_inference( - "the input_shape must [batch_idx][n_fft] for real values or [batch_idx][n_fft][2] for complex values.") + result_shape_proto.add_dim()->set_dim_value(2); } - updateOutputShape(ctx, 0, result_shape); - } + + updateOutputShape(ctx, 0, result_shape_proto); }); - ; MS_SIGNAL_OPERATOR_SCHEMA(IDFT) .SetDomain(kMSExperimentalDomain) .SinceVersion(1) .SetDoc(R"DOC(IDFT)DOC") + .Attr("axis", + "The axis on which to perform the DFT. By default this value is set to 0, which corresponds to the first dimension after the batch index." + "This value must be less than signal_dimN, where signal_dimN is the number of dimensions in the signal.", + AttributeProto::AttributeType::AttributeProto_AttributeType_INT, + static_cast(0)) .Input(0, "input", - "A complex signal of dimension signal_ndim." - "The last dimension of the tensor should be 2," - "representing the real and imaginary components of complex numbers," - "and should have at least signal_ndim + 2 dimensions." + "For real input, the following shape is expected: [batch_idx][n_fft]." 
+ "For complex input, the following shape is expected: [batch_idx][n_fft][2]." + "The final dimension represents the real and imaginary parts of the value." + "For real multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]." + "For complex multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]." "The first dimension is the batch dimension.", "T") .Output(0, "output", - "The inverse fourier transform of the input vector," - "using the same format as the input.", - "T") - .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") + "The inverse discrete Fourier transform of the input. " + "If signal_dimN = 1, [batch_idx][n_fft][2]" + "If signal_dimN = 2 and axis = 0, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2 and axis = 1, [batch_idx][signal_dim1][signal_dim2][2]" + "For all types of input, the last dimension of the output represents the components of a complex number.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - int64_t ndim = 1; - auto attr_proto = ctx.getAttribute("signal_ndim"); - if (attr_proto && attr_proto->has_i()) { - ndim = static_cast(attr_proto->i()); - } + propagateElemTypeFromInputToOutput(ctx, 0, 0); + const int64_t batch_ndim = 1; + + auto& input_shape = getInputShape(ctx, 0); + ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + auto dim_size = static_cast(input_shape.dim_size()); + auto has_component_dimension = dim_size > 2; - auto& input_shape = getInputShape(ctx, 0); - ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + if (has_component_dimension) { + result_shape.mutable_dim(static_cast(dim_size - 1))->set_dim_value(2); + } else { + result_shape.add_dim()->set_dim_value(2); + } - auto dim_size = static_cast(input_shape.dim_size()); - if (dim_size == ndim + 1) { // real input - result_shape.add_dim()->set_dim_value(2); // output is same shape, but with extra dim for 2 values (real/imaginary) - } else if (dim_size == ndim + 2) { // complex input, do nothing - } else { - fail_shape_inference( - "the input_shape must have 1 + signal_ndim dimensions for real inputs, or 2 + signal_ndim dimensions for complex input.") - } - - updateOutputShape(ctx, 0, result_shape); + updateOutputShape(ctx, 0, result_shape); }); MS_SIGNAL_OPERATOR_SCHEMA(STFT) .SetDomain(kMSExperimentalDomain) .SinceVersion(1) .SetDoc(R"DOC(STFT)DOC") - .Attr("onesided", - "If True (default), only values for half of the fft size are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry." - "The output tensor will return the first floor(n_fft/2) + 1 values from the DFT." - "Values can be 0 or 1.", - AttributeProto::AttributeType::AttributeProto_AttributeType_INT, - static_cast(1)) + .Attr( + "onesided", + "If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because " + "the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m,w]=X[m,n_fft-w]*. " + "Note if the input or window tensors are complex, then onesided output is not possible. 
" + "Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT)." + "When invoked with real or complex valued input, the default value is 0. " + "Values can be 0 or 1.", + AttributeProto::INT, + static_cast(0)) .Input(0, "signal", - "A complex signal of dimension signal_ndim." - "The last dimension of the tensor should be 2," - "representing the real and imaginary components of complex numbers," - "and should have at least signal_ndim + 2 dimensions." - "The first dimension is the batch dimension.", - "T1") + "Input tensor representing a real or complex valued signal. " + "For real input, the following shape is expected: [batch_size][signal_length]. " + "For complex input, the following shape is expected: [batch_size][signal_length][2], where " + "[batch_size][signal_length][0] represents the real component and [batch_size][signal_length][1] represents the imaginary component of the signal.", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) .Input(1, + "frame_step", + "The number of samples to step between successive DFTs.", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input(2, "window", - "A tensor representing the window that will be slid over the input signal.", + "A tensor representing the window that will be slid over the signal." + "The window must have rank 1 with shape: [window_shape]. " + "It's an optional value. ", "T1", - OpSchema::FormalParameterOption::Optional) - .Input(2, - "frame_length", // frame_length, fft_length, pad_mode - "Size of the fft.", - "T2", - OpSchema::FormalParameterOption::Optional) + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) .Input(3, - "frame_step", - "The number of samples to step between successive DFTs.", - "T2") + "frame_length", + "A scalar representing the size of the DFT. " + "It's an optional value.", + "T2", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) .Output(0, "output", "The inverse fourier transform of the input vector," "using the same format as the input.", "T1") - .TypeConstraint("T1", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") - .TypeConstraint("T2", {"tensor(int64)"}, ""); + .TypeConstraint( + "T1", + {"tensor(float)", + "tensor(float16)", + "tensor(double)", + "tensor(bfloat16)"}, + "Constrain signal and output to float tensors.") + .TypeConstraint( + "T2", + {"tensor(int64)"}, + "Constrain scalar length types to int64_t.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + constexpr int64_t batch_ndim = 1; + constexpr int64_t component_ndim = 1; + + // Get inputs + auto& input_shape = getInputShape(ctx, 0); + auto frame_step = get_scalar_value_from_tensor(ctx.getInputData(1)); + const ONNX_NAMESPACE::TensorShapeProto* window_input = nullptr; + try { + window_input = getOptionalInputShape(ctx, 2); + } catch (...) { + window_input = nullptr; + } + + const ONNX_NAMESPACE::TensorShapeProto* frame_length_input = nullptr; + try { + frame_length_input = getOptionalInputShape(ctx, 3); + } catch (...) { + frame_length_input = nullptr; + } + + // Determine the size of the DFT based on the 2 optional inputs window and frame_length. One must be set. 
+ int64_t dft_size = 0; + if (window_input == nullptr && frame_length_input == nullptr) { + fail_type_inference("STFT expects to have at least one of these inputs set: [window, frame_length]."); + } else if (window_input != nullptr && window_input->dim_size() > 0 && frame_length_input != nullptr) { + if (window_input->dim_size() != 1) { + fail_type_inference("STFT's window input, must have rank = 1."); + } + auto window_length = window_input->dim(0).dim_value(); + auto frame_length = get_scalar_value_from_tensor(ctx.getInputData(3)); + if (window_length != frame_length) { + fail_type_inference("If STFT has both a window input and frame_length specified, the dimension of the window must match the frame_length specified!"); + } + dft_size = window_length; + } else if (window_input != nullptr && window_input->dim_size() > 0) { + if (window_input->dim_size() != 1) { + fail_type_inference("STFT's window input, must have rank = 1."); + } + dft_size = window_input->dim(0).dim_value(); + } else if (frame_length_input != nullptr) { + dft_size = get_scalar_value_from_tensor(ctx.getInputData(3)); + } + + bool is_onesided = static_cast(getAttribute(ctx, "onesided", 0)); + if (is_onesided) { + dft_size = is_onesided ? ((dft_size >> 1) + 1) : dft_size; + } + + auto signal_size = input_shape.dim(1).dim_value(); + auto n_dfts = static_cast(std::floor((signal_size - dft_size) / static_cast(frame_step)) + 1); + + // The output has the following shape: [batch_size][frames][dft_unique_bins][2] + ONNX_NAMESPACE::TensorShapeProto result_shape_proto; + result_shape_proto.add_dim()->set_dim_value(input_shape.dim(0).dim_value()); // batch size + result_shape_proto.add_dim()->set_dim_value(n_dfts); + result_shape_proto.add_dim()->set_dim_value(dft_size); + result_shape_proto.add_dim()->set_dim_value(2); + updateOutputShape(ctx, 0, result_shape_proto); + }); // Window Functions MS_SIGNAL_OPERATOR_SCHEMA(HannWindow) diff --git a/onnxruntime/core/optimizer/bias_dropout_fusion.cc b/onnxruntime/core/optimizer/bias_dropout_fusion.cc index e81dd40c482d5..31f800aa6f0cd 100644 --- a/onnxruntime/core/optimizer/bias_dropout_fusion.cc +++ b/onnxruntime/core/optimizer/bias_dropout_fusion.cc @@ -188,10 +188,10 @@ Status BiasDropoutFusion::ApplyImpl(Graph& graph, bool& modified, int graph_leve kMSDomain); // Get attribute "seed" from "Dropout" node if available. - NodeAttributes dropout_attrs = dropout_node.GetAttributes(); + const NodeAttributes& dropout_attrs = dropout_node.GetAttributes(); NodeAttributes::const_iterator seed = dropout_attrs.find("seed"); if (seed != dropout_attrs.end()) { - dropout_add_fusion_node.AddAttribute("seed", seed->second); + dropout_add_fusion_node.AddAttributeProto(seed->second); } // Assign provider to this new node. Provider should be same as the provider for old node. 
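// --- Illustrative sketch (not part of the patch): the STFT shape inference above produces
// [batch_size][n_dfts][dft_size][2]. When onesided is 1 the inference first shrinks
// dft_size to (dft_size >> 1) + 1, then n_dfts = floor((signal_length - dft_size) /
// frame_step) + 1. The helper and the 16000 / 400 / 160 sample numbers below are only an example.
#include <cmath>
#include <cstdint>

inline int64_t ExampleStftFrameCount(int64_t signal_length, int64_t dft_size, int64_t frame_step) {
  return static_cast<int64_t>(std::floor((signal_length - dft_size) / static_cast<double>(frame_step)) + 1);
}
// ExampleStftFrameCount(16000, 400, 160) == 98 frames when onesided is 0;
// with onesided = 1 the inference uses dft_size = (400 >> 1) + 1 == 201 instead.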
diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index 4d6fdebbe49a1..a02aa309a0bd2 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -8,6 +8,7 @@ #include "core/common/inlined_containers.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_utils.h" +#include "core/graph/node_attr_utils.h" #include "core/optimizer/utils.h" #include "core/optimizer/selectors_actions/actions.h" @@ -137,23 +138,6 @@ class ConvAddRelu : public NodeSelector { #endif // !defined(ORT_MINIMAL_BUILD) namespace actions { -// TODO refactor to lift common logic from Node::AddAttribute() -void SetStringAttribute(std::string name, std::string value, NodeAttributes& attributes) { - ONNX_NAMESPACE::AttributeProto a{}; - a.set_name(name); - a.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING); - a.set_s(std::move(value)); - attributes.insert_or_assign(std::move(name), std::move(a)); -}; - -void SetFloatsAttribute(std::string name, gsl::span value, NodeAttributes& attributes) { - ONNX_NAMESPACE::AttributeProto a{}; - a.set_name(name); - a.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOATS); - a.mutable_floats()->Assign(value.begin(), value.end()); - attributes.insert_or_assign(std::move(name), std::move(a)); -}; - using NTO = NodesToOptimize; class FuseConvActivation : public ReplaceWithNew { @@ -169,7 +153,7 @@ class FuseConvActivation : public ReplaceWithNew { ORT_ENFORCE(activation != nullptr, "Expected activation node."); const auto& activation_op_type = activation->OpType(); - SetStringAttribute("activation", activation_op_type, extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation", activation_op_type), extra_fused_conv_attributes); InlinedVector activation_params; if (activation_op_type == "LeakyRelu") { @@ -190,7 +174,8 @@ class FuseConvActivation : public ReplaceWithNew { } if (!activation_params.empty()) { - SetFloatsAttribute("activation_params", activation_params, extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation_params", activation_params), + extra_fused_conv_attributes); } return extra_fused_conv_attributes; @@ -215,7 +200,7 @@ class FuseConvAddRelu : public ReplaceWithNew { NodeAttributes ExtraAttributes(const RuntimeState&) const override { NodeAttributes extra_fused_conv_attributes; - SetStringAttribute("activation", "Relu", extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation", "Relu"), extra_fused_conv_attributes); return extra_fused_conv_attributes; } diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 9bf517226fa7f..6b9c0f897a4e4 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -38,11 +38,7 @@ static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_ kOnnxDomain); // Add attribute: "to" = 6 - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_INT32)); - node.AddAttribute("to", std::move(to)); + node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32}); node.SetExecutionProviderType(provider_type); return &cast32; @@ -525,7 +521,7 @@ static void 
CreateEmbedLayernormNode(Graph& graph, NodeAttributes ln_attrs = layer_norm_node.GetAttributes(); NodeAttributes::const_iterator epsilon = ln_attrs.find("epsilon"); if (epsilon != ln_attrs.end()) { - embed_layer_norm_node.AddAttribute("epsilon", epsilon->second); + embed_layer_norm_node.AddAttributeProto(epsilon->second); } else { embed_layer_norm_node.AddAttribute("epsilon", contrib::kDefaultEmbedLayerNormEpsilon); } diff --git a/onnxruntime/core/optimizer/gemm_activation_fusion.cc b/onnxruntime/core/optimizer/gemm_activation_fusion.cc index 21e3b40e10d1c..9c0f0a8d202b2 100644 --- a/onnxruntime/core/optimizer/gemm_activation_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_activation_fusion.cc @@ -83,7 +83,7 @@ Status GemmActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l for (const auto& attr : attrs) { AttributeProto fused_gemm_attr(attr.second); fused_gemm_attr.set_name("activation_" + attr.first); - fused_gemm.AddAttribute("activation_" + attr.first, std::move(fused_gemm_attr)); + fused_gemm.AddAttributeProto(std::move(fused_gemm_attr)); } // move output definitions and edges from act_node to fused_gemm. delete gemm_node and act_node. diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index 019b5a3949799..af8183b9e63bc 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -447,7 +447,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr continue; } bool cast_1_present = false; - int64_t cast_1_to_attr; + int64_t cast_1_to_attr{}; // check if there are Casts as input to the Pow and Div if (p_div_input == p_pow_input) { const Node* p_pow_input_node = graph_utils::GetInputNode(pow_node, 0); @@ -574,7 +574,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr // Assign provider to this new node. Provider should be same as the provider for old node. 
layer_norm_node.SetExecutionProviderType(reduce_mean_node.GetExecutionProviderType()); - if (allow_precision_change_ && p_cast_2 != nullptr) { + if (allow_precision_change_ && cast_1_present && p_cast_2 != nullptr) { ONNX_NAMESPACE::TensorProto_DataType cast_1_type = gsl::narrow_cast(cast_1_to_attr); const ONNX_NAMESPACE::TypeProto* casted_type = DataTypeImpl::TensorTypeFromONNXEnum(cast_1_type)->GetTypeProto(); NodeArg* LN_output = &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("layer_norm_out"), casted_type); diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index ad8fb13b56a49..a81ca67052db0 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -248,7 +248,7 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le NodeAttributes ln_attrs = ln_node.GetAttributes(); NodeAttributes::const_iterator epsilon = ln_attrs.find("epsilon"); if (epsilon != ln_attrs.end()) { - skip_layer_norm_node.AddAttribute("epsilon", epsilon->second); + skip_layer_norm_node.AddAttributeProto(epsilon->second); } else { skip_layer_norm_node.AddAttribute("epsilon", contrib::kDefaultSkipLayerNormEpsilon); } diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc index 26e99574b1871..a4efbf6a90e74 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc @@ -326,7 +326,7 @@ void ApiNode::CopyAttributes(const api::NodeRef& node) { const ApiNode& ort_node = static_cast(node); const NodeAttributes& attributes = ort_node.node_.GetAttributes(); for (const auto& pair : attributes) { - node_.AddAttribute(pair.first, pair.second); + node_.AddAttributeProto(pair.second); } } diff --git a/onnxruntime/core/platform/posix/ort_mutex.cc b/onnxruntime/core/platform/posix/ort_mutex.cc index 8a5d41eb36080..e124ce168085f 100644 --- a/onnxruntime/core/platform/posix/ort_mutex.cc +++ b/onnxruntime/core/platform/posix/ort_mutex.cc @@ -5,7 +5,6 @@ #include "core/platform/ort_mutex.h" #include #include -#include #include namespace onnxruntime { diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 1a3818b758def..1a1f30fdf97a5 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -16,7 +16,6 @@ limitations under the License. 
#include "core/platform/env.h" -#include #include #include diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 81cdb6d5577f3..f9faae787ac4a 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -202,7 +202,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, // Transpose021 has a specialized Transpose3DImpl kernel dim3 grid_size, block_size; - if (CanDoTranspose3D(prop, new_rank, new_input_dims, new_permutations, grid_size, block_size)) { + if (CanDoTranspose3D(prop, static_cast(new_rank), new_input_dims, new_permutations, grid_size, block_size)) { TensorPitches new_input_strides(new_input_dims); return Transpose3DImpl(stream, element_size, ToConstSpan(new_input_dims), ToConstSpan(new_input_strides), input.DataRaw(), output.MutableDataRaw(), output.Shape().Size(), grid_size, block_size); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 0b77a2012ef14..3d79e126a2e7a 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -7,44 +7,52 @@ namespace onnxruntime { namespace cuda { -constexpr unsigned int TILE_DIM = 16; +constexpr unsigned int NUM_ELE_PER_THREAD = 4; -template -__global__ void Transpose3DKernel(const TArray input_shape, - const TArray input_strides, +template +__global__ void Transpose3DKernel(const TArray input_shape, const TArray input_strides, const T* input_data, T* output_data) { - __shared__ T tile[TILE_DIM * (TILE_DIM + 1)]; + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; int x = blockIdx.x * TILE_DIM + threadIdx.x; int y = blockIdx.y * TILE_DIM + threadIdx.y; - tile[threadIdx.y * TILE_DIM + threadIdx.x] = input_data[blockIdx.z * input_strides[0] + y * input_shape[2] + x]; +#pragma unroll + for (unsigned int i = 0; i < TILE_DIM; i += (TILE_DIM / NUM_ELE_PER_THREAD)) { + tile[threadIdx.y + i][threadIdx.x] = input_data[blockIdx.z * input_strides[0] + (y + i) * input_shape[2] + x]; + } __syncthreads(); x = blockIdx.y * TILE_DIM + threadIdx.x; y = blockIdx.x * TILE_DIM + threadIdx.y; - output_data[blockIdx.z * input_strides[0] + y * input_shape[1] + x] = tile[threadIdx.x * TILE_DIM + threadIdx.y]; +#pragma unroll + for (unsigned int i = 0; i < TILE_DIM; i += (TILE_DIM / NUM_ELE_PER_THREAD)) { + output_data[blockIdx.z * input_strides[0] + (y + i) * input_shape[1] + x] = tile[threadIdx.x][threadIdx.y + i]; + } } -bool CanDoTranspose3D(const cudaDeviceProp& prop, - int32_t rank, - const gsl::span& input_dims, - const gsl::span& permutations, - dim3& grid_size, dim3& block_size) { - if (rank == 3 && - // permutation is done in the last two dimensions. - permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2) && - // the last two dimensions are aligned with TILE_DIM. - input_dims[rank - 2] % TILE_DIM == 0 && input_dims[rank - 1] % TILE_DIM == 0) { - int grid_size_x = static_cast(input_dims[2] / TILE_DIM); - int grid_size_y = static_cast(input_dims[1] / TILE_DIM); +bool CanDoTranspose3D(const cudaDeviceProp& prop, size_t rank, const gsl::span& input_dims, + const gsl::span& permutations, dim3& grid_size, dim3& block_size) { + // Permutation is done in the last two dimensions and the last two dimensions are aligned with TILE_DIM. 
+ if (rank == 3 && permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2)) { + unsigned int tile_dim = 0; + if (input_dims[rank - 2] % 32 == 0 && input_dims[rank - 1] % 32 == 0) { + tile_dim = 32; + } else if (input_dims[rank - 2] % 16 == 0 && input_dims[rank - 1] % 16 == 0) { + tile_dim = 16; + } else { + return false; + } + + int grid_size_x = static_cast(input_dims[2] / tile_dim); + int grid_size_y = static_cast(input_dims[1] / tile_dim); int grid_size_z = static_cast(input_dims[0]); - if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] && grid_size_z <= prop.maxGridSize[2]) { - block_size = dim3(TILE_DIM, TILE_DIM); - grid_size = dim3(static_cast(grid_size_x), - static_cast(grid_size_y), + if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] && + grid_size_z <= prop.maxGridSize[2]) { + block_size = dim3(tile_dim, tile_dim / NUM_ELE_PER_THREAD); + grid_size = dim3(static_cast(grid_size_x), static_cast(grid_size_y), static_cast(grid_size_z)); return true; } else { @@ -54,34 +62,28 @@ bool CanDoTranspose3D(const cudaDeviceProp& prop, return false; } -Status Transpose3DImpl(cudaStream_t stream, size_t element_size, - const TArray& input_shape, const TArray& input_strides, - const void* input_data, void* output_data, int64_t N, const dim3& grid_size, const dim3& block_size) { +#define CALL_TRANSPOSE_3D(type, tile_dim) \ + Transpose3DKernel<<>>( \ + input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), \ + reinterpret_cast::MappedType*>(output_data)) + +#define HANDLE_TRANSPOSE_3D_TILE_DIM(type) \ + case sizeof(type): { \ + if (block_size.x == 32) { \ + CALL_TRANSPOSE_3D(type, 32); \ + } else { \ + CALL_TRANSPOSE_3D(type, 16); \ + } \ + } break + +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, + const TArray& input_strides, const void* input_data, void* output_data, int64_t N, + const dim3& grid_size, const dim3& block_size) { switch (element_size) { - case sizeof(int8_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int16_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int32_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int64_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; + HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int16_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int32_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int64_t); default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. 
Element size was ", element_size); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h index 4e4d7d8bcabab..96d2686170e1d 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { bool CanDoTranspose3D(const cudaDeviceProp& prop, - int32_t rank, const gsl::span& input_dims, const gsl::span& permutations, + size_t rank, const gsl::span& input_dims, const gsl::span& permutations, dim3& grid_size, dim3& block_size); Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp index b029b56bb1b43..54223e450f925 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp @@ -214,7 +214,7 @@ namespace Dml // Change the name of the attribute to its fused node version std::string fusedAttributeName = Dml::FusionHelpers::GetFusedAttributeName(attribute.first); attribute.second.set_name(fusedAttributeName); - node.AddAttribute(fusedAttributeName, attribute.second); + node.AddAttributeProto(attribute.second); } } } diff --git a/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc b/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc deleted file mode 100644 index f066c6023ebf2..0000000000000 --- a/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright (c) 2022 Hailo Technologies Ltd. All rights reserved. - * Distributed under the MIT license (https://opensource.org/licenses/MIT) - **/ - -#include "core/providers/shared_library/provider_api.h" -#include "hailo_global_vdevice.h" - -namespace onnxruntime { - -std::mutex GlobalVDevice::m_mutex; - -GlobalVDevice& GlobalVDevice::get_instance() -{ - static GlobalVDevice instance; - return instance; -} - -std::shared_ptr GlobalVDevice::get_vdevice() -{ - std::lock_guard lock(m_mutex); - if (!m_vdevice) { - m_vdevice = create_vdevice(); - } - - return m_vdevice; -} - -void GlobalVDevice::release() -{ - std::lock_guard lock(m_mutex); - if (m_vdevice.use_count() <= 1) { - m_vdevice.reset(); - } -} - -std::shared_ptr GlobalVDevice::create_vdevice() -{ - hailo_vdevice_params_t params; - auto status = hailo_init_vdevice_params(¶ms); - HAILO_ORT_ENFORCE(HAILO_SUCCESS == status, "Failed init vdevice_params, status = ", status); - params.scheduling_algorithm = HAILO_SCHEDULING_ALGORITHM_ROUND_ROBIN; - - auto vdevice = VDevice::create(params); - HAILO_CHECK_EXPECTED(vdevice, "Creating VDevice failed"); - return vdevice.release(); -} - -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/hailo/hailo_global_vdevice.h b/onnxruntime/core/providers/hailo/hailo_global_vdevice.h deleted file mode 100644 index df8f81cd1410c..0000000000000 --- a/onnxruntime/core/providers/hailo/hailo_global_vdevice.h +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright (c) 2022 Hailo Technologies Ltd. All rights reserved. 
- * Distributed under the MIT license (https://opensource.org/licenses/MIT) - **/ - -#pragma once -#include "hailo/hailort.hpp" -#include "utils.h" - -namespace onnxruntime { - -using hailort::VDevice; - -class GlobalVDevice { -public: - static GlobalVDevice& get_instance(); - std::shared_ptr get_vdevice(); - void release(); - - static std::mutex m_mutex; - -private: - GlobalVDevice() : m_vdevice() {} - GlobalVDevice(GlobalVDevice const&) = delete; - void operator=(GlobalVDevice const&) = delete; - - std::shared_ptr create_vdevice(); - - std::shared_ptr m_vdevice; -}; - -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/hailo/hailo_op.cc b/onnxruntime/core/providers/hailo/hailo_op.cc index dbe603b13ce9d..65b8e8906b92a 100644 --- a/onnxruntime/core/providers/hailo/hailo_op.cc +++ b/onnxruntime/core/providers/hailo/hailo_op.cc @@ -6,7 +6,6 @@ #include "core/providers/shared_library/provider_api.h" #include "hailo_op.h" #include "utils.h" -#include "hailo_global_vdevice.h" #include #include @@ -50,7 +49,16 @@ HailoKernel::HailoKernel(const OpKernelInfo& info) : OpKernel(info), m_mutex() HAILO_ORT_ENFORCE(status.IsOK(), "attribute '", OUTPUT_ORDER_ATTRIBUTE, "' is not set"); m_hef = create_hef_from_memory(binary_hef.c_str(), binary_hef.length()); - m_vdevice = GlobalVDevice::get_instance().get_vdevice(); + + hailo_vdevice_params_t params; + auto hailo_status = hailo_init_vdevice_params(¶ms); + HAILO_ORT_ENFORCE(HAILO_SUCCESS == hailo_status, "Failed init vdevice_params, status = ", hailo_status); + params.scheduling_algorithm = HAILO_SCHEDULING_ALGORITHM_ROUND_ROBIN; + params.group_id = "SHARED"; + auto expected_vdevice = VDevice::create(params); + HAILO_CHECK_EXPECTED(expected_vdevice, "Failed to create VDevice"); + m_vdevice = std::move(expected_vdevice.value()); + m_network_group = configure_network_group(*m_vdevice.get()); auto output_nodes = info.node().OutputDefs(); @@ -64,7 +72,6 @@ HailoKernel::~HailoKernel() m_network_group.reset(); m_vdevice.reset(); m_hef.reset(); - GlobalVDevice::get_instance().release(); } std::unique_ptr HailoKernel::create_hef_from_memory(const void* binary_hef, size_t size) diff --git a/onnxruntime/core/providers/tvm/custom_logging.cc b/onnxruntime/core/providers/tvm/custom_logging.cc index 08053e456aed9..3140683825da6 100644 --- a/onnxruntime/core/providers/tvm/custom_logging.cc +++ b/onnxruntime/core/providers/tvm/custom_logging.cc @@ -11,31 +11,33 @@ #include #include + // TODO(agladyshev): Make conditional choice of sep for Windows and UNIX std::string GetFileName(const std::string& file_path, char sep = '/') { - return {std::next(file_path.begin(), file_path.find_last_of(sep) + 1), - file_path.end()}; + return {std::next(file_path.begin(), file_path.find_last_of(sep) + 1), + file_path.end()}; } std::string GetTimedLogMessage(const std::string& file, int lineno, const std::string& message) { - std::stringstream sstream; - std::string file_name = GetFileName(file); - std::time_t t = std::time(nullptr); - sstream << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "][TVM] " - << file_name << ":" << lineno << ": " + message; - return sstream.str(); + std::stringstream sstream; + std::string file_name = GetFileName(file); + std::time_t t = std::time(nullptr); + sstream << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "][TVM] " + << file_name << ":" << lineno << ": " + message; + return sstream.str(); } namespace tvm { namespace runtime { namespace detail { - void LogFatalImpl(const std::string& file, 
int lineno, const std::string& message) { - throw std::runtime_error(GetTimedLogMessage(file, lineno, message)); - } + void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + throw std::runtime_error(GetTimedLogMessage(file, lineno, message)); + } + + void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << GetTimedLogMessage(file, lineno, message) << std::endl; + } - void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { - std::cerr << GetTimedLogMessage(file, lineno, message) << std::endl; - } } // namespace detail } // namespace runtime } // namespace tvm diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.cc b/onnxruntime/core/providers/tvm/tvm_allocator.cc index 4fadff5c57b9a..ef06e1f59a94a 100644 --- a/onnxruntime/core/providers/tvm/tvm_allocator.cc +++ b/onnxruntime/core/providers/tvm/tvm_allocator.cc @@ -8,6 +8,7 @@ namespace onnxruntime { +namespace tvm { void* TVMAllocator::Alloc(size_t size) { void* p = nullptr; @@ -24,4 +25,5 @@ void TVMAllocator::Free(void* p) { TVMDeviceFreeDataSpace(ctx, p); } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.h b/onnxruntime/core/providers/tvm/tvm_allocator.h index 11854024879c9..50a13f890ac86 100644 --- a/onnxruntime/core/providers/tvm/tvm_allocator.h +++ b/onnxruntime/core/providers/tvm/tvm_allocator.h @@ -7,7 +7,9 @@ #include "core/framework/allocator.h" #include "tvm_common.h" + namespace onnxruntime { +namespace tvm { #define TVM_ALLOC_ALIGN 128 @@ -22,14 +24,14 @@ class TVMAllocator : public IAllocator { : IAllocator(info) { switch (info.device.Type()) { case OrtDevice::CPU: - ctx = {kDLCPU, info.device.Id()}; - break; + ctx = {kDLCPU, info.device.Id()}; + break; case OrtDevice::GPU: - ctx = {kDLVulkan, info.device.Id()}; - break; + ctx = {kDLVulkan, info.device.Id()}; + break; default: - ORT_NOT_IMPLEMENTED("Unsupported device"); - break; + ORT_NOT_IMPLEMENTED("Unsupported device"); + break; } } @@ -38,5 +40,7 @@ class TVMAllocator : public IAllocator { DLDevice ctx; }; -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime + #endif // TVM_ALLOCATOR diff --git a/onnxruntime/core/providers/tvm/tvm_api.cc b/onnxruntime/core/providers/tvm/tvm_api.cc index f225e06528c7c..ff61c6c43d33e 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.cc +++ b/onnxruntime/core/providers/tvm/tvm_api.cc @@ -17,16 +17,9 @@ using TvmPackedFunc = ::tvm::PackedFunc; TvmModule TVMCompile(const std::string& onnx_txt, const std::string& model_path, - const std::string& executor, - const std::string& target, - const std::string& target_host, - int opt_level, + const TvmEPOptions& options, int opset, - bool freeze_params, - const std::vector>& input_shapes, - bool nhwc, - const std::string& tuning_logfile, - const std::string& tuning_type) + const TVMTensorShapes& input_shapes) { ::tvm::Array shapes; for (size_t i = 0; i < input_shapes.size(); ++i) @@ -41,19 +34,18 @@ TvmModule TVMCompile(const std::string& onnx_txt, const TvmPackedFunc* compile = ::tvm::runtime::Registry::Get("tvm_onnx_import_and_compile"); ORT_ENFORCE(compile != nullptr, "Unable to retrieve 'tvm_onnx_import_and_compile'."); - TvmModule mod = (*compile)( - TVMByteArray{onnx_txt.data(), onnx_txt.size()}, - model_path, - executor, - target, - target_host, - opt_level, - opset, - freeze_params, - shapes, - nhwc, - tuning_logfile, - tuning_type); + TvmModule mod = 
(*compile)(TVMByteArray{onnx_txt.data(), onnx_txt.size()}, + model_path, + options.executor, + options.target, + options.target_host, + options.opt_level, + opset, + options.freeze_weights, + shapes, + options.to_nhwc, + options.tuning_file_path, + options.tuning_type); ORT_ENFORCE(mod.get() != nullptr, "Compiled TVM Module is nullptr!"); return mod; } @@ -108,20 +100,19 @@ void TVM_VM_GetOutputs(TvmModule& mod, } void TVMGetOutputShapes(TvmModule& mod, - size_t num_outputs, - std::vector>& output_shapes) + TVMTensorShapes& output_shapes) { - output_shapes.clear(); + size_t size = output_shapes.size(); TvmPackedFunc get_output = mod.GetFunction("get_output", false); - for (size_t i = 0; i < num_outputs; ++i) { + for (size_t i = 0; i < size; ++i) { ::tvm::runtime::NDArray output_array = get_output(i); ::tvm::runtime::ShapeTuple shape_tuple = output_array.Shape(); size_t dims_num = shape_tuple.size(); - std::vector dims; + TensorShapeVector dims; for (size_t j = 0; j < dims_num; ++j) { dims.push_back(int64_t(shape_tuple[j])); } - output_shapes.push_back(dims); + output_shapes[i] = dims; } } diff --git a/onnxruntime/core/providers/tvm/tvm_api.h b/onnxruntime/core/providers/tvm/tvm_api.h index 291da438618a9..77241def6e745 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.h +++ b/onnxruntime/core/providers/tvm/tvm_api.h @@ -9,31 +9,27 @@ #include "tvm_common.h" #include "tvm_defaults.h" +#include "tvm_ep_options.h" + namespace onnxruntime { namespace tvm { - TvmModule TVMCompile(const std::string& onnx_txt, - const std::string& model_path, - const std::string& executor, - const std::string& target, - const std::string& target_host, - int opt_level, - int opset, - bool freeze_params, - const std::vector>& input_shapes, - bool nhwc = false, - const std::string& tuning_logfile = "", - const std::string& tuning_type = std::string(onnxruntime::tvm::default_tuning_type)); - void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); - void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); - void TVMGetOutputs(TvmModule& mod, std::vector& outputs); - void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); - void TVMGetOutputShapes(TvmModule& mod, - size_t num_outputs, - std::vector>& output_shapes); - void TVMRun(TvmModule& mod); - void TVM_VM_Run(TvmModule& mod); + + TvmModule TVMCompile(const std::string& onnx_txt, + const std::string& model_path, + const TvmEPOptions& options, + int opset, + const TVMTensorShapes& input_shapes); + void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); + void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); + void TVMGetOutputs(TvmModule& mod, std::vector& outputs); + void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); + void TVMGetOutputShapes(TvmModule& mod, + TVMTensorShapes& output_shapes); + void TVMRun(TvmModule& mod); + void TVM_VM_Run(TvmModule& mod); + } // namespace tvm } // namespace onnxruntime -#endif // TVM_API_H \ No newline at end of file +#endif // TVM_API_H diff --git a/onnxruntime/core/providers/tvm/tvm_common.h b/onnxruntime/core/providers/tvm/tvm_common.h index 5b3a0c4dea9ec..17120eb2a2afe 100644 --- a/onnxruntime/core/providers/tvm/tvm_common.h +++ b/onnxruntime/core/providers/tvm/tvm_common.h @@ -4,10 +4,20 @@ #ifndef TVM_COMMON_H #define TVM_COMMON_H +#include +#include + #include #include #include -using TvmModule = tvm::runtime::Module; + +namespace onnxruntime { +namespace tvm { + +using TvmModule = ::tvm::runtime::Module; + +} // 
namespace tvm +} // namespace onnxruntime #endif // TVM_COMMON_H diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.cc b/onnxruntime/core/providers/tvm/tvm_compiler.cc new file mode 100644 index 0000000000000..dfbf182506556 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_compiler.cc @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "tvm_compiler.h" +#include "tvm_api.h" + + +namespace onnxruntime { +namespace tvm { + +TVMCompiler::TVMCompiler(std::string&& onnx_model_str, + const std::string& model_path, + int opset) : +onnx_model_str_(std::move(onnx_model_str)), +model_path_(model_path), +opset_(opset) { +} + +auto TVMCompiler::operator()(const TvmEPOptions& options, + const TVMTensorShapes& input_shapes) -> ModulePtr { + if (mod_) { + return mod_; + } + + mod_ = std::make_shared(); + *mod_ = tvm::TVMCompile(onnx_model_str_, + model_path_, + options, + opset_, + input_shapes); + onnx_model_str_.clear(); + return mod_; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.h b/onnxruntime/core/providers/tvm/tvm_compiler.h new file mode 100644 index 0000000000000..057ed058fd4c4 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_compiler.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef TVM_COMPILER_H +#define TVM_COMPILER_H + +#include +#include + +#include "tvm_common.h" +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +class TVMCompiler { + using ModulePtr = std::shared_ptr; +public: + TVMCompiler() = delete; + ~TVMCompiler() = default; + + TVMCompiler(std::string&& onnx_model_str, + const std::string& model_path, + int opset); + + ModulePtr operator()(const TvmEPOptions& options, + const TVMTensorShapes& input_shapes); + +private: + ModulePtr mod_; + std::string onnx_model_str_; + std::string model_path_; + int opset_; +}; + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_COMPILER_H diff --git a/onnxruntime/core/providers/tvm/tvm_defaults.h b/onnxruntime/core/providers/tvm/tvm_defaults.h index e7928a1941176..030a4ea05d56e 100644 --- a/onnxruntime/core/providers/tvm/tvm_defaults.h +++ b/onnxruntime/core/providers/tvm/tvm_defaults.h @@ -11,7 +11,7 @@ constexpr const char* default_executor_type = "vm"; constexpr const char* vm_executor_type = "vm"; constexpr const char* graph_executor_type = "graph"; -constexpr const char* default_target_str = "cpu"; +constexpr const char* default_target_str = "llvm"; constexpr const char* llvm_target_str = "llvm"; constexpr const char* cpu_target_str = "cpu"; diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.cc b/onnxruntime/core/providers/tvm/tvm_ep_options.cc new file mode 100644 index 0000000000000..6e2a077835d64 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.cc @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
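// --- Illustrative sketch (not part of the patch): intended use of the TVMCompiler
// introduced above. The functor compiles the serialized ONNX model through tvm::TVMCompile
// on the first call and returns the cached module on subsequent calls. The wrapper name is
// hypothetical; types follow the headers added in this patch.
#include <memory>
#include <string>
#include <utility>

#include "tvm_compiler.h"

namespace onnxruntime {
namespace tvm {

inline std::shared_ptr<TvmModule> ExampleCompileOnce(std::string onnx_model_str,
                                                     const std::string& model_path, int opset,
                                                     const TvmEPOptions& options,
                                                     const TVMTensorShapes& input_shapes) {
  TVMCompiler compiler(std::move(onnx_model_str), model_path, opset);
  // First call compiles and caches; a later call on the same object returns the cached module.
  return compiler(options, input_shapes);
}

}  // namespace tvm
}  // namespace onnxruntime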
+ +#include +#include + +#include "core/common/common.h" +#include "core/common/cpuid_info.h" +#include "core/framework/provider_options_utils.h" + +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +namespace provider_option_names { +constexpr const char* kExecutor = "executor"; +constexpr const char* kTarget = "target"; +constexpr const char* kTargetHost = "target_host"; +constexpr const char* kOptLevel = "opt_level"; +constexpr const char* kFreezeWeights = "freeze_weights"; +constexpr const char* kToNHWC = "to_nhwc"; +constexpr const char* kTuningFilePath = "tuning_file_path"; +constexpr const char* kTuningType = "tuning_type"; +constexpr const char* kInputNames = "input_names"; +constexpr const char* kInputShapes = "input_shapes"; + +static const std::unordered_set valid_keys { + std::string{kExecutor}, + std::string{kTarget}, + std::string{kTargetHost}, + std::string{kOptLevel}, + std::string{kFreezeWeights}, + std::string{kToNHWC}, + std::string{kTuningFilePath}, + std::string{kTuningType}, + std::string{kInputNames}, + std::string{kInputShapes} +}; + +} // namespace provider_option_names + +size_t split(const std::string &src, std::vector &dst, char ch) { + dst.clear(); + + size_t pos = src.find( ch ); + size_t initialPos = 0; + while( pos != std::string::npos ) { + dst.push_back( src.substr( initialPos, pos - initialPos ) ); + initialPos = pos + 1; + + pos = src.find( ch, initialPos ); + } + dst.push_back( src.substr( initialPos, std::min( pos, src.size() ) - initialPos + 1 ) ); + + return dst.size(); +} + +TvmEPOptions TvmEPOptionsHelper::FromOptionsString(const char* opt_str) { + std::string settings{opt_str}; + ProviderOptions options; + if (!settings.empty()) { + const std::string& str = settings; + + // tokenize settings + std::regex reg("\\s*,\\s*"); + std::sregex_token_iterator iter(str.begin(), str.end(), reg, -1); + std::sregex_token_iterator iter_end; + std::vector pairs(iter, iter_end); + + ORT_ENFORCE(pairs.size() > 0); + + for(const auto& pair : pairs) { + auto pos_colon = pair.find(':'); + ORT_ENFORCE(pos_colon != std::string::npos, "Invalid key value pair."); + std::string key = pair.substr(0, pos_colon); + std::string value = pair.substr(pos_colon + 1); + + // trim leading and trailing spaces from key/value + key = whitespace_trimming(key); + value = whitespace_trimming(value); + + // Check keys of obtained options + if (tvm::provider_option_names::valid_keys.count(key) == 0) { + ORT_NOT_IMPLEMENTED("TvmOptions: unknown option (", key, ")"); + } + + options[key] = value; + } + } + + return TvmEPOptionsHelper::FromProviderOptions(options); +} + +std::string TvmEPOptionsHelper::whitespace_trimming(const std::string& str) { + const std::string WHITESPACE = " \n\r\t\f\v"; + size_t start = str.find_first_not_of(WHITESPACE); + if (start == std::string::npos) { + return ""; + } else { + size_t end = str.find_last_not_of(WHITESPACE); + ORT_ENFORCE(end != std::string::npos); + return str.substr(start, end + 1); + } +} + +TvmEPOptions TvmEPOptionsHelper::FromProviderOptions(const ProviderOptions& pr_options) { + TvmEPOptions options{}; + + ORT_THROW_IF_ERROR( + ProviderOptionsParser{} + .AddAssignmentToReference(tvm::provider_option_names::kExecutor, options.executor) + .AddAssignmentToReference(tvm::provider_option_names::kTarget, options.target) + .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, options.target_host) + .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, options.opt_level) + 
.AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, options.freeze_weights) + .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, options.to_nhwc) + .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, options.tuning_file_path) + .AddAssignmentToReference(tvm::provider_option_names::kTuningType, options.tuning_type) + .AddAssignmentToReference(tvm::provider_option_names::kInputNames, options.input_names_str) + .AddAssignmentToReference(tvm::provider_option_names::kInputShapes, options.input_shapes_str) + .Parse(pr_options)); + + optionsPostprocess(options); + + return options; +} + +void TvmEPOptionsHelper::optionsPostprocess(TvmEPOptions& options) { + setInputShapes(options); + targetPostprocess(options.target); + targetHostPostprocess(options.target, options.target_host); + optLevelPostprocess(options.opt_level); +} + +bool TvmEPOptionsHelper::checkCPUTarget(const std::string& target) { + bool check = target.find("llvm") != std::string::npos; + return check; +} + +bool TvmEPOptionsHelper::checkGPUTarget(const std::string& target) { + bool check = ( + target.find("cuda") != std::string::npos || + target.find("opencl") != std::string::npos || + target.find("metal") != std::string::npos || + target.find("vulkan") != std::string::npos + ); + return check; +} + +void TvmEPOptionsHelper::setInputShapes(TvmEPOptions& options) { + if (options.input_names_str.empty() && options.input_shapes_str.empty()) + return; + ORT_ENFORCE(!options.input_names_str.empty() && !options.input_shapes_str.empty(), + "Both provider options \"input_names\" and \"input_shapes\" should be empty or full"); + + std::vector name_set; + std::string trimmed_names = whitespace_trimming(options.input_names_str); + size_t inp_tensors_num = split(trimmed_names, name_set, ' '); + ORT_ENFORCE(inp_tensors_num, "There is no any input tensor names!"); + + std::string trimmed_shapes = whitespace_trimming(options.input_shapes_str); + size_t end_pos = trimmed_shapes.find_last_of(']'); + ORT_ENFORCE(end_pos != std::string::npos, "Invalid string for input shapes. Symbol ] is not found"); + ORT_ENFORCE(end_pos == (trimmed_shapes.size() - 1), + "Invalid string for input shapes. 
Symbol ] should be last after whitespace trimming"); + + std::vector shape_set; + split(trimmed_shapes, shape_set, ']'); + shape_set.pop_back(); + ORT_ENFORCE( shape_set.size() == inp_tensors_num, + "Number of shapes is not the same as number of input tensor names"); + + for (size_t i = 0; i < inp_tensors_num; ++i) { + size_t pos = shape_set[i].find('['); + ORT_ENFORCE(pos != std::string::npos, "There is no symbol [ as pair for ]"); + std::string numbers = shape_set[i].substr(pos + 1); + std::vector number_set; + ORT_ENFORCE(split(numbers, number_set, ' '), "There is no any number between [ and ] symbols"); + + TensorShapeVector dims; + for(const auto& number : number_set) { + dims.push_back(std::stoi(number)); + } + + options.input_shapes[name_set[i]] = dims; + } +} + +void TvmEPOptionsHelper::targetPostprocess(std::string& target) { + if(target == tvm::cpu_target_str || + target == tvm::llvm_target_str) { + ProcessCPUTarget(target); + } else if(target == tvm::gpu_target_str) { + ProcessGPUTarget(); + } else if(target.empty()) { + ORT_NOT_IMPLEMENTED("target option is empty!"); + } else { + // TODO(vvchernov): extend mechanism of auto-definition of target + // target is gotten from option set up by client + } +} + +void TvmEPOptionsHelper::ProcessCPUTarget(std::string& target) { + const auto& cpu_id_info = CPUIDInfo::GetCPUIDInfo(); + // auto detect from CPU ID + if (cpu_id_info.HasAVX512Skylake()) { + target = tvm::cpu_targets::LLVM_TARGET_SKYLAKE_AVX512; + } else if (cpu_id_info.HasAVX512f()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX512; + } else if (cpu_id_info.HasAVX2()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX2; + } else if (cpu_id_info.HasAVX()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX; + } else { + // TODO(vvchernov): extend mechanism of auto-definition of cpu target + target = tvm::llvm_target_str; + } +} + +void TvmEPOptionsHelper::ProcessGPUTarget() { + ORT_NOT_IMPLEMENTED("GPU target auto-defenition is not implemented now!"); +} + +void TvmEPOptionsHelper::targetHostPostprocess(const std::string& target, std::string& target_host) { + if((target_host == tvm::cpu_target_str || + target_host == tvm::llvm_target_str) && + target_host != target) { + target_host = target; + } else if (target_host.empty()) { + target_host = target; + } else { + // TODO(vvchernov): extend mechanism of auto-definition of target host + // target host is gotten from option set up by client + } +} + +void TvmEPOptionsHelper::optLevelPostprocess(unsigned int& opt_level) { + if(opt_level < 1) { + opt_level = tvm::default_opt_level; + } +} + +std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options) { + out << "TVM EP options:\n" << + "executor type: " << options.executor << "\n" << + "target: " << options.target << "\n" << + "target_host: " << options.target_host << "\n" << + "opt level: " << options.opt_level << "\n" << + "freeze weights: " << options.freeze_weights << "\n" << + "tuning file path: " << options.tuning_file_path << "\n" << + "tuning type: " << options.tuning_type << "\n" << + "convert layout to NHWC: " << options.to_nhwc << "\n" << + "input tensor names: " << options.input_names_str << "\n" << + "input tensor shapes: " << options.input_shapes_str; + return out; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.h b/onnxruntime/core/providers/tvm/tvm_ep_options.h similarity index 52% rename from onnxruntime/core/providers/tvm/tvm_execution_provider_info.h rename to 
onnxruntime/core/providers/tvm/tvm_ep_options.h index b3c8932ab686f..7918b37a6bca4 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.h +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.h @@ -1,17 +1,20 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#ifndef TVM_EXECUTION_PROVIDER_INFO_H -#define TVM_EXECUTION_PROVIDER_INFO_H +#ifndef TVM_EXECUTION_PROVIDER_OPTIONS_H +#define TVM_EXECUTION_PROVIDER_OPTIONS_H #include #include #include +#include #include "core/framework/provider_options.h" +#include "core/framework/tensor_shape.h" #include "tvm_defaults.h" + namespace onnxruntime { namespace tvm { @@ -22,12 +25,13 @@ const std::string LLVM_TARGET_AVX2 = "llvm -mcpu=core-avx2"; const std::string LLVM_TARGET_SKYLAKE_AVX512 = "llvm -mcpu=skylake-avx512"; const std::string LLVM_TARGET_AVX512 = "llvm -mcpu=skylake-avx512"; } // namespace cpu_targets -} // namespace tvm -using TVMInputShapes = std::unordered_map>; +using TVMTensorShapes = std::vector; +using TVMInputShapes = std::unordered_map; +using InputsInfoMap = std::unordered_map; // Information needed to construct an TVM execution provider. -struct TvmExecutionProviderInfo { +struct TvmEPOptions { std::string executor{tvm::default_executor_type}; std::string target{tvm::default_target_str}; std::string target_host{tvm::default_target_str}; @@ -39,12 +43,31 @@ struct TvmExecutionProviderInfo { std::string input_names_str{""}; std::string input_shapes_str{""}; TVMInputShapes input_shapes{}; + TVMTensorShapes output_shapes{}; +}; + +std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options); +class TvmEPOptionsHelper { +public: + static TvmEPOptions FromOptionsString(const char* options); + static TvmEPOptions FromProviderOptions(const ProviderOptions& options); static std::string whitespace_trimming(const std::string& str); - static TvmExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); - static TvmExecutionProviderInfo FromOptionsString(const char* options); + + static bool checkCPUTarget(const std::string& target); + static bool checkGPUTarget(const std::string& target); + +private: + static void optionsPostprocess(TvmEPOptions& options); + static void setInputShapes(TvmEPOptions& options); + static void targetPostprocess(std::string& target); + static void ProcessCPUTarget(std::string& target); + static void ProcessGPUTarget(); + static void targetHostPostprocess(const std::string& target, std::string& target_host); + static void optLevelPostprocess(unsigned int& opt_level); }; +} // namespace tvm } // namespace onnxruntime -#endif // TVM_EXECUTION_PROVIDER_INFO_H +#endif // TVM_EXECUTION_PROVIDER_OPTIONS_H diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc b/onnxruntime/core/providers/tvm/tvm_execution_provider.cc index cafd1561f0a5f..12eae2262c435 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc +++ b/onnxruntime/core/providers/tvm/tvm_execution_provider.cc @@ -10,7 +10,6 @@ #include "core/framework/compute_capability.h" #include "core/platform/env.h" #include "core/graph/model.h" -#include "core/common/cpuid_info.h" #include "tvm_execution_provider.h" #include "xpu_data_transfer.h" @@ -22,239 +21,19 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { +namespace tvm { // Information to construct kernel function state. 
struct TVMFuncState { AllocateFunc allocate_func = nullptr; DestroyFunc release_func = nullptr; AllocatorHandle allocator = nullptr; - TvmModule* module = nullptr; - std::function>& input_shapes)> compiler = nullptr; + std::shared_ptr compiler = nullptr; }; -class TVMRunner { - public: - using TVMTensorShape = std::vector; - using TVMTensorShapes = std::vector; - using InputsInfoMap = std::map; - using ORTGraphNodes = std::vector; - - TVMRunner() = delete; - ~TVMRunner() = default; - - TVMRunner(TvmExecutionProvider* ep, - const std::string& name, - const Graph& graph) : - use_vm_(ep->info_.executor == "vm") { - // Extract input shapes - const ORTGraphNodes& all_nodes = graph.GetInputsIncludingInitializers(); - TVMTensorShapes input_shapes; - size_t indx = 0; - if (ep->info_.freeze_weights) { - for (const auto* node : all_nodes) { - const auto& node_name = node->Name(); - if(!graph.IsInitializedTensor(node_name)) { - TVMTensorShape ishape; - if(!ep->info_.input_shapes.empty() && - ep->info_.input_shapes.count(node_name)) { - ishape = ep->info_.input_shapes[node_name]; - inputs_info_[indx] = ishape; - update_output_shapes_ = true; - } else { - getTensorInfo(*node->Shape(), ishape, indx); - } - input_shapes.emplace_back(ishape); - } - ++indx; - } - } else { - for (const auto* node : all_nodes) { - const auto& node_name = node->Name(); - TVMTensorShape ishape; - if(!ep->info_.input_shapes.empty() && - ep->info_.input_shapes.count(node_name)) { - ishape = ep->info_.input_shapes[node_name]; - inputs_info_[indx++] = ishape; - update_output_shapes_ = true; - } else { - getTensorInfo(*node->Shape(), ishape, indx++); - } - if(!graph.IsInitializedTensor(node_name)) { - input_shapes.emplace_back(ishape); - } - } - } - - // Get module from tvm - mod_ = ep->CompileFunc(name, input_shapes); - - // Prepare draft for output tvm tensors - const ORTGraphNodes& ort_outputs_info = graph.GetOutputs(); - size_t num_outputs = ort_outputs_info.size(); - - if (update_output_shapes_) { - if (!use_vm_) { - tvm::TVMGetOutputShapes(*mod_, num_outputs, output_shapes_); - } - } else { - for (auto i = 0u; i < num_outputs; i++) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(*ort_outputs_info[i]->Shape()); - int dims = ort_shape.NumDimensions(); - - TVMTensorShape oshape(dims); - for (int j = 0; j < dims; ++j) { - oshape[j] = int64_t(ort_shape[j]); - } - output_shapes_.emplace_back(oshape); - } - } - - for (auto i = 0u; i < num_outputs; i++) { - DLTensor t; - // Draft for tensor, correct data is defined during inference - t.strides = nullptr; - t.byte_offset = 0; - t.data = nullptr; - if (!(use_vm_ && update_output_shapes_)) { - t.ndim = output_shapes_[i].size(); - t.shape = output_shapes_[i].data(); - } else { - t.ndim = 0; - t.shape = nullptr; - } - - tensors_outputs_.push_back(t); - } - } - - common::Status operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { - Ort::CustomOpApi ort{*api}; - - size_t num = inputs_info_.size(); - std::vector inds(num); - std::vector dl_tensors_inputs(num); - size_t counter = 0u; - for (auto& info : inputs_info_) { - // TODO(vvchernov): decomposition declaration only available with -std=c++1z or -std=gnu++1z - auto& i = info.first; - auto& shape = info.second; - const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i); - ORT_ENFORCE(input_tensor->IsTensor()); - const Tensor& tensor = input_tensor->Get(); - const OrtDevice& device = tensor.Location().device; - auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); - 
auto tensor_type = ort.GetTensorElementType(tensor_info); - if (!update_output_shapes_) { - std::vector ort_shape = ort.GetTensorShape(tensor_info); - ORT_ENFORCE(compare_shapes(shape, ort_shape)); - } - ort.ReleaseTensorTypeAndShapeInfo(tensor_info); - - DLTensor t; - t.device = GetDLDevice(device); - t.dtype = GetDataType(tensor_type); - t.strides = nullptr; - t.byte_offset = 0; - t.data = const_cast(ort.GetTensorData(input_tensor)); - t.ndim = shape.size(); - t.shape = shape.data(); - dl_tensors_inputs[counter] = t; - inds[counter++] = i; - } - if (use_vm_) { - tvm::TVM_VM_SetInputs(*mod_, inds, dl_tensors_inputs); - // Infer once for calculating of output shapes - if(!probe_infer_) { - tvm::TVM_VM_Run(*mod_); - size_t num_outputs = tensors_outputs_.size(); - tvm::TVMGetOutputShapes(*mod_, num_outputs, output_shapes_); - for (size_t i = 0; i < num_outputs; ++i) { - tensors_outputs_[i].ndim = output_shapes_[i].size(); - tensors_outputs_[i].shape = output_shapes_[i].data(); - } - probe_infer_ = true; - } - } else { - tvm::TVMSetInputs(*mod_, inds, dl_tensors_inputs); - } - - size_t num_outputs = tensors_outputs_.size(); - for (auto i = 0u; i < num_outputs; i++) { - //setup output tensor property - OrtValue* output_tensor = ort.KernelContext_GetOutput(context, - i, - output_shapes_[i].data(), - output_shapes_[i].size()); - ORT_ENFORCE(output_tensor->IsTensor()); - const Tensor& tensor = output_tensor->Get(); - const OrtDevice& device = tensor.Location().device; - auto tensor_info = ort.GetTensorTypeAndShape(output_tensor); - auto tensor_type = ort.GetTensorElementType(tensor_info); - ort.ReleaseTensorTypeAndShapeInfo(tensor_info); - - tensors_outputs_[i].device = GetDLDevice(device); - tensors_outputs_[i].dtype = GetDataType(tensor_type); - tensors_outputs_[i].data = ort.GetTensorMutableData(output_tensor); - } - - if (use_vm_) { - tvm::TVM_VM_Run(*mod_); - tvm::TVM_VM_GetOutputs(*mod_, tensors_outputs_); - } else { - tvm::TVMRun(*mod_); - tvm::TVMGetOutputs(*mod_, tensors_outputs_); - } - - return Status::OK(); - } - private: - void getTensorInfo(const TensorShapeProto& shape_proto, - TVMTensorShape& ishape, - size_t indx) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); - int dims = ort_shape.NumDimensions(); - - ishape.resize(dims); - for (int j = 0; j < dims; ++j) { - int64_t dim = int64_t(ort_shape[j]); - ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). 
" + - "Please use provider options to setup input_names and input_shapes"); - ishape[j] = dim; - } - inputs_info_[indx] = ishape; - } - - bool compare_shapes(const TVMTensorShape& shape1, const TVMTensorShape& shape2) { - size_t size = shape1.size(); - if (shape2.size() == size) { - for (size_t i = 0; i < size; ++i) { - if(shape1[i] != shape2[i]) { - return false; - } - } - } else { - return false; - } - - return true; - } - - private: - TvmModule* mod_; - bool use_vm_ = true; - bool probe_infer_ = false; - InputsInfoMap inputs_info_{}; - bool update_output_shapes_ = false; - TVMTensorShapes output_shapes_; - std::vector tensors_outputs_; -}; - -TvmExecutionProvider::TvmExecutionProvider(const TvmExecutionProviderInfo& info) +TvmExecutionProvider::TvmExecutionProvider(const TvmEPOptions& options) : IExecutionProvider{kTvmExecutionProvider}, - info_{info} { - ProcessInfo(); - + options_{options} { AllocatorCreationInfo default_memory_info = {[](int) { return std::make_unique(); }, @@ -273,10 +52,6 @@ TvmExecutionProvider::TvmExecutionProvider(const TvmExecutionProviderInfo& info) TvmExecutionProvider::~TvmExecutionProvider() {} -AllocatorPtr TvmExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { - return allocator_; -} - std::vector> TvmExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const std::vector& /*kernel_registries*/) const { @@ -327,8 +102,8 @@ TvmExecutionProvider::GetCapability(const GraphViewer& graph_viewer, } common::Status TvmExecutionProvider::Compile(const std::vector& nodes, - std::vector& node_compute_funcs) { - PrintProviderOptions(); + std::vector& node_compute_funcs) { + printOptions(); for (auto* fused_node : nodes) { auto func_body = fused_node->GetFunctionBody(); if (!func_body) @@ -345,32 +120,28 @@ common::Status TvmExecutionProvider::Compile(const std::vector& nodes, opset->set_domain(kOnnxDomain); opset->set_version(node_graph.DomainToVersionMap().at(kOnnxDomain)); - std::string string_buf; - model_proto.SerializeToString(&string_buf); - buffers_[func_name] = string_buf; - opsets_[func_name] = int(opset->version()); - model_paths_[func_name] = fused_node->ModelPath().ToPathString();; + std::string onnx_model_str; + model_proto.SerializeToString(&onnx_model_str); + compilers_[func_name] = std::make_shared(std::move(onnx_model_str), + fused_node->ModelPath().ToPathString(), + int(opset->version())); + InputsInfoMap all_input_shapes; + auto mod = compileModel(func_name, node_graph, all_input_shapes); + + std::vector output_tensors; + prepareOutputTensors(mod, output_tensors, node_graph.GetOutputs().size()); + + runners_[func_name] = std::make_shared(options_, mod, all_input_shapes, output_tensors); if (dump_subgraphs_) { - std::fstream dump("/tmp/" + fused_node->Name() + ".onnx", + std::fstream dump("/tmp/" + func_name + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary); model_proto.SerializeToOstream(&dump); } - NodeComputeInfo compute_info; - compute_info.create_state_func = std::bind(&TvmExecutionProvider::CreateStateFunc, - this, - std::placeholders::_1, - std::placeholders::_2); - - compute_info.release_state_func = [](FunctionState state) { - if (state) - delete static_cast(state); - }; // TODO(vvchernov): implement ops checking and mechanism of gracefully passing the responsibility to other EPs // if the checking fails due to unsupported op(s) - runners_[func_name] = std::make_shared(this, func_name, node_graph); - compute_info.compute_func = *runners_[func_name].get(); + NodeComputeInfo compute_info = 
prepareComputeInfo(func_name); node_compute_funcs.push_back(compute_info); } @@ -378,182 +149,156 @@ common::Status TvmExecutionProvider::Compile(const std::vector& nodes, } std::unique_ptr TvmExecutionProvider::GetDataTransfer() const { - if (GPUTargetCheck()) { - return std::make_unique(); - } else if (info_.target.find("llvm") != std::string::npos) { - return std::make_unique(); + //TODO(vvchernov): target or target host? + if (TvmEPOptionsHelper::checkGPUTarget(options_.target)) { + return std::make_unique(); + } else if (TvmEPOptionsHelper::checkCPUTarget(options_.target)) { + return std::make_unique(); } else { - ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", info_.target); + ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", options_.target); } } -bool TvmExecutionProvider::GPUTargetCheck() const { - //TODO(vvchernov): target or target host? - bool check = ( - info_.target.find("cuda") != std::string::npos || - info_.target.find("opencl") != std::string::npos || - info_.target.find("metal") != std::string::npos || - info_.target.find("vulkan") != std::string::npos - ); - return check; +AllocatorPtr TvmExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { + return allocator_; } -size_t TvmExecutionProvider::split(const std::string &txt, std::vector &strs, char ch) const { - size_t pos = txt.find( ch ); - size_t initialPos = 0; - strs.clear(); +void TvmExecutionProvider::printOptions() { + LOGS(*GetLogger(), INFO) << options_; +} - while( pos != std::string::npos ) { - strs.push_back( txt.substr( initialPos, pos - initialPos ) ); - initialPos = pos + 1; +std::shared_ptr TvmExecutionProvider::compileModel(const std::string& func_name, + const Graph& graph, + InputsInfoMap& all_input_shapes) { + all_input_shapes.clear(); - pos = txt.find( ch, initialPos ); - } + TVMTensorShapes input_shapes; + if (options_.freeze_weights) { + setInputShapesForFreezedNN(graph, input_shapes, all_input_shapes); + } else { + setInputShapesForUnfreezedNN(graph, input_shapes, all_input_shapes); + } - strs.push_back( txt.substr( initialPos, std::min( pos, txt.size() ) - initialPos + 1 ) ); + std::shared_ptr mod = compilers_[func_name]->operator()(options_, input_shapes); - return strs.size(); + return mod; } -void TvmExecutionProvider::ProcessInfo() { - if(!info_.input_shapes_str.empty()) { - ORT_ENFORCE(!info_.input_names_str.empty(), - "Please insert input tensor names. Input shapes only is invalid case"); - // Parse strings and set to input_shapes map - std::vector tmp_strs; - std::vector names_strs; - - std::string names_str = TvmExecutionProviderInfo::whitespace_trimming(info_.input_names_str); - std::string shapes_str = TvmExecutionProviderInfo::whitespace_trimming(info_.input_shapes_str); - - ORT_ENFORCE(split(names_str, names_strs, ' '), "There is no any input tensor names!"); - size_t inp_tensors_num = names_strs.size(); - - size_t end_pos = shapes_str.find_last_of(']'); - ORT_ENFORCE(end_pos != std::string::npos, "Invalid string for input shapes. Symbol ] is not found"); - ORT_ENFORCE(end_pos == (shapes_str.size() - 1), - "Invalid string for input shapes. 
Symbol ] should be last after whitespace trimming"); - split(shapes_str, tmp_strs, ']'); - tmp_strs.pop_back(); - ORT_ENFORCE( tmp_strs.size() == inp_tensors_num, - "Number of shapes is not the same as number of input tensor names"); - for (size_t i = 0; i < inp_tensors_num; ++i) { - size_t pos = tmp_strs[i].find('['); - ORT_ENFORCE(pos != std::string::npos, "There is no symbol [ as pair for ]"); - std::string nums_str = tmp_strs[i].substr(pos + 1); - std::vector nums_strs; - ORT_ENFORCE(split(nums_str, nums_strs, ' '), "There is no any numbers between [ and ] symbols"); - std::vector dims; - for(const auto& num_str : nums_strs) { - dims.push_back(std::stoi(num_str)); - } - - info_.input_shapes[names_strs[i]] = dims; +void TvmExecutionProvider::setInputShapesForFreezedNN(const Graph& graph, + TVMTensorShapes& input_shapes, + InputsInfoMap& all_input_shapes) { + const std::vector& all_nodes = graph.GetInputsIncludingInitializers(); + + size_t indx = 0; + for (const auto* node : all_nodes) { + if(!graph.IsInitializedTensor(node->Name())) { + TensorShapeVector shape = getInputShape(node); + all_input_shapes[indx++] = shape; + input_shapes.emplace_back(shape); } } +} - if(info_.target == tvm::cpu_target_str || - info_.target == tvm::llvm_target_str) { - ProcessCPUTarget(); - } else if(info_.target == tvm::gpu_target_str) { - ProcessGPUTarget(); - } else if(info_.target.empty()) { - ORT_NOT_IMPLEMENTED("target option is empty!"); - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target - // target is gotten from option set up by client +void TvmExecutionProvider::setInputShapesForUnfreezedNN(const Graph& graph, + TVMTensorShapes& input_shapes, + InputsInfoMap& all_input_shapes) { + const std::vector& all_nodes = graph.GetInputsIncludingInitializers(); + + size_t indx = 0; + for (const auto* node : all_nodes) { + TensorShapeVector shape = getInputShape(node); + all_input_shapes[indx++] = shape; + if(!graph.IsInitializedTensor(node->Name())) { + input_shapes.emplace_back(shape); + } } +} - if((info_.target_host == tvm::cpu_target_str || - info_.target_host == tvm::llvm_target_str) && - info_.target_host != info_.target) { - info_.target_host = info_.target; - } else if (info_.target_host.empty()) { - info_.target_host = info_.target; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target host - // target host is gotten from option set up by client - } +TensorShapeVector TvmExecutionProvider::getInputShape(const NodeArg* node) { + TensorShapeVector shape; + const auto& node_name = node->Name(); + if(!options_.input_shapes.empty() && + options_.input_shapes.count(node_name)) { + shape = options_.input_shapes[node_name]; + } else { + shape = convertTensorShape(*node->Shape()); + } - if(info_.opt_level < 1) { - info_.opt_level = tvm::default_opt_level; - } + return shape; } -void TvmExecutionProvider::ProcessCPUTarget() { - const auto& cpu_id_info = CPUIDInfo::GetCPUIDInfo(); - // auto detect from CPU ID - if (cpu_id_info.HasAVX512Skylake()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_SKYLAKE_AVX512; - } else if (cpu_id_info.HasAVX512f()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX512; - } else if (cpu_id_info.HasAVX2()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX2; - } else if (cpu_id_info.HasAVX()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of cpu target - info_.target = tvm::llvm_target_str; +TensorShapeVector 
TvmExecutionProvider::convertTensorShape(const TensorShapeProto& shape_proto) { + TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); + size_t dims = ort_shape.NumDimensions(); + + TensorShapeVector shape(dims); + for (size_t j = 0; j < dims; ++j) { + int64_t dim = int64_t(ort_shape[j]); + ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). " + + "Please use provider options to setup input_names and input_shapes"); + shape[j] = dim; } + + return shape; } -void TvmExecutionProvider::ProcessGPUTarget() { - ORT_NOT_IMPLEMENTED("GPU target auto-defenition is not implemented now!"); +void TvmExecutionProvider::prepareOutputTensors(const std::shared_ptr& mod, + std::vector& output_tensors, + size_t num) { + ORT_ENFORCE(mod != nullptr, "TVM module is not compiled"); + output_tensors.clear(); + options_.output_shapes.clear(); + options_.output_shapes.resize(num); + + if (options_.executor != "vm") { + tvm::TVMGetOutputShapes(*mod, options_.output_shapes); + } + + for (auto& output_shape : options_.output_shapes) { + DLTensor t; + // Draft for tensor, correct data is defined during inference + t.strides = nullptr; + t.byte_offset = 0; + t.data = nullptr; + if (options_.executor == "vm") { + t.ndim = 0; + t.shape = nullptr; + } else { + t.ndim = output_shape.size(); + t.shape = output_shape.data(); + } + + output_tensors.push_back(t); + } } -void TvmExecutionProvider::PrintProviderOptions() const { - LOGS(*GetLogger(), INFO) << "TVM EP options:\n" << - "executor type: " << info_.executor << "\n" << - "target: " << info_.target << "\n" << - "target_host: " << info_.target_host << "\n" << - "opt level: " << info_.opt_level << "\n" << - "freeze weights: " << info_.freeze_weights << "\n" << - "tuning file path: " << info_.tuning_file_path << "\n" << - "tuning type: " << info_.tuning_type << "\n" << - "convert layout to NHWC: " << info_.to_nhwc << "\n" << - "input tensor names: " << info_.input_names_str << "\n" << - "input tensor shapes: " << info_.input_shapes_str; +NodeComputeInfo TvmExecutionProvider::prepareComputeInfo(const std::string& func_name) { + NodeComputeInfo compute_info; + compute_info.create_state_func = std::bind(&TvmExecutionProvider::createStateFunc, + this, + std::placeholders::_1, + std::placeholders::_2); + + compute_info.release_state_func = [](FunctionState state) { + if (state) + delete static_cast(state); + }; + + compute_info.compute_func = *runners_[func_name].get(); + + return compute_info; } -int TvmExecutionProvider::CreateStateFunc(ComputeContext* context, FunctionState* state) { +int TvmExecutionProvider::createStateFunc(ComputeContext* context, FunctionState* state) { auto* state_ptr = new TVMFuncState(); *state_ptr = {context->allocate_func, - context->release_func, - context->allocator_handle, - nullptr, - std::bind(&TvmExecutionProvider::CompileFunc, - this, - std::placeholders::_1, - std::placeholders::_2)}; + context->release_func, + context->allocator_handle, + compilers_[context->node_name]}; + // TODO(vvchernov): Who and when release state? 
*state = state_ptr; return 0; } -TvmModule* TvmExecutionProvider::CompileFunc(std::string func_name, - const TVMTensorShapes& input_shapes) { - if (modules_.count(func_name)) { - return modules_[func_name].get(); - } - - TvmModule mod_f = tvm::TVMCompile(buffers_[func_name], - model_paths_[func_name], - info_.executor, - info_.target, - info_.target_host, - info_.opt_level, - opsets_[func_name], - info_.freeze_weights, - input_shapes, - info_.to_nhwc, - info_.tuning_file_path, - info_.tuning_type); - auto module_ptr = std::make_shared(); - *module_ptr = mod_f; - modules_[func_name] = module_ptr; - // Release memory after module generation - buffers_.erase(func_name); - opsets_.erase(func_name); - return modules_[func_name].get(); -} - +} // namespace tvm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_execution_provider.h index 6a5d2a2b4c6ac..9d891ee292976 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.h +++ b/onnxruntime/core/providers/tvm/tvm_execution_provider.h @@ -13,28 +13,27 @@ #include "core/framework/execution_provider.h" #include "core/platform/ort_mutex.h" -#include "tvm_common.h" -#include "tvm_execution_provider_info.h" +#include "tvm_compiler.h" +#include "tvm_runner.h" -namespace onnxruntime { +namespace onnxruntime { + class Graph; + class NodeArg; namespace tvm { + namespace env_vars { static const std::string kDumpSubgraphs = "ORT_TVM_DUMP_SUBGRAPHS"; } // namespace env_vars -} // namespace tvm - -class TVMRunner; class TvmExecutionProvider : public IExecutionProvider { - friend TVMRunner; + using Compiler = tvm::TVMCompiler; + using Compilers = std::unordered_map>; + using Runner = tvm::TVMRunner; + using Runners = std::unordered_map>; - using TVMTensorShape = std::vector; - using TVMTensorShapes = std::vector; - using TVMRunners = std::unordered_map>; - using TVMModules = std::unordered_map>; public: - explicit TvmExecutionProvider(const TvmExecutionProviderInfo& info); + explicit TvmExecutionProvider(const TvmEPOptions& options); virtual ~TvmExecutionProvider(); std::vector> @@ -47,27 +46,27 @@ class TvmExecutionProvider : public IExecutionProvider { AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const override; private: - bool GPUTargetCheck() const; - size_t split(const std::string &txt, std::vector &strs, char ch) const; - void ProcessInfo(); - void ProcessCPUTarget(); - void ProcessGPUTarget(); - void PrintProviderOptions() const; - // Bindings for compute info - int CreateStateFunc(ComputeContext*, FunctionState*); - TvmModule* CompileFunc(std::string func_name, const TVMTensorShapes& input_shapes); + void printOptions(); + std::shared_ptr compileModel(const std::string& func_name, + const Graph& graph, + InputsInfoMap& inputs_info); + void setInputShapesForFreezedNN(const Graph& graph, TVMTensorShapes& input_shapes, InputsInfoMap& all_input_shapes); + void setInputShapesForUnfreezedNN(const Graph& graph, TVMTensorShapes& input_shapes, InputsInfoMap& all_input_shapes); + TensorShapeVector getInputShape(const NodeArg* node); + TensorShapeVector convertTensorShape(const ONNX_NAMESPACE::TensorShapeProto& shape_proto); + void prepareOutputTensors(const std::shared_ptr& mod, std::vector& output_tensors, size_t num); + NodeComputeInfo prepareComputeInfo(const std::string& func_name); + int createStateFunc(ComputeContext*, FunctionState*); private: - TVMRunners runners_; - std::unordered_map buffers_; - std::unordered_map opsets_; - std::unordered_map 
model_paths_; + TvmEPOptions options_; + Compilers compilers_; + Runners runners_; bool dump_subgraphs_ = false; OrtMutex tvm_mu_; AllocatorPtr allocator_; - TvmExecutionProviderInfo info_; - TVMModules modules_; }; +} // namespace tvm } // namespace onnxruntime #endif // TVM_EXECUTION_PROVIDER_H diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc b/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc deleted file mode 100644 index 2bb2d1d6923e3..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include - -#include "core/common/common.h" -#include "core/framework/provider_options_utils.h" - -#include "tvm_execution_provider_info.h" - - -namespace onnxruntime { -namespace tvm { -namespace provider_option_names { -constexpr const char* kExecutor = "executor"; -constexpr const char* kTarget = "target"; -constexpr const char* kTargetHost = "target_host"; -constexpr const char* kOptLevel = "opt_level"; -constexpr const char* kFreezeWeights = "freeze_weights"; -constexpr const char* kToNHWC = "to_nhwc"; -constexpr const char* kTuningFilePath = "tuning_file_path"; -constexpr const char* kTuningType = "tuning_type"; -constexpr const char* kInputNames = "input_names"; -constexpr const char* kInputShapes = "input_shapes"; - -static const std::unordered_set valid_keys { - std::string{kExecutor}, - std::string{kTarget}, - std::string{kTargetHost}, - std::string{kOptLevel}, - std::string{kFreezeWeights}, - std::string{kToNHWC}, - std::string{kTuningFilePath}, - std::string{kTuningType}, - std::string{kInputNames}, - std::string{kInputShapes} -}; - -} // namespace provider_option_names -} // namespace tvm - -std::string TvmExecutionProviderInfo::whitespace_trimming(const std::string& str) { - const std::string WHITESPACE = " \n\r\t\f\v"; - size_t start = str.find_first_not_of(WHITESPACE); - if (start == std::string::npos) { - return ""; - } else { - size_t end = str.find_last_not_of(WHITESPACE); - ORT_ENFORCE(end != std::string::npos); - return str.substr(start, end + 1); - } -} - -TvmExecutionProviderInfo TvmExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { - TvmExecutionProviderInfo info{}; - - ORT_THROW_IF_ERROR( - ProviderOptionsParser{} - .AddAssignmentToReference(tvm::provider_option_names::kExecutor, info.executor) - .AddAssignmentToReference(tvm::provider_option_names::kTarget, info.target) - .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, info.target_host) - .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, info.opt_level) - .AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, info.freeze_weights) - .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, info.to_nhwc) - .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, info.tuning_file_path) - .AddAssignmentToReference(tvm::provider_option_names::kTuningType, info.tuning_type) - .AddAssignmentToReference(tvm::provider_option_names::kInputNames, info.input_names_str) - .AddAssignmentToReference(tvm::provider_option_names::kInputShapes, info.input_shapes_str) - .Parse(options)); - - return info; -} - -TvmExecutionProviderInfo TvmExecutionProviderInfo::FromOptionsString(const char* opt_str) { - std::string settings{opt_str}; - ProviderOptions options; - if (!settings.empty()) { - const std::string& str = settings; - - // tokenize settings 
- std::regex reg("\\s*,\\s*"); - std::sregex_token_iterator iter(str.begin(), str.end(), reg, -1); - std::sregex_token_iterator iter_end; - std::vector pairs(iter, iter_end); - - ORT_ENFORCE(pairs.size() > 0); - - for(const auto& pair : pairs) { - auto pos_colon = pair.find(':'); - ORT_ENFORCE(pos_colon != std::string::npos, "Invalid key value pair."); - std::string key = pair.substr(0, pos_colon); - std::string value = pair.substr(pos_colon + 1); - - // trim leading and trailing spaces from key/value - key = whitespace_trimming(key); - value = whitespace_trimming(value); - - // Check keys of obtained options - if (tvm::provider_option_names::valid_keys.count(key) == 0) { - ORT_NOT_IMPLEMENTED("TvmOptions: unknown option (", key, ")"); - } - - options[key] = value; - } - } - - return FromProviderOptions(options); -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc b/onnxruntime/core/providers/tvm/tvm_provider_factory.cc index b63077e3b311c..bcfeb637bd461 100644 --- a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc +++ b/onnxruntime/core/providers/tvm/tvm_provider_factory.cc @@ -13,32 +13,31 @@ namespace onnxruntime { struct TvmProviderFactory : IExecutionProviderFactory { - TvmProviderFactory(const TvmExecutionProviderInfo& info) : info_{info} {} + TvmProviderFactory(const tvm::TvmEPOptions& options) : options_{options} {} ~TvmProviderFactory() = default; std::unique_ptr CreateProvider() override { - return std::make_unique(info_); + return std::make_unique(options_); } - private: - TvmExecutionProviderInfo info_; +private: + tvm::TvmEPOptions options_; }; -std::shared_ptr CreateExecutionProviderFactory_Tvm(const char* settings) { - TvmExecutionProviderInfo info = TvmExecutionProviderInfo::FromOptionsString(settings); - return std::make_shared(info); +std::shared_ptr CreateExecutionProviderFactory_Tvm(const char* opt_str) { + tvm::TvmEPOptions options = tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); + return std::make_shared(options); } -std::shared_ptr CreateExecutionProviderFactory_Tvm(const TvmExecutionProviderInfo& info) -{ - return std::make_shared(info); +std::shared_ptr CreateExecutionProviderFactory_Tvm(const tvm::TvmEPOptions& options) { + return std::make_shared(options); } } // namespace onnxruntime ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, - _In_ const char* settings) { - onnxruntime::TvmExecutionProviderInfo info = onnxruntime::TvmExecutionProviderInfo::FromOptionsString(settings); - options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Tvm(info)); + _In_ const char* opt_str) { + onnxruntime::tvm::TvmEPOptions tvm_options = onnxruntime::tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); + options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Tvm(tvm_options)); return nullptr; } diff --git a/onnxruntime/core/providers/tvm/tvm_runner.cc b/onnxruntime/core/providers/tvm/tvm_runner.cc new file mode 100644 index 0000000000000..117ecea680ea5 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner.cc @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
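For reference, the factory change above keeps the options-string interface of OrtSessionOptionsAppendExecutionProvider_Tvm: a comma-separated list of "key: value" pairs parsed by tvm::TvmEPOptionsHelper::FromOptionsString, where input_names is space-separated and input_shapes uses bracketed dimension lists. A hedged sketch of registering the EP follows; the include paths and the concrete option values are examples only.

// Sketch: registering the TVM EP through the C API with an options string.
// Option keys come from tvm::provider_option_names; the values below are examples.
#include "core/session/onnxruntime_c_api.h"           // OrtSessionOptions / OrtStatus
#include "core/providers/tvm/tvm_provider_factory.h"  // assumption: declares OrtSessionOptionsAppendExecutionProvider_Tvm

void AppendTvmEP(OrtSessionOptions* session_options) {
  const char* tvm_options =
      "executor: vm, "
      "target: llvm, "
      "opt_level: 3, "
      "freeze_weights: 1, "
      "input_names: data mask, "
      "input_shapes: [1 3 224 224] [1 224]";

  // Parsed internally by TvmEPOptionsHelper::FromOptionsString(); unknown keys
  // are rejected with ORT_NOT_IMPLEMENTED.
  OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Tvm(session_options, tvm_options);
  (void)status;  // a real caller should check and release a non-null status
}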
+ +#include "core/graph/model.h" +#include "core/framework/tensorprotoutils.h" + +#include "tvm_runner.h" + + +using namespace ONNX_NAMESPACE; +namespace onnxruntime { +namespace tvm { + +TVMRunner::TVMRunner(const TvmEPOptions& options, + const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const std::vector& output_tensors) { + runner_ = getTVMRunnerImpl(mod, options, inputs_info, output_tensors); +} + +common::Status TVMRunner::operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { + return runner_->run(api, context); +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner.h b/onnxruntime/core/providers/tvm/tvm_runner.h new file mode 100644 index 0000000000000..85d37ccec1042 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef TVM_RUNNER_H +#define TVM_RUNNER_H + +#include +#include + +#include "tvm_runner_impl.h" + + +namespace onnxruntime { +namespace tvm { + +class TVMRunner { +public: + TVMRunner() = delete; + virtual ~TVMRunner() = default; + + TVMRunner(const TvmEPOptions& options, + const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const std::vector& output_tensor); + + common::Status operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context); + +private: + std::shared_ptr runner_; +}; + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_TVM_RUNNER_H diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc new file mode 100644 index 0000000000000..bade84b6803f3 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc @@ -0,0 +1,165 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
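For reference, TVMRunner is the callable ORT invokes for each fused TVM node: TvmExecutionProvider::Compile creates one per function name and installs it directly as the node's compute function, delegating the executor-specific work to the RunnerImpl selected at construction. A condensed sketch of that wiring follows; the AddComputeFunc wrapper and the execution_provider.h include are assumptions, and the state-function setup done by prepareComputeInfo is omitted.

// Condensed sketch of the wiring performed in TvmExecutionProvider::Compile in this patch;
// create_state_func/release_state_func setup and error handling are omitted.
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "core/framework/execution_provider.h"  // assumption: declares NodeComputeInfo
#include "tvm_runner.h"

namespace ort_tvm = onnxruntime::tvm;

void AddComputeFunc(std::unordered_map<std::string, std::shared_ptr<ort_tvm::TVMRunner>>& runners,
                    const std::string& func_name,
                    const ort_tvm::TvmEPOptions& options,
                    const std::shared_ptr<ort_tvm::TvmModule>& mod,
                    const ort_tvm::InputsInfoMap& input_shapes,
                    const std::vector<DLTensor>& output_tensors,
                    std::vector<onnxruntime::NodeComputeInfo>& node_compute_funcs) {
  runners[func_name] = std::make_shared<ort_tvm::TVMRunner>(options, mod, input_shapes, output_tensors);

  onnxruntime::NodeComputeInfo compute_info;
  // TVMRunner::operator()(FunctionState, const OrtCustomOpApi*, OrtKernelContext*)
  // matches compute_func's signature, so the runner object is copied in directly.
  compute_info.compute_func = *runners[func_name].get();
  node_compute_funcs.push_back(compute_info);
}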
+ +#include "core/framework/tensorprotoutils.h" + +#include "tvm_runner_impl.h" +#include "tvm_utils.h" +#include "tvm_api.h" + + +namespace onnxruntime { +namespace tvm { + +/* ------------------------------------ RunnerImplFactory ----------------------------- */ + +std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, + const TvmEPOptions& options, + const InputsInfoMap& inputs_info, + const std::vector output_tensors) { + const std::string& name = options.executor; + if (name == "graph") { + return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + } else if (name == "vm") { + return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + } + return nullptr; +} + +/* ------------------------------------ RunnerImpl ------------------------------------ */ + +RunnerImpl::RunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + mod_(mod), + inputs_info_(inputs_info), + output_shapes_(output_shapes), + output_tensors_(output_tensors) { +} + +void RunnerImpl::convert_input_tensors2dl_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context, + std::vector& dst, + std::vector& dst_inds) { + size_t num = inputs_info_.size(); + dst.reserve(num); + dst_inds.reserve(num); + for (auto& info : inputs_info_) { + // TODO(vvchernov): decomposition declaration only available with -std=c++1z or -std=gnu++1z + auto& i = info.first; + auto& shape = info.second; + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i); + ORT_ENFORCE(input_tensor->IsTensor()); + const Tensor& tensor = input_tensor->Get(); + const OrtDevice& device = tensor.Location().device; + auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); + auto tensor_type = ort.GetTensorElementType(tensor_info); + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + + DLTensor t; + t.device = GetDLDevice(device); + t.dtype = GetDataType(tensor_type); + t.strides = nullptr; + t.byte_offset = 0; + t.data = const_cast(ort.GetTensorData(input_tensor)); + t.ndim = shape.size(); + t.shape = shape.data(); + dst.emplace_back(t); + dst_inds.push_back(i); + } +} + +void RunnerImpl::add_device_type_data2output_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context) { + size_t num_outputs = output_tensors_.size(); + for (auto i = 0u; i < num_outputs; i++) { + //setup output tensor property + OrtValue* output_tensor = ort.KernelContext_GetOutput(context, + i, + output_shapes_[i].data(), + output_shapes_[i].size()); + ORT_ENFORCE(output_tensor->IsTensor()); + const Tensor& tensor = output_tensor->Get(); + const OrtDevice& device = tensor.Location().device; + auto tensor_info = ort.GetTensorTypeAndShape(output_tensor); + auto tensor_type = ort.GetTensorElementType(tensor_info); + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + + output_tensors_[i].device = GetDLDevice(device); + output_tensors_[i].dtype = GetDataType(tensor_type); + output_tensors_[i].data = ort.GetTensorMutableData(output_tensor); + } +} + +/* ------------------------------------ GERunnerImpl ------------------------------------ */ + +GERunnerImpl::GERunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { +} + +void GERunnerImpl::set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) { + std::vector inds; + std::vector dl_tensors_inputs; + 
convert_input_tensors2dl_tensors(ort, context, dl_tensors_inputs, inds); + + tvm::TVMSetInputs(*mod_, inds, dl_tensors_inputs); +} + +void GERunnerImpl::connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) { + add_device_type_data2output_tensors(ort, context); +} + +void GERunnerImpl::run_and_get_output() { + tvm::TVMRun(*mod_); + tvm::TVMGetOutputs(*mod_, output_tensors_); +} + +/* ------------------------------------ VMRunnerImpl ------------------------------------ */ + +VMRunnerImpl::VMRunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { +} + +void VMRunnerImpl::set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) { + std::vector inds; + std::vector dl_tensors_inputs; + convert_input_tensors2dl_tensors(ort, context, dl_tensors_inputs, inds); + + tvm::TVM_VM_SetInputs(*mod_, inds, dl_tensors_inputs); +} + +void VMRunnerImpl::connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) { + if(!probe_infer_) { + infer_once_to_get_output_shapes(); + } + + add_device_type_data2output_tensors(ort, context); +} + +void VMRunnerImpl::run_and_get_output() { + tvm::TVM_VM_Run(*mod_); + tvm::TVM_VM_GetOutputs(*mod_, output_tensors_); +} + +void VMRunnerImpl::infer_once_to_get_output_shapes() { + tvm::TVM_VM_Run(*mod_); + size_t num_outputs = output_tensors_.size(); + // TODO(vvchernov): check it + output_shapes_.resize(num_outputs); + tvm::TVMGetOutputShapes(*mod_, output_shapes_); + for (size_t i = 0; i < num_outputs; ++i) { + output_tensors_[i].ndim = output_shapes_[i].size(); + output_tensors_[i].shape = output_shapes_[i].data(); + } + probe_infer_ = true; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.h b/onnxruntime/core/providers/tvm/tvm_runner_impl.h new file mode 100644 index 0000000000000..e9104859c78e6 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.h @@ -0,0 +1,104 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
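For reference, the two RunnerImpl subclasses differ mainly in how output shapes are obtained: GERunnerImpl ("graph" executor) relies on shapes fixed up front via TVMGetOutputShapes, while VMRunnerImpl ("vm" executor, the default) starts from draft tensors with ndim 0 and fills the shapes in by a single probe inference on the first run. A small hedged sketch of the factory selection follows; the MakeRunnerImpl wrapper is illustrative only.

// Sketch: executor selection mirrors getTVMRunnerImpl() above. "graph" yields a
// GERunnerImpl, "vm" yields a VMRunnerImpl, anything else yields nullptr.
#include <memory>
#include <vector>
#include "tvm_runner_impl.h"

std::shared_ptr<onnxruntime::tvm::RunnerImpl> MakeRunnerImpl(
    const std::shared_ptr<onnxruntime::tvm::TvmModule>& mod,
    const onnxruntime::tvm::TvmEPOptions& options,
    const onnxruntime::tvm::InputsInfoMap& inputs_info,
    const std::vector<DLTensor>& output_tensors) {
  // options.executor is normally "vm" (the default) or "graph".
  return onnxruntime::tvm::getTVMRunnerImpl(mod, options, inputs_info, output_tensors);
}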
+ +#ifndef TVM_RUNNER_IMPL_H +#define TVM_RUNNER_IMPL_H + +#include +#include +#include + +#include "core/framework/func_api.h" +#include "core/session/onnxruntime_cxx_api.h" + +#include "tvm_common.h" +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +class RunnerImpl { +public: + RunnerImpl() = delete; + RunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~RunnerImpl() = default; + + virtual common::Status run(const OrtCustomOpApi* api, OrtKernelContext* context) { + Ort::CustomOpApi ort{*api}; + + set_input(ort, context); + connect_output_tensors2ort(ort, context); + run_and_get_output(); + + return Status::OK(); + } + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) = 0; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) = 0; + virtual void run_and_get_output() = 0; + +protected: + void convert_input_tensors2dl_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context, + std::vector& dst, + std::vector& dst_inds); + void add_device_type_data2output_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context); + +protected: + std::shared_ptr mod_; + InputsInfoMap inputs_info_; + TVMTensorShapes output_shapes_; + std::vector output_tensors_; +}; + + +class GERunnerImpl : public RunnerImpl { +public: + GERunnerImpl() = delete; + GERunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~GERunnerImpl() = default; + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void run_and_get_output() override final; +}; + + +class VMRunnerImpl : public RunnerImpl { +public: + VMRunnerImpl() = delete; + VMRunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~VMRunnerImpl() = default; + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void run_and_get_output() override final; + +private: + void infer_once_to_get_output_shapes(); + +private: + bool probe_infer_ = false; +}; + + +std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, + const TvmEPOptions& options, + const InputsInfoMap& inputs_info, + const std::vector output_tensors); + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_TVM_RUNNER_IMPL_H diff --git a/onnxruntime/core/providers/tvm/tvm_utils.h b/onnxruntime/core/providers/tvm/tvm_utils.h index ab0e8da5652f9..9471afb135578 100644 --- a/onnxruntime/core/providers/tvm/tvm_utils.h +++ b/onnxruntime/core/providers/tvm/tvm_utils.h @@ -10,7 +10,9 @@ #include "core/framework/ortdevice.h" #include "core/common/common.h" + namespace onnxruntime { +namespace tvm { inline DLDataType GetDataType(ONNXTensorElementDataType type) { if (type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { @@ -50,6 +52,7 @@ inline DLDevice GetDLDevice(const OrtDevice& device) { return context; } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime #endif // TVM_UTILS_H diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc 
b/onnxruntime/core/providers/tvm/xpu_data_transfer.cc index 4efb171dda849..5247382566aad 100644 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc +++ b/onnxruntime/core/providers/tvm/xpu_data_transfer.cc @@ -6,7 +6,10 @@ #include "xpu_data_transfer.h" #include "tvm_utils.h" + namespace onnxruntime { +namespace tvm { + XPUDataTransfer::XPUDataTransfer() { } @@ -14,8 +17,8 @@ XPUDataTransfer::~XPUDataTransfer() { } bool XPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return (src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU) || - (src_device.Type() == OrtDevice::GPU || dst_device.Type() == OrtDevice::GPU); + return (src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU) || + (src_device.Type() == OrtDevice::GPU || dst_device.Type() == OrtDevice::GPU); } common::Status XPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int _exec_queue_id) const { @@ -27,11 +30,11 @@ common::Status XPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int _ const OrtDevice& dst_device = dst.Location().device; if ((src_device.Type() == OrtDevice::CPU) && (dst_device.Type() == OrtDevice::CPU)) { - if (src_data == dst_data) { - // no need copying as both pointers are referring to same piece of memory. - return Status::OK(); - } - memcpy(dst_data, src_data, bytes); + if (src_data == dst_data) { + // no need copying as both pointers are referring to same piece of memory. + return Status::OK(); + } + memcpy(dst_data, src_data, bytes); } else { DLTensor tvm_src, tvm_dst; DLDataType dl_type{kDLInt, 8, 1}; @@ -80,4 +83,5 @@ common::Status TvmCPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, in return Status::OK(); } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.h b/onnxruntime/core/providers/tvm/xpu_data_transfer.h index f07c11794390a..0b38f71baa22e 100644 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.h +++ b/onnxruntime/core/providers/tvm/xpu_data_transfer.h @@ -7,10 +7,12 @@ #include "core/framework/data_transfer.h" #include "tvm_common.h" + namespace onnxruntime { +namespace tvm { class XPUDataTransfer : public IDataTransfer { - public: +public: XPUDataTransfer(); ~XPUDataTransfer(); @@ -23,7 +25,7 @@ class XPUDataTransfer : public IDataTransfer { }; class TvmCPUDataTransfer : public IDataTransfer { - public: +public: TvmCPUDataTransfer() = default; // Dampen MSVC warning about not fully overriding CopyTensor using IDataTransfer::CopyTensor; @@ -31,5 +33,7 @@ class TvmCPUDataTransfer : public IDataTransfer { common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; }; -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime + #endif // XPU_DATA_TRANSFER diff --git a/onnxruntime/python/tools/microbench/attention.py b/onnxruntime/python/tools/microbench/attention.py new file mode 100644 index 0000000000000..bc9daae4455c5 --- /dev/null +++ b/onnxruntime/python/tools/microbench/attention.py @@ -0,0 +1,57 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + batch_size: int + seq_len: int + hidden_size: int + length: int + data_type: type + + +class BenchmarkAttention(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + weight = np.random.rand(op_param.hidden_size, op_param.length).astype(op_param.data_type) + bias = np.random.rand(op_param.length).astype(op_param.data_type) + mask_index = np.random.rand(op_param.batch_size, op_param.seq_len).astype(np.int32) + output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index} + outputs = {"return_val": output_data} + return inputs, outputs + + def create_cases(self): + model = "models/attention_fp16.onnx" if self.args.precision == "fp16" else "models/attention_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-base + op_param = OpParam(1, 384, 768, 768 * 3, data_type) + self.add_case(op_param, model) + + def case_profile(cls, op_param, time): + profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkAttention(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py index cb8e5f57c20f0..86fa98e153146 100644 --- a/onnxruntime/python/tools/microbench/benchmark.py +++ b/onnxruntime/python/tools/microbench/benchmark.py @@ -1,63 +1,94 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + from abc import ABC, abstractmethod from argparse import ArgumentParser -import time +import logging import numpy import onnxruntime as ort +import time import torch +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + def numpy_type(torch_type): type_map = {torch.float32: numpy.float32, - torch.float16: numpy.float16} + torch.float16: numpy.float16, + torch.int32: numpy.int32} return type_map[torch_type] def add_arguments(parser: ArgumentParser): - parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use") - parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use") - parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling') + parser.add_argument("--provider", required=False, type=str, + choices=["cuda", "rocm", "cpu", None], default=None, + help=("Execution provider to use. 
By default, a " + "provider is selected in the priority order " + "(cuda|rocm, cpu) depending on availability.")) + parser.add_argument("--precision", required=False, type=str, + choices=["fp16", "fp32"], default="fp16", + help="Number format to use") + parser.add_argument('--profiling', required=False, type=bool, + default=False, help='If enable profiling') + + +def provider_name(name): + provider_map = {"cuda": "CUDAExecutionProvider", + "rocm": "ROCMExecutionProvider", + "cpu": "CPUExecutionProvider"} + return provider_map[name] + + +def get_default_provider(): + if "CUDAExecutionProvider" in ort.get_available_providers(): + return "CUDAExecutionProvider" + if "ROCMExecutionProvider" in ort.get_available_providers(): + return "ROCMExecutionProvider" + return "CPUExecutionProvider" class Benchmark: def __init__(self, model, inputs, outputs, args): - self.provider = args.provider + self.provider = (get_default_provider() if args.provider == None + else provider_name(args.provider)) + logger.info(f"Execution provider: {self.provider}") self.profiling = args.profiling self.model = model + logger.info(f"Model: {self.model}") self.inputs = inputs self.outputs = outputs def create_input_output_tensors(self): - device = "cuda" - input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()} - output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()} + on_gpu = (self.provider == "CUDAExecutionProvider" + or self.provider == "ROCMExecutionProvider") + device = "cuda" if on_gpu else "cpu" + input_tensors = {name: torch.from_numpy(array).to(device) + for name, array in self.inputs.items()} + output_tensors = {name: torch.from_numpy(array).to(device) + for name, array in self.outputs.items()} return input_tensors, output_tensors @classmethod def create_io_binding(cls, sess, input_tensors, output_tensors): io_binding = sess.io_binding() for name, tensor in input_tensors.items(): - io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + io_binding.bind_input(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) for name, tensor in output_tensors.items(): - io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + io_binding.bind_output(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) return io_binding def create_session(self): sess_opt = ort.SessionOptions() sess_opt.enable_profiling = self.profiling - if self.provider == "rocm": - execution_provider = ["ROCMExecutionProvider"] - elif self.provider == "cuda": - execution_provider = ["CUDAExecutionProvider"] - else: - raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.") - - sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider) - - if self.provider == "rocm": - assert 'ROCMExecutionProvider' in sess.get_providers() - elif self.provider == "cuda": - assert 'CUDAExecutionProvider' in sess.get_providers() - + sess = ort.InferenceSession(self.model, sess_options=sess_opt, + providers=[self.provider]) return sess def benchmark(self): diff --git a/onnxruntime/python/tools/microbench/cast.py b/onnxruntime/python/tools/microbench/cast.py new file mode 100644 index 0000000000000..d6ae83a236c85 --- /dev/null +++ b/onnxruntime/python/tools/microbench/cast.py @@ -0,0 +1,75 @@ 
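Taken together, the refactored microbench harness now resolves an execution provider automatically (CUDA, then ROCm, then CPU) and drives the session through I/O binding on torch tensors. The sketch below exercises that same pattern outside the harness; "model.onnx" and the tensor names "INPUT"/"return_val" are placeholders, and a GPU build of onnxruntime plus a matching torch install is assumed when a GPU provider is picked.

# Minimal standalone sketch of the provider-selection + io_binding flow
# used by the refactored microbench Benchmark class (not shipped code).
import numpy as np
import onnxruntime as ort
import torch

def pick_provider():
    # Same priority order as the new get_default_provider() helper.
    available = ort.get_available_providers()
    for name in ("CUDAExecutionProvider", "ROCMExecutionProvider"):
        if name in available:
            return name
    return "CPUExecutionProvider"

provider = pick_provider()
device = "cuda" if provider in ("CUDAExecutionProvider", "ROCMExecutionProvider") else "cpu"
sess = ort.InferenceSession("model.onnx", providers=[provider])  # placeholder model path

# Bind torch tensors directly by pointer instead of feeding numpy arrays.
inputs = {"INPUT": torch.from_numpy(np.ones((1, 384, 768), dtype=np.float32)).to(device)}
outputs = {"return_val": torch.empty((1, 384, 768), dtype=torch.float32, device=device)}

io_binding = sess.io_binding()
for name, t in inputs.items():
    io_binding.bind_input(name, t.device.type, 0, np.float32, tuple(t.shape), t.data_ptr())
for name, t in outputs.items():
    io_binding.bind_output(name, t.device.type, 0, np.float32, tuple(t.shape), t.data_ptr())
sess.run_with_iobinding(io_binding)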
+#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + x : int + y : int + m : int + n : int + input_data_type : type + output_data_type : type + + +@dataclass +class ModelParam: + token_type_ids_dim0 : int + input_ids_dim1 : int + + +class BenchmarkCast(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.x, op_param.y, op_param.m, op_param.n).astype(op_param.input_data_type) + output_data = np.random.rand(op_param.x, op_param.y, op_param.m, op_param.n).astype(op_param.output_data_type) + inputs = {"X": input_data} + outputs = {"Y": output_data} + return inputs, outputs + + def add_model_cases(self, mp, model, input_data_type, output_data_type): + self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1, input_data_type, output_data_type), model) + self.add_case(OpParam(16, mp.token_type_ids_dim0, mp.input_ids_dim1, mp.input_ids_dim1, input_data_type, output_data_type), model) + + def create_cases(self): + model = "models/cast_fp16tofp32.onnx" if self.args.precision == "fp16" else "models/cast_fp32tofp16.onnx" + input_data_type = np.float16 if self.args.precision == "fp16" else np.float32 + output_data_type = np.float32 if self.args.precision == "fp16" else np.float16 + # huggingface bert-large + self.add_case(OpParam(1, 1, 1, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 4096, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 30522, input_data_type, output_data_type), model) + # huggingface bert-large with default dims + model_param = ModelParam(8, 512) + self.add_model_cases(model_param, model, input_data_type, output_data_type) + # huggingface bert-large with large input dims + model_param = ModelParam(32, 1024) + self.add_model_cases(model_param, model, input_data_type, output_data_type) + + def case_profile(cls, op_param, time): + profile = f"(x y m n input_data_type) = ({op_param.x} {op_param.y} {op_param.m} {op_param.n} {op_param.input_data_type}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkCast(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/microbench/fast_gelu.py b/onnxruntime/python/tools/microbench/fast_gelu.py index 3014cf6234644..2d50e256a0642 100644 --- a/onnxruntime/python/tools/microbench/fast_gelu.py +++ b/onnxruntime/python/tools/microbench/fast_gelu.py @@ -1,3 +1,8 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
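The cast microbenchmark that follows loads models/cast_fp16tofp32.onnx (or the fp32-to-fp16 variant), which is not included in this patch. If a local stand-in is needed, an equivalent single-node Cast model with the same "X"/"Y" tensor names can be generated with onnx.helper; this is only an illustrative sketch, not the model used upstream.

# Hypothetical stand-in for models/cast_fp16tofp32.onnx (illustrative only).
import os
import onnx
from onnx import TensorProto, helper

os.makedirs("models", exist_ok=True)

# Symbolic dims so one model covers every (x, y, m, n) case in cast.py.
dims = ["x", "y", "m", "n"]
node = helper.make_node("Cast", inputs=["X"], outputs=["Y"], to=TensorProto.FLOAT)
graph = helper.make_graph(
    [node],
    "cast_fp16tofp32",
    [helper.make_tensor_value_info("X", TensorProto.FLOAT16, dims)],
    [helper.make_tensor_value_info("Y", TensorProto.FLOAT, dims)],
)
model = helper.make_model(graph)
onnx.checker.check_model(model)
onnx.save(model, "models/cast_fp16tofp32.onnx")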
+#-------------------------------------------------------------------------- + import argparse from dataclasses import dataclass import numpy as np diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py index 8c091d97f0086..1de45ee5c75b3 100644 --- a/onnxruntime/python/tools/microbench/matmul.py +++ b/onnxruntime/python/tools/microbench/matmul.py @@ -1,3 +1,8 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + import argparse from dataclasses import dataclass import numpy as np diff --git a/onnxruntime/python/tools/microbench/skip_layer_norm.py b/onnxruntime/python/tools/microbench/skip_layer_norm.py new file mode 100644 index 0000000000000..b6f8c5f9e15e0 --- /dev/null +++ b/onnxruntime/python/tools/microbench/skip_layer_norm.py @@ -0,0 +1,59 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + batch_size: int + seq_len: int + hidden_size: int + data_type: type + + +class BenchmarkSkipLayerNorm(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + skip = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + gamma = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + beta = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + bias = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + + inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias} + outputs = {"return_val": output_data} + + return inputs, outputs + + def create_cases(self): + model = "models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-large + op_param = OpParam(1, 384, 1024, data_type) + self.add_case(op_param, model) + + def case_profile(cls, op_param, time): + profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkSkipLayerNorm(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index c559112028399..3165c5be8ea2c 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -608,7 +608,7 @@ def compute_percentile(self): cdf = np.cumsum(hist/total) if self.symmetric: idx_right = np.searchsorted(cdf, percentile / 100.0) - thresholds_dict[tensor] = 
(-float(hist_edges[idx_ringht]), float(hist_edges[idx_right])) + thresholds_dict[tensor] = (-float(hist_edges[idx_right]), float(hist_edges[idx_right])) else: percent_to_cut_one_side = (100.0 - percentile) / 200.0 idx_right = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 0d2e7feee7c13..889adf0c4531d 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -1803,6 +1803,21 @@ def _propagate_shape_and_type(self, node, input_index=0, output_index=0): vi = self.known_vi_[node.output[output_index]] vi.CopyFrom(helper.make_tensor_value_info(node.output[output_index], output_dtype, shape)) + def _is_none_dim(self, dim_value): + if type(dim_value) != str: + return False + if "unk__" not in dim_value: + return False + if dim_value in self.symbolic_dims_.keys(): + return False + return True + + def _is_shape_contains_none_dim(self, out_shape): + for out in out_shape: + if self._is_none_dim(out): + return out + return None + def _infer_impl(self, start_sympy_data=None): self.sympy_data_ = start_sympy_data or {} self.out_mp_.graph.ClearField('value_info') @@ -1956,7 +1971,8 @@ def get_prereq(node): if node.output[i_o] in self.sympy_data_: logger.debug(' Sympy Data: ' + str(self.sympy_data_[node.output[i_o]])) - if (out_shape is not None and None in out_shape) or out_type_undefined: + # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain + if (out_shape is not None and (None in out_shape or self._is_shape_contains_none_dim(out_shape))) or out_type_undefined: if self.auto_merge_: if node.op_type in [ 'Add', 'Sub', 'Mul', 'Div', 'MatMul', 'MatMulInteger', 'MatMulInteger16', 'Concat', @@ -1964,8 +1980,11 @@ def get_prereq(node): ]: shapes = [self._get_shape(node, i) for i in range(len(node.input))] if node.op_type in ['MatMul', 'MatMulInteger', 'MatMulInteger16']: - if None in out_shape: - idx = out_shape.index(None) + if None in out_shape or self._is_shape_contains_none_dim(out_shape): + if None in out_shape: + idx = out_shape.index(None) + else: + idx = out_shape.index(self._is_shape_contains_none_dim(out_shape)) dim_idx = [len(s) - len(out_shape) + idx for s in shapes] # only support auto merge for MatMul for dim < rank-2 when rank > 2 assert len(shapes[0]) > 2 and dim_idx[0] < len(shapes[0]) - 2 @@ -1978,7 +1997,7 @@ def get_prereq(node): if shapes: for idx in range(len(out_shape)): - if out_shape[idx] is not None: + if out_shape[idx] is not None and not self._is_none_dim(out_shape[idx]): continue # note that the broadcasting rule aligns from right to left # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 57741f6e1d238..ba8694da4d51e 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -34,6 +34,8 @@ python benchmark.py -e torchscript -g -p "fp16" Run ONNXRuntime and TorchScript on CPU for all models with quantization: python benchmark.py -e torchscript onnxruntime -p "int8" -o + Run OnnxRuntime with the ROCM provider and graph optimization script: + python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm It is recommended to use run_benchmark.sh to launch benchmark. 
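The symbolic shape inference change above treats the "unk__N" placeholder dims emitted by onnx >= 1.11 the same way as None when deciding whether a dim still needs auto-merging. Stripped of the class context, the check amounts to the following standalone sketch (symbolic_dims here stands in for the instance's symbolic_dims_ mapping):

def is_unknown_dim(dim_value, symbolic_dims):
    # onnx >= 1.11 emits "unk__<index>" for dims it cannot infer; a dim that is
    # already tracked as a symbolic dim is a real named dim, not an unknown one.
    return (
        isinstance(dim_value, str)
        and "unk__" in dim_value
        and dim_value not in symbolic_dims
    )

def first_unknown_dim(out_shape, symbolic_dims):
    # Mirrors _is_shape_contains_none_dim: return the first unresolved dim, if any.
    for dim in out_shape:
        if is_unknown_dim(dim, symbolic_dims):
            return dim
    return None

# Example: first_unknown_dim(["batch", "unk__3", 768], {"batch"}) -> "unk__3"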
""" @@ -51,6 +53,7 @@ from benchmark_helper import (OptimizerInfo, create_onnxruntime_session, Precision, setup_logger, get_latency_result, output_details, output_summary, output_fusion_statistics, inference_ort, inference_ort_with_io_binding, allocateOutputBuffers, ConfigModifier) +from fusion_options import FusionOptions from quantize_helper import QuantizeHelper from onnx_exporter import create_onnxruntime_input, load_pretrained_model, export_onnx_model_from_pt, export_onnx_model_from_tf @@ -71,7 +74,7 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, - model_source): + model_source, args): import onnxruntime results = [] @@ -92,6 +95,9 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier ) return results + if optimizer_info == OptimizerInfo.NOOPT: + logger.warning(f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied.") + for model_name in model_names: all_input_names = MODELS[model_name][0] for num_inputs in input_counts: @@ -99,18 +105,20 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier break input_names = all_input_names[:num_inputs] + args.model_type = MODELS[model_name][3] + fusion_options = FusionOptions.parse(args) if 'pt' in model_source: with torch.no_grad(): onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options) if 'tf' in model_source: onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options) if not is_valid_onnx_model: continue @@ -198,7 +206,7 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n for model_name in model_names: config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir) - config_modifier(config) + config_modifier.modify(config) model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class) tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -240,6 +248,7 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n result = { "engine": "torchscript" if torchscript else "torch", "version": torch.__version__, + "providers": "NA", "device": "cuda" if use_gpu else "cpu", "optimizer": "", "precision": precision, @@ -315,7 +324,7 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision for model_name in model_names: config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) - config_modifier(config) + 
config_modifier.modify(config) model = load_pretrained_model(model_name, config=config, @@ -373,6 +382,7 @@ def lxmert_forward(): result = { "engine": "tensorflow", "version": tf.__version__, + "providers": "NA", "device": "cuda" if use_gpu else "cpu", "optimizer": "", "precision": precision, @@ -517,6 +527,8 @@ def parse_arguments(): default=None, help="Manually set the model's layer number") + FusionOptions.add_arguments(parser) + args = parser.parse_args() return args @@ -584,7 +596,7 @@ def main(): args.test_times, args.input_counts, args.optimizer_info, args.validate_onnx, args.cache_dir, args.onnx_dir, args.verbose, args.overwrite, args.disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, - args.model_source) + args.model_source, args) except: logger.error(f"Exception", exc_info=True) diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 6b621492b2ec2..7c13ca3c8d945 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -21,6 +21,7 @@ import psutil import csv import numpy as np +import torch import random from datetime import datetime import multiprocessing @@ -36,6 +37,7 @@ class TestSetting: test_cases: int test_times: int use_gpu: bool + use_io_binding: bool provider: str intra_op_num_threads: int seed: int @@ -119,6 +121,55 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op return session +def numpy_type(torch_type): + type_map = {torch.float32: np.float32, + torch.float16: np.float16, + torch.int32: np.int32, + torch.int64: np.longlong} + return type_map[torch_type] + +def create_input_output_tensors(inputs, outputs, device): + input_tensors = {name: torch.from_numpy(array).to(device) + for name, array in inputs.items()} + output_tensors = {name: torch.from_numpy(array).to(device) + for name, array in outputs.items()} + return input_tensors, output_tensors + +def create_io_binding(sess, input_tensors, output_tensors): + io_binding = sess.io_binding() + for name, tensor in input_tensors.items(): + io_binding.bind_input(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) + for name, tensor in output_tensors.items(): + io_binding.bind_output(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) + return io_binding + +def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting): + results = [] + latency_list = [] + device = 'cuda' if test_setting.use_gpu else 'cpu' + for test_case_id, inputs in enumerate(all_inputs): + result = session.run(output_names, inputs) + results.append(result) + outputs = {} + for i in range(len(output_names)): + outputs[output_names[i]] = result[i] + + input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device) + io_binding = create_io_binding(session, input_tensors, output_tensors) + + # warm up once + session.run_with_iobinding(io_binding) + + start_time = timeit.default_timer() + session.run_with_iobinding(io_binding) + latency = timeit.default_timer() - start_time + latency_list.append(latency) + + return results, latency_list def onnxruntime_inference(session, all_inputs, output_names): if len(all_inputs) > 0: @@ -135,7 +186,6 @@ def onnxruntime_inference(session, all_inputs, output_names): latency_list.append(latency) return results, latency_list - def to_string(model_path, session, test_setting): sess_options = 
session.get_session_options() option = "model={},".format(os.path.basename(model_path)) @@ -159,9 +209,14 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op print("Running test:", key) all_latency_list = [] - for i in range(test_setting.test_times): - results, latency_list = onnxruntime_inference(session, all_inputs, output_names) - all_latency_list.extend(latency_list) + if test_setting.use_io_binding: + for i in range(test_setting.test_times): + results, latency_list = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting) + all_latency_list.extend(latency_list) + else: + for i in range(test_setting.test_times): + results, latency_list = onnxruntime_inference(session, all_inputs, output_names) + all_latency_list.extend(latency_list) # latency in miliseconds latency_ms = np.array(all_latency_list) * 1000 @@ -269,6 +324,9 @@ def parse_arguments(): parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU") parser.set_defaults(use_gpu=False) + parser.add_argument('--use_io_binding', required=False, action='store_true', help="use io_binding") + parser.set_defaults(use_io_binding=False) + parser.add_argument("--provider", required=False, type=str, @@ -311,7 +369,7 @@ def main(): args.opt_level) for batch_size in batch_size_set: - test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, + test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, args.use_io_binding, args.provider, args.intra_op_num_threads, args.seed, args.verbose) print("test setting", test_setting) diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index 3823e001e0634..b5b26c0b046a7 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -280,6 +280,12 @@ def convert_float_to_float16(model, if n.name not in graph_io_to_skip: n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 value_info_list.append(n) + if n.type.HasField('sequence_type'): + if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: + if n.name not in graph_io_to_skip: + n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + value_info_list.append(n) + queue = next_level for key, value in fp32_initializers.items(): diff --git a/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py b/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py index 2570673692a05..93d42ffb65c4b 100644 --- a/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py @@ -17,6 +17,7 @@ from transformers import GPT2LMHeadModel, GPT2Config from benchmark_helper import Precision from gpt2_helper import Gpt2Helper, Gpt2Inputs, GPT2ModelNoPastState, MyGPT2Model, MyGPT2LMHeadModel, MyGPT2LMHeadModel_NoPadding +from torch_onnx_export_helper import torch_onnx_export logger = logging.getLogger(__name__) @@ -36,7 +37,7 @@ def create_helper(helper_type="default"): class GPT2LMHeadModel_BeamSearchStep(GPT2LMHeadModel): - """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one + """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one step beam search.""" def __init__(self, config, batch_size, beam_size): super().__init__(config) @@ -120,7 +121,7 @@ 
def forward( class GPT2LMHeadModel_ConfigurableOneStepSearch(GPT2LMHeadModel): - """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one + """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one step beam search with configuration support.""" def __init__(self, config, @@ -628,7 +629,7 @@ def export_onnx(model, Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export( + torch_onnx_export( model, args=tuple(input_list), f=onnx_model_path, diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index d0a2b92c5f16f..cc7712e163df9 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -21,6 +21,7 @@ from fusion_utils import FusionUtils from benchmark_helper import Precision from io_binding_helper import IOBindingHelper +from torch_onnx_export_helper import torch_onnx_export logger = logging.getLogger(__name__) @@ -402,7 +403,7 @@ def export_onnx(model, Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, args=tuple(input_list), f=onnx_model_path, input_names=input_names, diff --git a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py index 3e9922bef3cfa..fb7b0adefb1fa 100644 --- a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py +++ b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py @@ -15,6 +15,8 @@ # # For inference of the onnx model, you will need onnxruntime-gpu 1.7.0 or above. +import sys +import os import torch import numpy as np import argparse @@ -25,6 +27,9 @@ from pathlib import Path from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from torch_onnx_export_helper import torch_onnx_export + @parse_args('v', 'v', 'v', 'v', 'v', 'v', 'v', 'i', 'i') def my_longformer_attention(g, input, weight, bias, mask, global_weight, global_bias, global_mask, num_heads, window): @@ -223,7 +228,7 @@ def export_longformer(model, onnx_model_path, export_padding): Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, example_inputs, onnx_model_path, opset_version=11, diff --git a/onnxruntime/python/tools/transformers/models/t5/past_helper.py b/onnxruntime/python/tools/transformers/models/t5/past_helper.py index 3c585c23c8058..0a9eb37be9443 100644 --- a/onnxruntime/python/tools/transformers/models/t5/past_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/past_helper.py @@ -11,7 +11,6 @@ class PastKeyValuesHelper: """ Helper functions to process past key values for encoder-decoder model""" - @staticmethod def get_past_names(num_layers, present: bool = False): past_self_names = [] diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py index 5bfd530581d6e..26e5d9733e0c8 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import List, Union +import sys +import os import logging import numpy import torch @@ -14,6 +16,9 @@ from t5_encoder import 
T5EncoderInputs from past_helper import PastKeyValuesHelper +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) @@ -21,7 +26,6 @@ class T5DecoderInit(torch.nn.Module): """ A T5 decoder with LM head to create initial past key values. This model is only called once during starting decoding. """ - def __init__(self, decoder: torch.nn.Module, lm_head: torch.nn.Module, @@ -58,7 +62,6 @@ def forward(self, decoder_input_ids: torch.Tensor, encoder_attention_mask: torch class T5Decoder(torch.nn.Module): """ A T5 decoder with LM head and past key values""" - def __init__(self, decoder, lm_head, config): super().__init__() self.decoder = decoder @@ -89,7 +92,6 @@ def forward(self, decoder_input_ids, encoder_attention_mask, encoder_hidden_stat class T5DecoderInputs: - def __init__(self, decoder_input_ids, encoder_attention_mask, encoder_hidden_states, past_key_values=None): self.decoder_input_ids: torch.LongTensor = decoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask @@ -160,7 +162,6 @@ def to_list(self) -> List: class T5DecoderHelper: - @staticmethod def export_onnx(decoder: Union[T5Decoder, T5DecoderInit], device: torch.device, @@ -250,7 +251,7 @@ def export_onnx(decoder: Union[T5Decoder, T5DecoderInit], } Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(decoder, + torch_onnx_export(decoder, args=tuple(input_list), f=onnx_model_path, export_params=True, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py index c0086896b74d3..cf0f7f97abcf3 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py @@ -5,6 +5,8 @@ # -------------------------------------------------------------------------- import random +import sys +import os from pathlib import Path from typing import List import logging @@ -13,12 +15,14 @@ from transformers import T5Config from onnxruntime import InferenceSession +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) class T5Encoder(torch.nn.Module): """ T5 encoder outputs only the last hidden state""" - def __init__(self, encoder, config: T5Config): super().__init__() self.encoder = encoder @@ -29,7 +33,6 @@ def forward(self, input_ids, attention_mask): class T5EncoderInputs: - def __init__(self, input_ids, attention_mask): self.input_ids: torch.LongTensor = input_ids self.attention_mask: torch.LongTensor = attention_mask @@ -44,7 +47,7 @@ def create_dummy(batch_size: int, sequence_length: int, vocab_size: int, sequence_length (int): sequence length vocab_size (int): vocaburary size device (torch.device): device of output tensors - + Returns: T5EncoderInputs: dummy inputs for encoder """ @@ -67,7 +70,6 @@ def to_list(self) -> List: class T5EncoderHelper: - @staticmethod def export_onnx(encoder: T5Encoder, device: torch.device, @@ -93,7 +95,7 @@ def export_onnx(encoder: T5Encoder, outputs = encoder(encoder_inputs.input_ids, encoder_inputs.attention_mask) Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(encoder, + torch_onnx_export(encoder, args=tuple(encoder_inputs.to_list()), f=onnx_model_path, export_params=True, diff --git 
a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py index 29b82cda191f6..bbfff80591fc3 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import List +import sys +import os import logging import numpy import torch @@ -15,13 +17,15 @@ from t5_decoder import T5DecoderInit from past_helper import PastKeyValuesHelper +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) class T5EncoderDecoderInit(torch.nn.Module): """ A combination of T5Encoder and T5DecoderInit. """ - def __init__(self, encoder: torch.nn.Module, decoder: torch.nn.Module, @@ -44,7 +48,6 @@ def forward(self, class T5EncoderDecoderInitInputs: - def __init__(self, encoder_input_ids, encoder_attention_mask, decoder_input_ids=None): self.encoder_input_ids: torch.LongTensor = encoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask @@ -70,7 +73,6 @@ def to_list(self) -> List: class T5EncoderDecoderInitHelper: - @staticmethod def export_onnx(model: T5EncoderDecoderInit, device: torch.device, @@ -153,7 +155,7 @@ def export_onnx(model: T5EncoderDecoderInit, dynamic_axes[name] = {0: 'batch_size', 1: num_heads, 2: sequence_length, 3: head_size} Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, args=tuple(input_list), f=onnx_model_path, export_params=True, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index f04fa9941c45f..4bcb5d428463c 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -22,7 +22,6 @@ class T5Helper: - @staticmethod def get_onnx_path(output_dir: str, model_name_or_path: str, suffix: str = "", new_folder: bool = False) -> str: """Build onnx path diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 30d767e93076a..04228cd02e888 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -15,6 +15,7 @@ from gpt2_helper import GPT2ModelNoPastState, PRETRAINED_GPT2_MODELS, TFGPT2ModelNoPastState from quantize_helper import QuantizeHelper from huggingface_models import MODEL_CLASSES +from torch_onnx_export_helper import torch_onnx_export os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -184,13 +185,14 @@ def optimize_onnx_model_by_ort(onnx_model_path, ort_model_path, use_gpu, overwri def optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, num_attention_heads, hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite, model_fusion_statistics, - use_external_data_format): + use_external_data_format, optimization_options=None): if overwrite or not os.path.exists(optimized_model_path): Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True) from optimizer import optimize_model from fusion_options import FusionOptions - optimization_options = FusionOptions(model_type) + if optimization_options == None: + optimization_options = FusionOptions(model_type) 
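With this change, the fusions applied while exporting and optimizing a model are driven by the same FusionOptions flags that benchmark.py now parses, rather than a hard-coded default. Outside the benchmark driver, the equivalent call sequence looks roughly like the sketch below; the "bert" model type, head/hidden sizes, and file names are placeholders, and optimize_model is the existing helper in this transformers tool tree.

# Illustrative sketch of feeding parsed FusionOptions into optimize_model.
import argparse
from fusion_options import FusionOptions
from optimizer import optimize_model

parser = argparse.ArgumentParser()
FusionOptions.add_arguments(parser)                 # adds flags such as --disable_embed_layer_norm
args = parser.parse_args(["--disable_embed_layer_norm"])
args.model_type = "bert"                            # benchmark.py sets this from MODELS before parsing

options = FusionOptions.parse(args)                 # same helper run_onnxruntime now calls
optimized = optimize_model(
    "bert-base-cased.onnx",                         # placeholder exported model path
    model_type="bert",
    num_heads=12,
    hidden_size=768,
    optimization_options=options,
)
optimized.save_model_to_file("bert-base-cased_opt.onnx")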
optimization_options.use_raw_attention_mask(use_raw_attention_mask) if Precision.FLOAT16 == precision: optimization_options.enable_gelu_approximation = True @@ -317,7 +319,8 @@ def validate_and_optimize_onnx(model_name, onnx_model_path, example_inputs, example_outputs_flatten, - output_names=None): + output_names, + fusion_options): is_valid_onnx_model = True if validate_onnx: is_valid_onnx_model = validate_onnx_model(onnx_model_path, example_inputs, example_outputs_flatten, use_gpu, @@ -330,7 +333,7 @@ def validate_and_optimize_onnx(model_name, False, use_external_data_format) optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, config.num_attention_heads, config.hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite, - model_fusion_statistics, use_external_data_format) + model_fusion_statistics, use_external_data_format, fusion_options) onnx_model_path = optimized_model_path if validate_onnx: @@ -352,7 +355,7 @@ def validate_and_optimize_onnx(model_name, def export_onnx_model_from_pt(model_name, opset_version, use_external_data_format, model_type, model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics): + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options): config, model = load_pt_model(model_name, model_class, cache_dir, config_modifier) # config, model = load_pt_model_from_tf(model_name) @@ -384,7 +387,7 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) replace_torch_functions() - torch.onnx.export(model=model, + torch_onnx_export(model=model, args=tuple(example_inputs.values()), f=onnx_model_path, input_names=list(example_inputs.keys()), @@ -401,14 +404,14 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma onnx_model_file, is_valid_onnx_model, vocab_size = validate_and_optimize_onnx( model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, config, model_fusion_statistics, onnx_model_path, - example_inputs, example_outputs_flatten, None) + example_inputs, example_outputs_flatten, None, fusion_options) return onnx_model_file, is_valid_onnx_model, vocab_size, max_input_size def export_onnx_model_from_tf(model_name, opset_version, use_external_data_format, model_type, model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics): + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options): # Use CPU to export import tensorflow as tf tf.config.set_visible_devices([], 'GPU') @@ -495,6 +498,6 @@ def export_onnx_model_from_tf(model_name, opset_version, use_external_data_forma opt_onnx_model_file, onnx_model_file, is_valid_onnx_model, vocab_size = validate_and_optimize_onnx( model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, config, model_fusion_statistics, onnx_model_path, - example_inputs, example_outputs_flatten, output_names) + example_inputs, example_outputs_flatten, output_names, fusion_options) return opt_onnx_model_file, onnx_model_file, is_valid_onnx_model, vocab_size, 
max_input_size diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py new file mode 100644 index 0000000000000..0912ee396f20e --- /dev/null +++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -0,0 +1,68 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import torch +TrainingMode = torch.onnx.TrainingMode +from packaging.version import Version + +def torch_onnx_export( + model, + args, + f, + export_params=True, + verbose=False, + training=TrainingMode.EVAL, + input_names=None, + output_names=None, + operator_export_type=None, + opset_version=None, + _retain_param_name=None, + do_constant_folding=True, + example_outputs=None, + strip_doc_string=None, + dynamic_axes=None, + keep_initializers_as_inputs=None, + custom_opsets=None, + enable_onnx_checker=None, + use_external_data_format=None, + export_modules_as_functions=False): + if Version(torch.__version__) >= Version("1.11.0"): + torch.onnx.export( + model=model, + args=args, + f=f, + export_params=export_params, + verbose=verbose, + training=training, + input_names=input_names, + output_names=output_names, + operator_export_type=operator_export_type, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + custom_opsets=custom_opsets, + export_modules_as_functions=export_modules_as_functions) + else: + torch.onnx.export( + model=model, + args=args, + f=f, + export_params=export_params, + verbose=verbose, + training=training, + input_names=input_names, + output_names=output_names, + operator_export_type=operator_export_type, + opset_version=opset_version, + _retain_param_name=_retain_param_name, + do_constant_folding=do_constant_folding, + example_outputs=example_outputs, + strip_doc_string=strip_doc_string, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + custom_opsets=custom_opsets, + enable_onnx_checker=enable_onnx_checker, + use_external_data_format=use_external_data_format) diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index c6c8e9a890d9d..05699be42c9de 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1412,11 +1412,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { std::vector inputs = {&node_1}; std::vector outputs = {&node_2}; auto& cast_node = graph.AddNode("cast_1", "Cast", "node 2", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } { std::vector inputs = {&node_2, &data_0}; @@ -1462,11 +1458,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { std::vector inputs = {&if_cond_input}; std::vector outputs = {&graph_if_input}; auto& cast_node = graph.AddNode("cast_9", "Cast", "node 2", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - 
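The torch_onnx_export helper above exists so callers do not have to branch on the installed torch version themselves: on torch 1.11+ the arguments that export() no longer accepts (example_outputs, strip_doc_string, enable_onnx_checker, and friends) are simply not forwarded. A minimal call against a toy module might look like this (illustrative only; the module and output file name are made up):

# Hypothetical usage of the version-agnostic export wrapper defined above.
import torch
from torch_onnx_export_helper import torch_onnx_export

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x * 2 + 1

model = TinyModel().eval()
dummy = torch.randn(1, 8)

torch_onnx_export(
    model,
    args=(dummy,),
    f="tiny_model.onnx",
    input_names=["x"],
    output_names=["y"],
    opset_version=13,
    dynamic_axes={"x": {0: "batch"}, "y": {0: "batch"}},
)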
to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } std::vector inputs = {&if_cond_input}; @@ -1600,11 +1592,7 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { std::vector inputs = {&graph_0__value_1}; std::vector outputs = {&graph_0__value_2}; auto& cast_node = graph.AddNode("graph_0__cast_0", "Cast", "cast node in main graph", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } { std::vector inputs = {&graph_0__value_2, &input_0}; diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index a8af3e2be4b5b..9c9f3d3d2df99 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -84,18 +84,7 @@ TEST_F(ShapeInferenceTest, BasicTest) { Input("X1", type1); auto& node = Node("Cast", "X1", "Y1"); - //AttributeProto squeezed_axes; - //squeezed_axes.set_name("axes"); - //squeezed_axes.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); - //squeezed_axes.add_ints(0); - //p_node->AddAttribute("axes", squeezed_axes); - AttributeProto cast_to; - cast_to.set_name("to"); - cast_to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); - cast_to.set_i(ONNX_NAMESPACE::TensorProto_DataType_INT32); - //cast_to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING); - //cast_to.set_s("INT16"); - node.AddAttribute("to", cast_to); + node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32}); DoShapeInference(); // check inferred shapes diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 1d658387ebcd9..96b725c75593d 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -1537,21 +1537,11 @@ TEST_F(GraphTest, AddTensorAttribute) { } void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, int64_t attr_value) { - AttributeProto attr; - attr.set_name(attr_name); - attr.set_type(AttributeProto_AttributeType_INT); - attr.set_i(attr_value); - p_node.AddAttribute(attr_name, attr); + p_node.AddAttribute(attr_name, attr_value); } void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, std::initializer_list attr_value) { - AttributeProto attr; - attr.set_name(attr_name); - attr.set_type(AttributeProto_AttributeType_INTS); - for (auto v : attr_value) { - attr.add_ints(v); - } - p_node.AddAttribute(attr_name, attr); + p_node.AddAttribute(attr_name, attr_value); } // Test that output type can be inferred for ops with a type-attribute diff --git a/onnxruntime/test/optimizer/qdq_test_utils.cc b/onnxruntime/test/optimizer/qdq_test_utils.cc index 607049917f976..9e3318dc0f79f 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.cc +++ b/onnxruntime/test/optimizer/qdq_test_utils.cc @@ -98,5 +98,36 @@ GetQDQTestCaseFn BuildQDQConcatTestCase(const std::vector>& }; } +GetQDQTestCaseFn BuildQDQConcatTestCaseUnsupportedInputScaleZp() { + return [](ModelTestBuilder& builder) { + const std::vector> input_shapes = { + {1, 
6, 36}, + {1, 6, 8}, + {1, 6, 2}, + }; + int64_t axis = 2; + + std::vector input_args; + std::vector q_input_args; + + // set unmatched input scales/zp for test purpose + input_args.push_back(builder.MakeInput(input_shapes[0], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.05f, 128)); + input_args.push_back(builder.MakeInput(input_shapes[1], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.04f, 127)); + input_args.push_back(builder.MakeInput(input_shapes[2], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.03f, 126)); + + auto* concat_output = builder.MakeIntermediate(); + Node& concat_node = builder.AddNode("Concat", q_input_args, {concat_output}); + concat_node.AddAttribute("axis", axis); + + auto* q_concat_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(concat_output, 0.05f, 128, q_concat_output); + auto* output_arg = builder.MakeOutput(); + builder.AddDequantizeLinearNode(q_concat_output, 0.05f, 128, output_arg); + }; +} + } // namespace test } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index affa5baf9d1d3..2ee6abcb548f2 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -256,8 +256,9 @@ GetQDQTestCaseFn BuildQDQTransposeTestCase( } template -GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape, const int64_t& axis = -1) { - return [input_shape, axis](ModelTestBuilder& builder) { +GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape, const int64_t& axis, + float output_scales, OutputType output_zero_point) { + return [input_shape, axis, output_scales, output_zero_point](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput(input_shape, std::numeric_limits::min(), std::numeric_limits::max()); @@ -275,7 +276,7 @@ GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape softmax_node.AddAttribute("axis", axis); // add Q - builder.AddQuantizeLinearNode(softmax_output, 1.f / 256, 0, output_arg); + builder.AddQuantizeLinearNode(softmax_output, output_scales, output_zero_point, output_arg); }; } @@ -288,5 +289,7 @@ GetQDQTestCaseFn BuildQDQConcatTestCase(const std::vector>& bool has_input_int8 = false, bool has_output_int8 = false); +GetQDQTestCaseFn BuildQDQConcatTestCaseUnsupportedInputScaleZp(); + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index 0ee91ee05f763..2a7a04c122655 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -147,6 +147,7 @@ Status PerformanceRunner::Run() { << "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n" // Time between start and end of run. Less than Total time cost when running requests in parallel. 
<< "Total inference run time: " << inference_duration.count() << " s\n" + << "Number of inferences per second: " << performance_result_.time_costs.size() / inference_duration.count() << " \n" << "Avg CPU usage: " << performance_result_.average_CPU_usage << " %\n" << "Peak working set size: " << performance_result_.peak_workingset_size << " bytes" << std::endl; @@ -188,7 +189,9 @@ Status PerformanceRunner::RunParallelDuration() { count++; counter++; tpool->Schedule([this, &counter, &m, &cv]() { - session_->ThreadSafeRun(); + auto status = RunOneIteration(); + if (!status.IsOK()) + std::cerr << status.ErrorMessage(); // Simplified version of Eigen::Barrier std::lock_guard lg(m); counter--; diff --git a/onnxruntime/test/providers/cpu/controlflow/if_test.cc b/onnxruntime/test/providers/cpu/controlflow/if_test.cc index 5facccbc1e4e7..0b41549d4e320 100644 --- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc @@ -84,7 +84,7 @@ class IfOpTester : public OpTester { *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches *split_attribute->Add() = 2; - split_node.AddAttribute("split", attr_proto); + split_node.AddAttributeProto(std::move(attr_proto)); } } @@ -382,7 +382,7 @@ class IfOpTesterOnlyConstantNodesInConditionalBranches : public OpTester { then_constant_attr_tensor_proto->add_dims(1); then_constant_attr_tensor_proto->add_float_data(value); // Constant value of 10.f - then_constant_node.AddAttribute("value", then_constant_attr_proto); + then_constant_node.AddAttributeProto(std::move(then_constant_attr_proto)); auto status_then = graph_then.Resolve(); EXPECT_EQ(status_then, Status::OK()); diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index 7628c3454a7b9..c64c9a87a4237 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -802,7 +802,7 @@ TEST(Loop, Opset11WithNoVariadicInputsAndOutputs) { constant_attribute_tensor_proto->set_data_type(TensorProto_DataType_FLOAT); // float scalar *constant_attribute_tensor_proto->mutable_float_data()->Add() = 1.0f; // float scalar with value 1.0f - constant_node.AddAttribute("value", attr_proto); + constant_node.AddAttributeProto(std::move(attr_proto)); } graph.SetInputs({&iter_num_in, &cond_in}); diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 5bdac54702c60..dd485636a0ddd 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -4,7 +4,10 @@ #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) #include "core/common/logging/logging.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h" +#include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h" +#include "core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h" #include "core/session/inference_session.h" +#include "core/framework/tensorprotoutils.h" #include "test/common/tensor_op_test_utils.h" #include "test/framework/test_utils.h" #include "test/util/include/asserts.h" @@ -271,9 +274,10 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) { << "No node should be taken by the NNAPI EP"; } -static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, - const char* test_description, - const 
EPVerificationParams& params = EPVerificationParams()) { +static void RunQDQModelTest( + const GetQDQTestCaseFn& build_test_case, + const char* test_description, + const EPVerificationParams& params = EPVerificationParams()) { onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger()); Graph& graph = model.MainGraph(); ModelTestBuilder helper(graph); @@ -290,15 +294,22 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, std::make_unique(0), helper.feeds_, params); #else - ORT_UNUSED_PARAMETER(params); // test load only SessionOptions so; InferenceSessionWrapper session_object{so, GetEnvironment()}; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::make_unique(0))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); ASSERT_STATUS_OK(session_object.Initialize()); - ASSERT_GT(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) - << "Some nodes should have been taken by the NNAPI EP"; + if (params.ep_node_assignment == ExpectedEPNodeAssignment::None) { + ASSERT_EQ(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) + << "No node should have been taken by the NNAPI EP"; + } else if (params.ep_node_assignment == ExpectedEPNodeAssignment::All) { + ASSERT_EQ(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), session_object.GetGraph().NumberOfNodes()) + << "All nodes should have been taken by the NNAPI EP"; + } else { + ASSERT_GT(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) + << "Some nodes should have been taken by the NNAPI EP"; + } #endif } @@ -310,7 +321,7 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) { {1, 1, 5, 5} /* input_shape */, {1, 1, 3, 3} /* weights_shape */), "nnapi_qdq_test_graph_conv", - {true /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQResize) { @@ -326,7 +337,14 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) { "linear" /* mode */, "asymmetric" /* coordinate_transformation_mode */), "nnapi_qdq_test_graph_resize", - {false /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::Some}); +} + +TEST(NnapiExecutionProviderTest, TestQDQResize_UnsupportedDefaultSetting) { + RunQDQModelTest(BuildQDQResizeTestCase({1, 3, 64, 64} /* input_shape */, + {1, 3, 32, 32} /* sizes_data */), + "nnapi_qdq_test_graph_resize_unsupported", + {ExpectedEPNodeAssignment::None}); } TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { @@ -336,7 +354,7 @@ TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { {1, 3, 32, 32} /* input_shape */), "nnapi_qdq_test_graph_averagepool", { - true /* verify_entire_graph_use_ep */, + ExpectedEPNodeAssignment::All, 1e-2f /* fp32_abs_err */, }); } @@ -348,7 +366,7 @@ TEST(NnapiExecutionProviderTest, TestQDQAdd) { {1, 23, 13, 13} /* input_shape */, "Add" /* op_type */), "nnapi_qdq_test_graph_add", - {true /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQMul) { @@ -360,8 +378,8 @@ TEST(NnapiExecutionProviderTest, TestQDQMul) { "Mul" /* op_type */), "nnapi_qdq_test_graph_mul", { - true /* verify_entire_graph_use_ep */, - 1e-2f /* fp32_abs_err */, + ExpectedEPNodeAssignment::All, + 1e-2f /* fp32_abs_err */ }); } @@ -371,28 +389,36 @@ TEST(NnapiExecutionProviderTest, TestQDQTranspose) { {1, 3, 32, 32} /* input_shape */, {0, 3, 1, 2} /* perms */), "nnapi_qdq_test_graph_transpose", - { - true /* verify_entire_graph_use_ep */ - }); 
+ {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQReshape) { RunQDQModelTest(BuildQDQReshapeTestCase({1, 3, 64, 64} /* input_shape */, {1, 64, 64, 3} /* reshape_shape */), "nnapi_qdq_test_graph_reshape", - { - true /* verify_entire_graph_use_ep */ - }); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQSoftMax) { RunQDQModelTest(BuildQDQSoftMaxTestCase( {1, 32} /* input_shape */, - static_cast(1) /* axis */), + static_cast(1) /* axis */, + 1.f / 256 /* output_scales */, + 0 /* output_zp */), "nnapi_qdq_test_graph_softmax", - { - true /* verify_entire_graph_use_ep */ - }); + {ExpectedEPNodeAssignment::All}); +} + +// This is to verify when Nnapi required scale and zero point are not satisfied +// the model can work as expected. (no nodes should be handled by Nnapi) +TEST(NnapiExecutionProviderTest, TestQDQSoftMax_UnsupportedOutputScaleAndZp) { + RunQDQModelTest(BuildQDQSoftMaxTestCase( + {1, 32} /* input_shape */, + static_cast(1) /* axis */, + 0.002f /* output_scales */, + 1 /* output_zp */), + "nnapi_qdq_test_graph_softmax_unsupported", + {ExpectedEPNodeAssignment::None}); } TEST(NnapiExecutionProviderTest, TestQDQConcat) { @@ -403,11 +429,26 @@ TEST(NnapiExecutionProviderTest, TestQDQConcat) { {1, 6, 2}, } /* input_shapes */, 2 /* axis */), - "nnapi_qdq_test_graph_concat", { - true /* verify_entire_graph_use_ep */ - }); + "nnapi_qdq_test_graph_concat", + {ExpectedEPNodeAssignment::All}); } +#if defined(__ANDROID__) +TEST(NnapiExecutionProviderTest, TestQDQConcat_UnsupportedInputScalesAndZp) { + // This is to verify all the inputs have the same scale and zp as input 0 for API 28- + // Currently, this test can only be run locally with a android emulator with API < 29 + // See https://developer.android.com/studio/run/emulator-commandline for some info on + // starting a testing android emulator in command line. (Run an android build with emulator started) + // TODO: consider to configure this and enable it to run in Android CI. + const auto* nnapi = NnApiImplementation(); + if (nnapi->nnapi_runtime_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { + RunQDQModelTest(BuildQDQConcatTestCaseUnsupportedInputScaleZp(), + "nnapi_qdq_test_graph_concat_unsupported", + {ExpectedEPNodeAssignment::None}); + } +} +#endif + #endif // !(ORT_MINIMAL_BUILD) TEST(NnapiExecutionProviderTest, NNAPIFlagsTest) { diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index 74bae67c39b1e..37d957f60c94a 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -317,7 +317,6 @@ class OpTester { AddData(input_data_, name, dims_var, p_values, size, is_initializer, false, dim_params); } - template void AddInput(const char* name, std::initializer_list dims, const TensorShapeVector& values, bool is_initializer = false, const std::vector* dim_params = nullptr) { @@ -500,7 +499,6 @@ class OpTester { values ? 
values->size() : 0, is_initializer, false, dim_params, 0.0f, 0.0f, true); } - template void AddOptionalTypeTensorOutput(const char* name, const DimsVariant& dims, const std::initializer_list* expected_values = nullptr, @@ -520,7 +518,6 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error, true); } - template void AddOptionalTypeSeqInput(const char* name, const SeqTensors* seq_tensors) { @@ -546,12 +543,12 @@ class OpTester { } /* - * Use this API to add an input *edge* to the node/op being tested that won't - * have any data passed into. - * Such an edge will have the qualifier OpSchema::Optional in the schema. - * This is exposed to ensure the op kernel implementations can be tested to handle - * presence/absence of such optional input edges. - */ + * Use this API to add an input *edge* to the node/op being tested that won't + * have any data passed into. + * Such an edge will have the qualifier OpSchema::Optional in the schema. + * This is exposed to ensure the op kernel implementations can be tested to handle + * presence/absence of such optional input edges. + */ template void AddOptionalInputEdge() { std::string name; // empty == input doesn't exist @@ -575,7 +572,7 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error); } - template + template void AddOutput(const char* name, std::initializer_list dims, const T* p_values, const size_t size, bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { const DimsVariant dims_var = std::vector(dims); @@ -583,7 +580,6 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error); } - template void AddOutput(const char* name, const DimsVariant& dims, std::initializer_list expected_values, bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { @@ -712,12 +708,12 @@ class OpTester { #endif /* - * Use this API to add an output *edge* to the node/op being tested that shouldn't have any - * data produced into. - * Such an edge will have the qualifier OpSchema::Optional in the schema. - * This is exposed to ensure the op kernel implementations can be tested to handle - * presence/absence of such optional output edges. - */ + * Use this API to add an output *edge* to the node/op being tested that shouldn't have any + * data produced into. + * Such an edge will have the qualifier OpSchema::Optional in the schema. + * This is exposed to ensure the op kernel implementations can be tested to handle + * presence/absence of such optional output edges. 
+ */ template void AddOptionalOutputEdge() { std::string name; // empty == output doesn't exist @@ -786,6 +782,12 @@ class OpTester { custom_output_verifier_ = custom_output_verifier; } + void AddAttributeProto(ONNX_NAMESPACE::AttributeProto attr) { + add_attribute_funcs_.emplace_back([attr = std::move(attr)](onnxruntime::Node& node) { + node.AddAttributeProto(attr); + }); + } + template void AddAttribute(std::string name, T value) { // Generate a the proper AddAttribute call for later diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index fefd119f0c3da..34206ebbe7c87 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -36,7 +36,7 @@ void VerifyOutputs(const std::vector& fetches, const std::vector& fetches, const std::vectorsecond; + auto ranges = it->second; for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { if (it2->first == 1) { ASSERT_EQ(it2->second.first, 3); @@ -439,7 +439,7 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { // check min/max shape ranges of dynamic shape dimensions for(auto it = shape_ranges2.cbegin(); it != shape_ranges2.cend(); ++it) { - auto ranges = it->second; + auto ranges = it->second; for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { if (it2->first == 1) { ASSERT_EQ(it2->second.first, 1); @@ -470,7 +470,7 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { * We have following test parameters: * - engine_static: engine cache enabled with non-dynamic input shape * - engine_dynamic: engine cache enabled with dynamic input shape - * - timing_static: will be added + * - timing_static: will be added * - timing_dynamic: will be added */ INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static", @@ -591,11 +591,7 @@ TEST(TensorrtExecutionProviderTest, NodeIndexMappingTest) { auto& output_arg_1 = graph.GetOrCreateNodeArg("node_1_out", &uint8_tensor); outputs.push_back(&output_arg_1); auto& cast_node = graph.AddNode("cast1", "Cast", "node 1.", inputs, outputs); - AttributeProto attr_proto; - attr_proto.set_name("to"); - attr_proto.set_type(AttributeProto_AttributeType_INT); - attr_proto.set_i(2); - cast_node.AddAttribute("to", attr_proto); + cast_node.AddAttribute("to", int64_t{2}); inputs.clear(); inputs.push_back(&output_arg_1); @@ -603,11 +599,7 @@ TEST(TensorrtExecutionProviderTest, NodeIndexMappingTest) { outputs.clear(); outputs.push_back(&output_arg_2); auto& cast_node_2 = graph.AddNode("cast2", "Cast", "node 2.", inputs, outputs); - AttributeProto attr_proto_2; - attr_proto_2.set_name("to"); - attr_proto_2.set_type(AttributeProto_AttributeType_INT); - attr_proto_2.set_i(9); - cast_node_2.AddAttribute("to", attr_proto_2); + cast_node_2.AddAttribute("to", int64_t{9}); auto& input_arg_2 = graph.GetOrCreateNodeArg("Y", &float_tensor); auto& input_arg_3 = graph.GetOrCreateNodeArg("Z", &float_tensor); diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e6e982aff1f5c..858f18c97e4da 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1068,6 +1068,33 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self): so2.log_severity_level = 1 onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=so2, providers=onnxrt.get_available_providers()) + def 
testMemoryArenaShrinkage(self): + if platform.architecture()[0] == '32bit' or 'ppc' in platform.machine() or 'powerpc' in platform.machine(): + # on x86 or ppc builds, the CPU allocator does not use an arena + print("Skipping testMemoryArenaShrinkage in 32bit or powerpc platform.") + else: + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + + sess1 = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=['CPUExecutionProvider']) + input_name = sess1.get_inputs()[0].name + + # Shrink CPU memory after execution + ro1 = onnxrt.RunOptions() + ro1.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu:0") + self.assertEqual(ro1.get_run_config_entry("memory.enable_memory_arena_shrinkage"), "cpu:0") + sess1.run([], {input_name: x}, ro1) + + available_providers = onnxrt.get_available_providers() + if 'CUDAExecutionProvider' in available_providers: + sess2 = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=available_providers) + input_name = sess2.get_inputs()[0].name + + # Shrink CPU and GPU memory after execution + ro2 = onnxrt.RunOptions() + ro2.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu:0;gpu:0") + self.assertEqual(ro2.get_run_config_entry("memory.enable_memory_arena_shrinkage"), "cpu:0;gpu:0") + sess2.run([], {input_name: x}, ro2) + def testCheckAndNormalizeProviderArgs(self): from onnxruntime.capi.onnxruntime_inference_collection import check_and_normalize_provider_args diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index 263dead104dc3..5abd6fdcffbde 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -25,21 +25,13 @@ def unique_element(lst): class TestSymbolicShapeInference(unittest.TestCase): def test_symbolic_shape_infer(self): - # skip these tests before this issue is fixed: - # https://github.com/microsoft/onnxruntime/issues/10761 - test_skip_due_to_onnx_1_11_shape_inference_change = ["GPT2", "GPT2_LM_HEAD", "test_GPT2"] - + cwd = os.getcwd() test_model_dir = os.path.join(cwd, '..', 'models') for filename in Path(test_model_dir).rglob('*.onnx'): if filename.name.startswith('.'): continue # skip some bad model files - if len(filename.parts) > 1 and \ - filename.parts[len(filename.parts) - 2] in test_skip_due_to_onnx_1_11_shape_inference_change: - print("Skip symbolic shape inference on : " + str(filename)) - continue - print("Running symbolic shape inference on : " + str(filename)) SymbolicShapeInference.infer_shapes(in_mp=onnx.load(str(filename)), auto_merge=True, diff --git a/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx b/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx index e9568e381d21bb10c455011ad3596bd480a62d6a..debd5244abce510aceaccde788fd523158ac98b7 100644 GIT binary patch delta 934 zcma)4J&)5s5RGG$$a6=CH;MA06c=;cA)|{NCvZ->fWkEhe!|McdzV$>55;S7M-w5X zL?-M|m&dQ4m~ zK-bf>s-+D7wuTH>^L73;*OW7GkFQD<{<&4;SjqEOioSphZ0U^lM(%UsJh3mx{HZ^8 z?HPS(ueli5kC>2w^mK$1x$ec<9Kq8e=jbpMyTBwKcI*6Q(ZLZTYV&~?>4TnGyjv_<$3i>@p5_h`_3QO+b^O3 delta 5521 zcmb_gO>g7I8P=yI&w4kmlHGL(7x8MlT}0Tp9DdEvC{iWcY*M#wn>Yz_3Zb>xSXf(f zML8Sv!WR7l1qAe*-g^mx-h0WhKcGMl{QS;aMyZLzVh_yc*onFL)oz422ck;(k5RK;Z z+1{R-+i%rswc+RW7TfPNT-&-)EaSET^;!;f4^VfpCe_<o#k( z6tvZ9q@F@FZ4FFYS0R>d?ba2X0VoX>g;P~AjR$2Ng;UY7q#`$s7XO3MqH1&j_--$4 
za&8`q2n`+GoW_UUdWlF8mX$QkeitX>m^@0;>}ANb@Kd>DxQY2w!ZfLrW_+x09a#yG z-zq;Oup&ZI3N~q;^eVLSNw;dl?I+|m2SD=!YooA(bMoult#@+@`v|zL{BeXun!$8A zd^HU z`!^b(^N-fIR=diA4=Wb9hJVSNp-0_jg`uq+H@|nY0WtGBTRR^m={&M~4Jq*-THV9* z)A2m=-F{n*1~*u1^w~Js0+o%yE!IwGepnyYKd*P#kXh%`WIP{FCsBO%G(J8aPd-U@ zpU2}*PUdW*m>VA_C~n;9zCD_s#IuO^*Yt>m>`wZ8Iy=5Xe0@AQznDi_)-Gi2LUzEE z)}d{&!(U&-@o(bD_xhb|tTm65AvEtWS{wEnAvf0gdTk29^5QMlaBTc@DD7D7H;#`Z zr{Bbl-NRD{PhD&6@hF)`ZoftM!8&Wqr(58p!G3}zE|z%K#(VMc#Zi0^k0z1VXP4gw zFA=Fw?LYKM=|Xf4USQp4aeRI}K1(zfDdOVa1DFW8o-f^N2hB z77ZB5gqQ?@E(rYb_|u5H{pPC}us)uHHP^E`idgPdnSFDyS>N5p8m#lJwS$Y(h=UH? z;fQ#|UR`|Ay1n>l^QA+q2r{h#tj$H`0ucQ*J!BTGK(4f1M1jgf zfj|_{i0w01VNaTD@yEvP6cB(3vn{0F;s=a`>EaQLZ>?wU9)!8imcmJC4Rt(_=z%fM z$-|F_%i-6c@Y~G8FYONTM}7_O@N8B|puGkP(yjMmoll=!B$QM;*>@$xD?-pf@FgK$ z7IHEnRWda;4pS~~SOJA@?2)7F1R^*mGVu`#GaTFpE2pAbU);enU z(Fn;gNc(GgtRore@!u3G^ies0p z>*upLiP2CYDQ9${mnrNvs|kArc*GP!DGZB<3|uoJ!;nG=**}X&Peo*kkVLrZNs&(q zo`-8b#|uHB1tE7olf%1~PBrq1RHH)KrcmazKh8c6<$yvtr7BjAWhEgb!7sNEYK!U)+XU{=#ZPGqBseMv!q$0TX-jNz-`Xh7JrdV>0qqM36gfje zBF+l=Wxt_)d(D({l(%A0B*aRQJ$4K4sPG8t5za=*GqTvDi;rx~;Wl{tAgv{i@pOC? zN7BYI(ufhKT+%Wk)`9H~edSI{2^JMGm!NJ%Nf1X4VLCpo5V}gptV*a^1)ye;zKKAI z@<=6B&4!RgQNsmQjS7TBDxeCZW=RdyHrS8qri8XY$4N~bR0wyfH7K+9+c-=iIh z-U!k1ZCa4;DmYwHpWMOs(Qn{5IezU>=UL3_=`f%6jA^0)^$8bYb*NQBK^QfMa#0(%B7dRTvh?DXyuePzkA)Ic7i{ z2d>qj8imNv>G#1ynj&5UFQ*?3M8%zczW8TT?9eJ&3bs`LDsS9X><+n%a9Hfjf3_}FKgZDIMuwKMj29W>t+4B znx-_b!X7`pX}|QcCSEB^WT00mD=WXXU!n>FkDMK~U7x9+HS^?`DA2gjyP@(+G*r5u zSDI1+qB`uLNNc~uyQ69g$S?7frts#c{Q>W16~Fx9;!o`dJG6?Hf-U8jewMTM2*hDK zi-Q-p?>-%m5?X{1P@d=O==pMI?Qff}zo>qpqMEYb&EKL2#Y3^5zxmcKAIg3D0Z@Iu K{QAz{Z~PZ;b8oZ& diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h index 50859d826fa2d..e1e2e8a542325 100644 --- a/onnxruntime/test/util/include/test_utils.h +++ b/onnxruntime/test/util/include/test_utils.h @@ -15,11 +15,17 @@ class Graph; namespace test { +// If set to All: verify the entire graph is taken by ep +// If set to Some: verify that at least one node is assigned to ep +// If set to None: verify that no nodes is assigned to ep (typically for an expected failure path test case) +enum class ExpectedEPNodeAssignment { None, + Some, + All, }; + // struct to hold some verification params for RunAndVerifyOutputsWithEP struct EPVerificationParams { - // Verify the entire graph is taken by the EP - // if this is set to false, then will verify that at least one node is assigned to 'execution_provider' - bool verify_entire_graph_use_ep{false}; + + ExpectedEPNodeAssignment ep_node_assignment = ExpectedEPNodeAssignment::Some; // Some EP may use different rounding than ORT CPU EP, which may cause a bigger abs error than // the default of 1e-5f, especially for scenarios such as [Q -> Quantized op -> DQ] diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc index b069b08810cb8..ac7996581225f 100644 --- a/onnxruntime/test/util/test_utils.cc +++ b/onnxruntime/test/util/test_utils.cc @@ -123,9 +123,12 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id // make sure that some nodes are assigned to the EP, otherwise this test is pointless... 
const auto& graph2 = session_object2.GetGraph(); auto ep_nodes = CountAssignedNodes(graph2, provider_type); - if (params.verify_entire_graph_use_ep) { + if (params.ep_node_assignment == ExpectedEPNodeAssignment::All) { // Verify the entire graph is assigned to the EP ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type; + } else if (params.ep_node_assignment == ExpectedEPNodeAssignment::None) { + // Check if expected failure path is correctly handled by ep. (only used in NNAPI EP QDQ model test case for now) + ASSERT_EQ(ep_nodes, 0) << "No nodes are supposed to be assigned to " << provider_type; } else { ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; } diff --git a/orttraining/orttraining/eager/opgen/opgen/atenops.py b/orttraining/orttraining/eager/opgen/opgen/atenops.py index 8bb882571bdbb..1d80f2a48b9ae 100644 --- a/orttraining/orttraining/eager/opgen/opgen/atenops.py +++ b/orttraining/orttraining/eager/opgen/opgen/atenops.py @@ -8,6 +8,11 @@ from opgen.onnxops import * +import torch +from packaging import version + +TORCH_API_CHANGE_VERSION = "1.11.1" + kMSDomain = 'onnxruntime::kMSDomain' class ReluGrad(ONNXOp): @@ -79,7 +84,6 @@ def __init__(self, dY, X): 'aten::softshrink': Shrink('self', bias='lambd', lambd='lambd'), #yes, bias is set to 'lambd' 'aten::hardshrink': Shrink('self', bias=0, lambd='lambd'), 'aten::gelu' : Gelu('self'), - 'aten::gelu_backward' : GeluGrad('grad', 'self'), 'aten::max' : ReduceMax('self', keepdims=1), 'aten::min' : ReduceMin('self', keepdims=1), 'aten::_cat': Concat('tensors', 'dim'), @@ -95,6 +99,13 @@ def __init__(self, dY, X): 'aten::gt.Scalar_out' : MakeTorchFallback(), } +# Signature of gelu_backward was changed in this commit id 983ba5e585485ed61a0c0012ef6944f5685e3d97 and PR 61439 +# This is done to make sure it is backward and future compatible +if version.parse(torch.__version__) < version.parse(TORCH_API_CHANGE_VERSION): + hand_implemented['aten::gelu_backward'] = GeluGrad('grad', 'self') +else: + hand_implemented['aten::gelu_backward'] = GeluGrad('grad_output', 'self') + ops = {**ops, **hand_implemented} # TODO: this is a temporary allowlist for ops need type promotion # Need to enhance the support for onnx type constrains to automatically diff --git a/orttraining/orttraining/eager/ort_eager_common.h b/orttraining/orttraining/eager/ort_eager_common.h index 3de3c2d1b8266..e7f54b8d33c68 100644 --- a/orttraining/orttraining/eager/ort_eager_common.h +++ b/orttraining/orttraining/eager/ort_eager_common.h @@ -4,4 +4,5 @@ #pragma once // include the pybind header first, it will disable linking to pythonX_d.lib on Windows in debug mode #include "python/onnxruntime_pybind_state_common.h" -#include \ No newline at end of file +#include +#include \ No newline at end of file diff --git a/orttraining/orttraining/eager/test/ort_eps_test.py b/orttraining/orttraining/eager/test/ort_eps_test.py index 9122b09b2134f..9a5c8ba32b914 100644 --- a/orttraining/orttraining/eager/test/ort_eps_test.py +++ b/orttraining/orttraining/eager/test/ort_eps_test.py @@ -120,13 +120,12 @@ def test_import_custom_eps(self): ort_device = torch_ort.device(1) assert 'My EP provider created, with device id: 0, some_option: val' in out.capturedtext - #disable the print test for now as we need to merge a PR to pytorch first. 
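The aten::gelu_backward mapping above is selected at import time because PyTorch renamed the first gradient argument from `grad` to `grad_output`. For reference, a minimal standalone sketch of that version-gating pattern, assuming only that `torch` and `packaging` are importable; the registry dict and helper below are illustrative, not the real opgen tables, while the 1.11.1 cut-off mirrors the change above.

import torch
from packaging import version

# Version at which the upstream gelu_backward signature changed (per the patch above).
TORCH_API_CHANGE_VERSION = "1.11.1"

def gelu_backward_args():
    # Older PyTorch exposes the gradient input as 'grad', newer releases as 'grad_output'.
    if version.parse(torch.__version__) < version.parse(TORCH_API_CHANGE_VERSION):
        return ("grad", "self")
    return ("grad_output", "self")

# Hypothetical registry keyed the same way the opgen tables are populated.
hand_implemented = {"aten::gelu_backward": gelu_backward_args()}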
- #def test_print(self): - # x = torch.ones(1, 2) - # ort_x = x.to('ort') - # with OutputGrabber() as out: - # print(ort_x) - # assert "tensor([[1., 1.]], device='ort:0')" in out.capturedtext + def test_print(self): + x = torch.ones(1, 2) + ort_x = x.to('ort') + with OutputGrabber() as out: + print(ort_x) + assert "tensor([[1., 1.]], device='ort:0')" in out.capturedtext if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp b/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp index 30849d15768bc..9d9ac0cbb5682 100644 --- a/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp +++ b/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp @@ -38,8 +38,7 @@ inline std::unique_ptr read_mnist_file(const std::string& path, uint32_t file.open(path, std::ios::in | std::ios::binary | std::ios::ate); if (!file) { - std::cout << "Error opening file " << path << std::endl; - std::cout << std::system_error(errno, std::system_category(), "failed to open " + path).what(); + std::cout << "Error opening file " << path << " - system error " << errno << std::endl; return {}; } diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index a721fc64a2615..cd3cd1e66c3f6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -257,22 +257,28 @@ def permute_and_reshape_tensor(g, tensor, is_lhs, rank, perm, matmul_output_axes remaining_axes = [axis for axis in range(rank) if axis not in axes_to_remove] # Calculate the new shape, use 0 or -1 if possible. shape_tensors = [] - all_zeros = True + before_contiguous_axes = True + last_zero_dim = -1 + has_neg_one_dim = False for axis in remaining_axes: if axis == first_matmul_output_axis: shape_tensors.append(matmul_output_numel_tensor) - all_zeros = False + before_contiguous_axes = False elif axis == first_contraction_axis: shape_tensors.append(contraction_numel_tensor) - all_zeros = False - elif all_zeros: + before_contiguous_axes = False + elif before_contiguous_axes: shape_tensors.append(g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))) + last_zero_dim = len(shape_tensors) - 1 elif axis == remaining_axes[-1]: shape_tensors.append(g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + has_neg_one_dim = True else: single_axis_shape_tensor, _, shape_tensor = get_shape_tensor_by_axes( g, tensor, shape_tensor, [axis], False) shape_tensors.append(single_axis_shape_tensor) + if not has_neg_one_dim and last_zero_dim >= 0: + shape_tensors[last_zero_dim] = g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) # Adjust the perm. perm = [axis for axis in perm if axis not in axes_to_remove] new_axis = 0 @@ -458,16 +464,22 @@ def einsum(g, equation, tensor_list): # Need to Reshape the result for the example, the new shape is [size(s), size(m)]. 
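The shape tensors assembled in the einsum export path follow ONNX Reshape conventions: with the default allowzero=0, a 0 entry copies the input dimension at that position and a single -1 entry is inferred from the remaining element count; the last_zero_dim/has_neg_one_dim bookkeeping swaps one of the 0 entries for -1 when no -1 was emitted, so a dimension is inferred rather than copied whenever possible. A small pure-Python sketch of that resolution rule, for reference only (this is not the exporter code):

def resolve_reshape(input_shape, target_shape):
    # 0 copies the corresponding input dimension (assumes that position exists in input_shape).
    resolved = [input_shape[i] if d == 0 else d for i, d in enumerate(target_shape)]
    if resolved.count(-1) > 1:
        raise ValueError("Reshape allows at most one -1 entry")
    if -1 in resolved:
        total = 1
        for d in input_shape:
            total *= d
        known = 1
        for d in resolved:
            if d != -1:
                known *= d
        resolved[resolved.index(-1)] = total // known
    return resolved

# e.g. resolve_reshape([2, 3, 4], [0, -1]) == [2, 12]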
if len(lhs_matmul_output_axes) != 1 or len(rhs_matmul_output_axes) != 1: shape_tensors = [g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))] * len(batched_axes) + last_zero_dim = len(shape_tensors) - 1 + has_neg_one_dim = False if lhs_matmul_output_axes: if len(lhs_matmul_output_axes) == 1: shape_tensors.append(g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))) + last_zero_dim = len(shape_tensors) - 1 else: shape_tensors.append(lhs_matmul_output_shape_tensor) if rhs_matmul_output_axes: if len(rhs_matmul_output_axes) == 1: shape_tensors.append(g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + has_neg_one_dim = True else: shape_tensors.append(rhs_matmul_output_shape_tensor) + if not has_neg_one_dim and last_zero_dim >= 0: + shape_tensors[last_zero_dim] = g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) result = reshape_tensor(g, result, shape_tensors) # Now output axes is ordered by [batched_axes, lhs_matmul_output_axes, rhs_matmut_output_axes], diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 7a1f0a07bc1f1..359f09114da70 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -311,6 +311,10 @@ def _export_model(self, *inputs, **kwargs): # Model is not re-exported when the model parameters change. This can happen when the model is a stateful model, # or the user explicitly changed model parameters after the onnx export. + # Record random states here and restore later in case any of them gets changed during the export, + # e.g., some sympy functions in symbolic_shape_infer will change Python's random state. + random_states = _utils.get_random_states() + schema = _io._extract_schema( {'args': copy.copy(inputs), 'kwargs': copy.copy(kwargs)}) if self._onnx_models.exported_model and schema == self._input_info.schema and not self._original_model_has_changed: @@ -329,6 +333,9 @@ def _export_model(self, *inputs, **kwargs): self._onnx_models.exported_model = SymbolicShapeInference.infer_shapes(self._onnx_models.exported_model, auto_merge=True, guess_output_rank=True) + # Restore the recorded random states + _utils.set_random_states(random_states) + return True def _get_exported_model(self, input_schema, *inputs, **kwargs): diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index eb8d1ec3a9328..b617d574460b9 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -16,7 +16,6 @@ from ._fallback import (ORTModuleFallbackException, _FallbackPolicy, _FallbackManager) -from .torch_cpp_extensions.cpu.torch_interop_utils import clear_all_grad_fns from onnxruntime.capi import _pybind_state as C from onnxruntime.capi.onnxruntime_inference_collection import get_ort_device_type @@ -40,10 +39,6 @@ def __init__(self, model, debug_options: DebugOptions, fallback_manager: _Fallba def execution_session_run_forward(execution_session, onnx_model, device, gradient_accumulation_manager, *inputs): """Runs the forward graph on execution_session with given model inputs and device""" - # Clear all gradient functions, to avoid a deadlock issue. - # Check the called function for more detailed comments. 
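The random-state handling added to _export_model (backed by get_random_states/set_random_states in _utils.py) can also be expressed as a context manager. A sketch under the assumption that only the global Python, NumPy and torch (plus optional CUDA) generators need to be preserved; preserve_random_states and export_model are illustrative names, not ORTModule APIs.

import contextlib
import random
import numpy as np
import torch

@contextlib.contextmanager
def preserve_random_states():
    # Snapshot the RNG states before the wrapped block runs.
    states = (random.getstate(),
              np.random.get_state(),
              torch.get_rng_state(),
              torch.cuda.get_rng_state() if torch.cuda.is_available() else None)
    try:
        yield
    finally:
        r_state, np_state, torch_state, cuda_state = states
        random.setstate(r_state)
        np.random.set_state(np_state)
        torch.set_rng_state(torch_state)
        if cuda_state is not None:
            torch.cuda.set_rng_state(cuda_state)

# Usage: wrap any step that may disturb global RNG state, such as the ONNX export
# (some sympy calls made during symbolic shape inference change Python's random state).
# with preserve_random_states():
#     export_model(...)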
- clear_all_grad_fns() - # TODO: Try to reuse the output buffers as some of the output tensors are same sizes, # especially the backward graph outputs. # REVIEW(codemzs): Consolidate Training Agent with InferenceAgent on C++ side to not diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index 23dfea316a3eb..534efd2804ba8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -25,6 +25,24 @@ import types import warnings from distutils.version import LooseVersion +import random +import numpy as np + +def get_random_states(): + r_state = random.getstate() + np_state = np.random.get_state() + torch_state = torch.get_rng_state() + torch_cuda_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + return r_state, np_state, torch_state, torch_cuda_state + +def set_random_states(states): + r_state, np_state, torch_state, torch_cuda_state = states + random.setstate(r_state) + np.random.set_state(np_state) + torch.set_rng_state(torch_state) + if torch_cuda_state is not None: + torch.cuda.set_rng_state(torch_cuda_state) + def _ortvalue_from_torch_tensor(torch_tensor): # TODO: Current DLPack doesn't support bool and PyTorch disables converting bool tensor to DLPack in recent commit. diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc index bc930899f40d6..a8445bf64f99d 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc @@ -114,16 +114,27 @@ void unregister_grad_fn(size_t ctx_address) PyNodeSharedPointerPool::GetInstance().UnRegisterGradFunc(ctx_address); } -// Supposed to be cleared on python program exit or before every forward run to resolve following issues: -// 1. When training program exits, PyNodeSharedPointerPool destructor is called, if grad_fns_ is not empty, +// Supposed to be cleared on python program exit to resolve following issue: +// When training program exits, PyNodeSharedPointerPool destructor is called, if grad_fns_ is not empty, // PyNode::release_variables() will be called. // (https://github.com/pytorch/pytorch/blob/15532595209d2daf34d35e10f8d3d3b64966aea2/torch/csrc/autograd/python_function.cpp#L168) // The other hand, there is known issue when acquiring GIL in pybind11 destructors, there will be probabbly deadlock issue. // (https://github.com/pybind/pybind11/issues/1446) // The resolution here, we remove all maintained states before program exits. -// 2. When forward functions is called repeated without corresponding backward calls, grad functions keeps accumulating without releasing -// (happening in backward) -void clear_all_grad_fns(){ + +// A known existing issue: when forward functions is called repeatedly without corresponding backward calls, +// grad functions keeps accumulating without releasing, there might be memory (bound to those gradient function) leaks. +// Ideally this usually won't happen in real training case, so it should be fine. + +// We CANNOT explictly clear grad functions before each forward pass to mitigate the known issue above. 
+// For example: +// loss1 = forward_run(inputs1) +// loss2 = forward_run(inputs2) +// loss = loss1 + loss2 +// loss.backward() +// If we clear grad functions in the beggining of the second `forward_run`, when `loss.backward()` runs, +// the backward path of `loss1` will fail to run PythonOpGrad ops (if there is any). +void clear_all_grad_fns() { PyNodeSharedPointerPool::GetInstance().ClearAll(); } diff --git a/orttraining/orttraining/test/gradient/allreduce_op_test.cc b/orttraining/orttraining/test/gradient/allreduce_op_test.cc index 1528ac6f5c705..9378c751a32cb 100644 --- a/orttraining/orttraining/test/gradient/allreduce_op_test.cc +++ b/orttraining/orttraining/test/gradient/allreduce_op_test.cc @@ -43,7 +43,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestReduceTwoTensors) { allreduce_test.AddOutput("G_new1", {3}, output_grad); allreduce_test.AddOutput("G_new2", {3}, output_grad); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -86,7 +86,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestReduceTwoTensorsFP16) { allreduce_test.AddOutput("G_new1", {3}, output_grad_half); allreduce_test.AddOutput("G_new2", {3}, output_grad_half); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -112,7 +112,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestFailTensorCountMismatch) { allreduce_test.AddOutput("G_new1", {3}, {5.6301f, 6.5235f, 7.4169f}); allreduce_test.AddOutput("G_new2", {3}, {5.6301f, 6.5235f, 7.4169f}); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -224,18 +224,8 @@ void build_optimizer_node(Graph& graph, auto& optimizer_node = graph.AddNode(input_gradient->Name() + "_adam_optimizer", "AdamOptimizer", "Adam optimizer.", optimizer_inputs, optimizer_outputs, nullptr /*attributes*/, kMSDomain); - ONNX_NAMESPACE::AttributeProto bias_correction_attribute, weight_decay_mode_attribute; - - bias_correction_attribute.set_name("do_bias_correction"); - bias_correction_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - bias_correction_attribute.set_i(0); - - weight_decay_mode_attribute.set_name("weight_decay_mode"); - weight_decay_mode_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - weight_decay_mode_attribute.set_i(0); - - optimizer_node.AddAttribute("do_bias_correction", bias_correction_attribute); - optimizer_node.AddAttribute("weight_decay_mode", weight_decay_mode_attribute); + optimizer_node.AddAttribute("do_bias_correction", int64_t{0}); + optimizer_node.AddAttribute("weight_decay_mode", int64_t{0}); } using AllreduceGraphConfigVector = std::vector(config[i]) + "_scaled_grad", "MixedPrecisionScale", "scale grad", scale_grad_inputs, {&scale_grad_output_arg}, nullptr /*attributes*/, kMSDomain); - ONNX_NAMESPACE::AttributeProto scale_attribute; - scale_attribute.set_name("to"); - scale_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - scale_attribute.set_i(static_cast(element_type)); - scaled_grad_node.AddAttribute("to", scale_attribute); + scaled_grad_node.AddAttribute("to", int64_t{element_type}); } // Set 
inputs of next node to be outputs of scale node. inputs.clear(); @@ -359,19 +340,9 @@ void build_allreduce_graph(Graph& graph, AllreduceGraphConfigVector& config, auto& allreduce_node = graph.AddNode("node_allreduce", allreduce_op_name, "node allreduce.", inputs, allreduce_outputs, nullptr /*attributes*/, kMSDomain); if (adasum_reduce_type != training::AdasumReductionType::None) { - // Attribute - ONNX_NAMESPACE::AttributeProto adasum_reduction_type_attribute; - adasum_reduction_type_attribute.set_name("reduce_algo"); - adasum_reduction_type_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - adasum_reduction_type_attribute.set_i(static_cast(adasum_reduce_type)); - allreduce_node.AddAttribute("reduce_algo", adasum_reduction_type_attribute); + allreduce_node.AddAttribute("reduce_algo", static_cast(adasum_reduce_type)); } else { - // Attribute - ONNX_NAMESPACE::AttributeProto group_type_attribute; - group_type_attribute.set_name("group_type"); - group_type_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - group_type_attribute.set_i(0 /*data parallel*/); - allreduce_node.AddAttribute("group_type", group_type_attribute); + allreduce_node.AddAttribute("group_type", int64_t{0} /*data parallel*/); } if (build_optimizer) { diff --git a/orttraining/orttraining/test/gradient/gradient_checker.cc b/orttraining/orttraining/test/gradient/gradient_checker.cc index 3013f2686a9ea..417fc2861e36c 100644 --- a/orttraining/orttraining/test/gradient/gradient_checker.cc +++ b/orttraining/orttraining/test/gradient/gradient_checker.cc @@ -263,7 +263,7 @@ inline Status GradientChecker::InitOpTesterWithGraph( } // Currently only allows setting int attributes to zero. TODO: Expand this for (auto attr : attributes) { - op_session.AddAttribute(attr.name(), attr); + op_session.AddAttributeProto(attr); } // build graph diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index b190a219e604f..aea0ed2fef134 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -215,88 +215,84 @@ def assert_values_are_close(input, other, rtol=1e-05, atol=1e-06): def enable_custom_autograd_function(module): enable_custom_autograd_support() -def run_with_pytorch_on_device(device, model, input_list, label_input, is_eval_mode=False): - with torch.no_grad(): - model = copy.deepcopy(model).to(device) +def _run_model_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): if is_eval_mode: model.eval() else: model.train() - with torch.no_grad(): - inputs_on_device = [input_.to(device) for input_ in input_list] - for i, val in enumerate(input_list): - if val.requires_grad: - inputs_on_device[i].requires_grad_() - target = label_input.to(device) - - output = model(*inputs_on_device) - forward_outputs = [output] + def generate_inputs(input_list_, label_input_): + with torch.no_grad(): + inputs_on_device = [input_.to(device) for input_ in input_list_] + for i, val in enumerate(input_list_): + if val.requires_grad: + inputs_on_device[i].requires_grad_() + with torch.no_grad(): + target = label_input_.to(device) + return inputs_on_device, target + + inputs_on_device1, target1 = generate_inputs(input_list, label_input) + if run_forward_twice is True: + inputs_on_device2, target2 = generate_inputs(input_list, label_input) + + output1 = model(*inputs_on_device1) + if 
run_forward_twice is True: + output2 = model(*inputs_on_device2) + + forward_outputs = [output1] grad_outputs = [] if not is_eval_mode: criterion = torch.nn.MSELoss() - loss = criterion(output, target) + loss = criterion(output1, target1) + + if run_forward_twice is True: + loss += criterion(output2, target2) + loss.backward() for name, param in model.named_parameters(): if param.requires_grad: grad_outputs.append(param.grad) return forward_outputs, grad_outputs -def run_with_ort_on_device(device, model, input_list, label_input, is_eval_mode=False): +def run_with_pytorch_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): + with torch.no_grad(): + model = copy.deepcopy(model).to(device) + + return _run_model_on_device(device, model, input_list, label_input, is_eval_mode, run_forward_twice) + +def run_with_ort_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): with torch.no_grad(): model = copy.deepcopy(model) model.to(device) enable_custom_autograd_function(model) model = ORTModule(model) - if is_eval_mode: - model.eval() - else: - model.train() - - with torch.no_grad(): - inputs_on_device = [input_.to(device) for input_ in input_list] - for i, val in enumerate(input_list): - if val.requires_grad: - inputs_on_device[i].requires_grad_() - - target = label_input.to(device) - output = model(*inputs_on_device) - forward_outputs = [output] - grad_outputs = [] - if not is_eval_mode: - criterion = torch.nn.MSELoss() - loss = criterion(output, target) - loss.backward() - for name, param in model.named_parameters(): - if param.requires_grad: - grad_outputs.append(param.grad) - return forward_outputs, grad_outputs + return _run_model_on_device(device, model, input_list, label_input, is_eval_mode, run_forward_twice) def compare_tensor_list(val_list_a, val_list_b): for val_a, val_b in zip(val_list_a, val_list_b): assert_values_are_close(val_a, val_b, atol=1e-7, rtol=1e-6) def run_training_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, - ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): + run_forward_twice=False, ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): cpu = torch.device("cpu") def cpu_barrier_func(): pass run_training_test_on_device_and_compare( cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cpu_barrier_func, - ignore_grad_compare, expected_outputs, expected_grads) + run_forward_twice, ignore_grad_compare, expected_outputs, expected_grads) def cuda_barrier_func(): torch.cuda.synchronize() cuda = torch.device('cuda:0') run_training_test_on_device_and_compare( cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cuda_barrier_func, - ignore_grad_compare, expected_outputs, expected_grads) + run_forward_twice, ignore_grad_compare, expected_outputs, expected_grads) def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, barrier_func, - ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): + run_forward_twice=False, ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): repeats = 16 for i in range(repeats): m = pt_model_builder_func() @@ -307,11 +303,11 @@ def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_mo x_ort = copy.deepcopy(x) outputs, grads = run_with_pytorch_on_device( - device, m, [x], pt_model_label_input) + device, m, [x], pt_model_label_input, 
run_forward_twice=run_forward_twice) barrier_func() outputs_ort, grads_ort = run_with_ort_on_device( - device, m_ort, [x_ort], pt_model_label_input) + device, m_ort, [x_ort], pt_model_label_input, run_forward_twice=run_forward_twice) barrier_func() val_list_a = [o.detach().cpu() for o in outputs if o is not None] @@ -330,14 +326,16 @@ def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_mo if len(expected_grads) > 0: compare_tensor_list(val_list_a, expected_grads) -def run_evaluate_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input): +def run_evaluate_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + run_forward_twice=False): cpu = torch.device("cpu") def cpu_barrier_func(): pass run_evaluate_test_on_device_and_compare( - cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cpu_barrier_func) + cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + cpu_barrier_func, run_forward_twice=run_forward_twice) def cuda_barrier_func(): torch.cuda.synchronize() @@ -345,9 +343,11 @@ def cuda_barrier_func(): cuda = torch.device('cuda:0') run_evaluate_test_on_device_and_compare( - cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cuda_barrier_func) + cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + cuda_barrier_func, run_forward_twice=run_forward_twice) -def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, barrier_func): +def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, + pt_model_label_input, barrier_func, run_forward_twice=False): repeats = 16 for i in range(repeats): m = pt_model_builder_func() @@ -357,11 +357,11 @@ def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_mo x_ort = copy.deepcopy(x) outputs, grads = run_with_pytorch_on_device( - device, m, [x], pt_model_label_input, is_eval_mode=True) + device, m, [x], pt_model_label_input, is_eval_mode=True, run_forward_twice=run_forward_twice) barrier_func() outputs_ort, grads_ort = run_with_ort_on_device( - device, m_ort, [x_ort], pt_model_label_input, is_eval_mode=True) + device, m_ort, [x_ort], pt_model_label_input, is_eval_mode=True, run_forward_twice=run_forward_twice) barrier_func() val_list_a = [o.detach().cpu() for o in outputs if o is not None] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 0ff12f9040394..acd1b0b6c32b4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -1134,8 +1134,16 @@ def run_step(model, input): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction) _test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad) -@pytest.mark.parametrize("equation", ["s,se->se", "se,sc->sec", "se,se->s", "sec,sm->ecm", - "sec,ecm->sm", "ks,ksm->sm", "kes,ems->mek", "kes,ksm->ms"]) +# In PyTorch 1.11.0, there is issue during reduce node shape handling for exporter, so any sub-graph that +# contains ReduceProd will fail to run, for example, "sec,sm->ecm", "sec,ecm->sm". +# Currently skip these cases and test_gradient_correctness_einsum_2, +# will enable these tests again once the issue in PyTorch is fixed. 
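The skip_torch_1_11 marker introduced below limits the skip to the einsum equations that hit the PyTorch 1.11 export issue rather than disabling the whole parametrized test. The same pattern in isolation, assuming pytest, torch and packaging are available; the equation list and assertion are placeholders (the actual test keys the check on distutils' LooseVersion):

import pytest
import torch
from packaging import version

# Skip only the parametrize entries that are known to break on newer torch.
skip_torch_1_11 = pytest.mark.skipif(
    version.parse(torch.__version__) >= version.parse("1.11.0"),
    reason="PyTorch 1.11 einsum export issue")

@pytest.mark.parametrize("equation", [
    "s,se->se",                                             # runs everywhere
    pytest.param("sec,sm->ecm", marks=[skip_torch_1_11]),   # skipped on torch >= 1.11
])
def test_einsum_equation(equation):
    assert "->" in equation  # placeholder body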
+skip_torch_1_11 = pytest.mark.skipif(LooseVersion(torch.__version__) >= LooseVersion('1.11.0'), reason="PyTorch 1.11 incompatible") +@pytest.mark.parametrize("equation", [ + "s,se->se", "se,sc->sec", "se,se->s", "ks,ksm->sm", "kes,ems->mek", "kes,ksm->ms", + pytest.param("sec,sm->ecm", marks=[skip_torch_1_11]), + pytest.param("sec,ecm->sm", marks=[skip_torch_1_11]) +]) def test_gradient_correctness_einsum(equation): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): @@ -1183,6 +1191,7 @@ def run_step(model, input_left, input_right): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-3) _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, atol=1e-3, rtol=1e-3) +@skip_torch_1_11 def test_gradient_correctness_einsum_2(): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): @@ -4818,3 +4827,43 @@ def run_step(model, x): _test_helpers.assert_values_are_close(pt_loss, ort_loss) _test_helpers.assert_values_are_close(pt_x.grad, ort_x.grad) assert ortmodule_module.ONNX_OPSET_VERSION == DEFAULT_OPSET + + +def test_random_states_unchanged_for_ortmodule(): + import numpy + + os.environ['ORTMODULE_FALLBACK_RETRY'] = 'False' + + class NeuralNetSlice(torch.nn.Module): + def __init__(self): + super(NeuralNetSlice, self).__init__() + self.dim = 32 + + def forward(self, x): + # This slice operation will call sympy.Min() when exporting, which will change Python's random state + return x[:self.dim, :] + + def random_state_equal(a, b): + assert type(a) == type(b) + if isinstance(a, tuple): + assert len(a) == len(b) + return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b)]) + if isinstance(a, numpy.ndarray): + return numpy.array_equal(a, b) + if isinstance(a, torch.Tensor): + return torch.equal(a, b) + return a == b + + model = NeuralNetSlice() + x = torch.randn(16, 16) + + ori_random_states = _utils.get_random_states() + + ort_model = ORTModule(model) + ort_model(x) + + new_random_states = _utils.get_random_states() + + assert random_state_equal(ori_random_states, new_random_states) + + del os.environ['ORTMODULE_FALLBACK_RETRY'] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py index 3b2e6bc6a38f6..a3f118380c6af 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py @@ -138,6 +138,61 @@ def input_generator(): run_training_test_and_compare(model_builder, input_generator, label_input) +def test_GeLU_multiple_forward_runs(): + @torch.jit.script + def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + @torch.jit.script + def bias_gelu_backward(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + + class GeLUFunction3(torch.autograd.Function): + @staticmethod + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_backward(grad_output, bias, input) + return tmp, tmp + + class GeLUModel(torch.nn.Module): + def __init__(self, output_size): + super(GeLUModel, self).__init__() + self.relu = GeLUFunction3.apply 
+ self.bias = Parameter(torch.empty( + output_size, + device=torch.cuda.current_device(), + dtype=torch.float)) + + with torch.no_grad(): + self.bias.uniform_() + + def forward(self, model_input): + out = self.relu(model_input, self.bias) + return out + + output_size = 1024 + + def model_builder(): + return GeLUModel(output_size) + + def input_generator(): + return torch.randn(output_size, dtype=torch.float) + + # generate a label that have same shape as forward output. + label_input = torch.ones([output_size]) + + run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) + def test_MegatronF(): # MegatronGFunction is tested in distributed test files. class MegatronFFunction(torch.autograd.Function): diff --git a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc index 767ad6051d2cb..5ac1bba684d76 100644 --- a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc +++ b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc @@ -86,7 +86,7 @@ std::unique_ptr CreateOpTester(const onnxruntime::training::OpDef& op_def, int opset_version) { auto test = std::make_unique(op_def.type.c_str(), opset_version, op_def.domain.c_str()); for (auto attr : attributes) - test->AddAttribute(attr.name(), attr); + test->AddAttributeProto(attr); auto input_index = 0; for (auto& data : input_data) { diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 11efeb6d896f6..c7a95638ea622 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -1,10 +1,16 @@ +# Known Limits +# 1. Anchors are not supported in GHA +# https://github.community/t/support-for-yaml-anchors/16128/90 +# 2. 
Nested Virutalizaiton isn't supported in Azure pipeline +# https://developercommunity.visualstudio.com/t/enable-nested-virtualization-on-azure-pipelines/466384 + jobs: -- job: Android_CI - pool: - vmImage: 'macOS-11' - timeoutInMinutes: 180 +- job: Build_CPU_EP + pool: Linux-CPU-2019 + timeoutInMinutes: 30 steps: - # Onnx has no 3.9 python package available yet, need to use python 3.8 to avoid build onnx package + # Onnx has no 3.9 python package available yet, need to use python 3.8 + # to avoid build onnx package # pythonVersion can be updated in Azure pipeline settings # https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=53 - task: UsePythonVersion@0 @@ -12,7 +18,7 @@ jobs: inputs: versionSpec: $(pythonVersion) - - script: brew install coreutils ninja + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) @@ -27,12 +33,133 @@ jobs: displayName: Build Host Protoc - script: | - python3 tools/python/run_android_emulator.py \ + export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk + export ANDROID_HOME=/usr/local/lib/android/sdk + export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle + export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle + env | grep ANDROID + displayName: Set Android ENVs + + # Start switching to jdk 11 after the Android Emulator is started + # since Android SDK manager requires java 8 + - task: JavaToolInstaller@0 + displayName: Use jdk 11 + inputs: + versionSpec: '11' + jdkArchitectureOption: 'x64' + jdkSourceOption: 'PreInstalled' + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=30 \ + --skip_submodule_sync \ + --parallel \ + --cmake_generator=Ninja \ + --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ + --build_java \ + --skip_tests + displayName: CPU EP, Build + + - task: CopyFiles@2 + displayName: Copy apks + inputs: + contents: 'build/**/*.apk' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: CopyFiles@2 + displayName: Copy test data + inputs: + contents: 'build/**/testdata/**' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: CopyFiles@2 + displayName: Copy test executables + inputs: + contents: 'build/Debug/*' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: PublishBuildArtifacts@1 + inputs: + pathToPublish: $(Build.ArtifactStagingDirectory) + artifactName: CPUBuildOutput + +- job: Test_CPU_EP + pool: + vmImage: 'macOS-11' + dependsOn: Build_CPU_EP + condition: succeeded() + steps: + - task: DownloadPipelineArtifact@2 + inputs: + source: 'current' + artifact: 'CPUBuildOutput' + path: $(Build.SourcesDirectory) + + - script: | + python3 tools/python/run_android_emulator.py \ --android-sdk-root ${ANDROID_SDK_ROOT} \ --create-avd --system-image "system-images;android-30;google_apis;x86_64" \ --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid - displayName: Start Android emulator + displayName: Start Android emulator + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=30 \ + --test + displayName: CPU EP, 
Test on Android Emulator + + - script: | + python3 tools/python/run_android_emulator.py \ + --android-sdk-root ${ANDROID_SDK_ROOT} \ + --stop \ + --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid + displayName: Stop Android emulator + condition: always() + +- job: Build_NNAPI_EP + pool: Linux-CPU-2019 + timeoutInMinutes: 30 + steps: + - task: UsePythonVersion@0 + displayName: Use Python $(pythonVersion) + inputs: + versionSpec: $(pythonVersion) + + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build + displayName: Install coreutils and ninja + + - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) + displayName: Setup gradle wrapper to use gradle 6.8.3 + + # We build the host protoc to /protobuf_install + - script: | + /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ + $(Build.SourcesDirectory) \ + $(Build.BinariesDirectory)/protobuf \ + $(Build.SourcesDirectory)/protobuf_install + displayName: Build Host Protoc + + - script: | + export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk + export ANDROID_HOME=/usr/local/lib/android/sdk + export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle + export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle + env | grep ANDROID + displayName: set Android ENVs # Start switching to jdk 11 after the Android Emulator is started since Android SDK manager requires java 8 - task: JavaToolInstaller@0 @@ -45,39 +172,134 @@ jobs: - script: | python3 tools/ci_build/build.py \ --android \ - --build_dir build \ + --build_dir build_nnapi \ --android_sdk_path $ANDROID_HOME \ --android_ndk_path $ANDROID_NDK_HOME \ --android_abi=x86_64 \ - --android_api=30 \ + --android_api=29 \ --skip_submodule_sync \ --parallel \ + --use_nnapi \ --cmake_generator=Ninja \ --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ - --build_java - displayName: CPU EP, Build and Test on Android Emulator + --build_java \ + --code_coverage \ + --skip_tests + displayName: NNAPI EP, Build - - script: /bin/bash tools/ci_build/github/android/run_nnapi_code_coverage.sh $(pwd) - displayName: NNAPI EP, Build, Test and Get Code Coverage on Android Emulator + - task: CopyFiles@2 + displayName: Copy apks + inputs: + contents: 'build_nnapi/**/*.apk' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - task: PublishPipelineArtifact@0 - displayName: 'Publish code coverage report' + - task: CopyFiles@2 + displayName: Copy test data inputs: - artifactName: "coverage_rpt.txt" - targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - publishLocation: 'pipeline' + contents: 'build_nnapi/**/testdata/**' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests + - task: CopyFiles@2 + displayName: Copy Test Executables + inputs: + contents: 'build_nnapi/Debug/*' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - script: | - python3 tools/python/run_android_emulator.py \ + - task: PublishBuildArtifacts@1 + inputs: + pathToPublish: $(Build.ArtifactStagingDirectory) + artifactName: NNAPIBuildOutput + +- job: Test_NNAPI_EP + pool: + vmImage: 'macOS-11' + dependsOn: Build_NNAPI_EP + condition: succeeded() + steps: + - task: DownloadPipelineArtifact@2 + 
inputs: + source: 'current' + artifact: 'NNAPIBuildOutput' + path: $(Build.SourcesDirectory) + + - task: UsePythonVersion@0 + displayName: Use Python $(pythonVersion) + inputs: + versionSpec: $(pythonVersion) + + - script: | + python3 tools/python/run_android_emulator.py \ --android-sdk-root ${ANDROID_SDK_ROOT} \ - --stop \ + --create-avd --system-image "system-images;android-30;google_apis;x86_64" \ + --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid - displayName: Stop Android emulator - condition: always() + displayName: Start Android emulator + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=29 \ + --use_nnapi \ + --test \ + --code_coverage + displayName: NNAPI EP, Test, CodeCoverage on Android Emulator + + - script: | + python3 -m pip install gcovr && \ + python3 tools/ci_build/coverage.py \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME + displayName: Retrieve runtime code coverage files from the emulator and analyze + + - task: PublishPipelineArtifact@0 + displayName: 'Publish code coverage report' + inputs: + artifactName: "coverage_rpt.txt" + targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' + publishLocation: 'pipeline' + + # used by Build Minimal ORT + - script: brew install coreutils ninja + displayName: Install coreutils and ninja + + # We build the host protoc to /protobuf_install + - script: | + /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ + $(Build.SourcesDirectory) \ + $(Build.BinariesDirectory)/protobuf \ + $(Build.SourcesDirectory)/protobuf_install + displayName: Build Host Protoc + + - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) + displayName: Setup gradle wrapper to use gradle 6.8.3 + + # Start switching to jdk 11 after the Android Emulator is started + # since Android SDK manager requires java 8 + - task: JavaToolInstaller@0 + displayName: Use jdk 11 + inputs: + versionSpec: '11' + jdkArchitectureOption: 'x64' + jdkSourceOption: 'PreInstalled' + + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) + # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator + displayName: Build Minimal ORT with NNAPI and run tests + + - script: | + python3 tools/python/run_android_emulator.py \ + --android-sdk-root ${ANDROID_SDK_ROOT} \ + --stop \ + --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid + displayName: Stop Android emulator + condition: always() - job: Update_Dashboard workspace: @@ -87,7 +309,9 @@ jobs: value: true pool: 'Linux-CPU-2019' condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - dependsOn: Android_CI + dependsOn: + - Test_CPU_EP + - Test_NNAPI_EP steps: - task: DownloadPipelineArtifact@0 displayName: 'Download code coverage report' diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 7447c45feadbc..b215c3ce3be66 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -13,6 +13,11 @@ jobs: inputs: versionSpec: '12.16.3' + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.8' + addToPath: true + - template: 
templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cpu @@ -51,7 +56,7 @@ jobs: inputs: script: | set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) @@ -70,56 +75,58 @@ jobs: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Run Release unit tests' inputs: - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py \ - --build_dir $(Build.BinariesDirectory) \ - --cmake_generator Ninja \ - --config Release \ - --test \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests \ - --enable_transformers_tool_test \ - --build_nodejs \ + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py + workingDirectory: $(Build.BinariesDirectory)/Release + arguments: >- + --build_dir $(Build.BinariesDirectory) + --cmake_generator Ninja + --config Release + --test + --skip_submodule_sync + --build_shared_lib + --parallel + --build_wheel + --enable_onnx_tests + --enable_transformers_tool_test + --build_nodejs --ctest_path "" - task: CmdLine@2 displayName: 'Install Debug python package' inputs: script: | + set -e -x rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Run Debug unit tests' inputs: - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py \ - --build_dir $(Build.BinariesDirectory) \ - --cmake_generator Ninja \ - --config Debug \ - --test \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests \ - --enable_transformers_tool_test \ - --build_nodejs \ + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py + workingDirectory: $(Build.BinariesDirectory)/Debug + arguments: >- + --build_dir $(Build.BinariesDirectory) + --cmake_generator Ninja + --config Debug + --test + --skip_submodule_sync + --build_shared_lib + --parallel + --build_wheel + --enable_onnx_tests + --enable_transformers_tool_test + --build_nodejs --ctest_path "" - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Symbolic shape infer' inputs: - script: | - cd $(Build.BinariesDirectory)/Release - python3 $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + workingDirectory: $(Build.BinariesDirectory)/Release - task: PublishTestResults@2 displayName: 'Publish unit test results' diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml 
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 8730a7ba2971a..c1c994d80b6ee 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -96,7 +96,7 @@ jobs:
        # We assume the machine doesn't have gcc and python development header files
        sudo rm -f /build /onnxruntime_src
        sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src
-        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
        cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
        # Test ORT with the latest ONNX release.
        export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER)
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
index d3b8e10e2f2ff..317f36cde0922 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
@@ -13,6 +13,11 @@ jobs:
    inputs:
      versionSpec: '12.16.3'

+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: '3.8'
+      addToPath: true
+
  - template: templates/get-docker-image-steps.yml
    parameters:
      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cpu
@@ -50,7 +55,7 @@ jobs:
    inputs:
      script: |
        set -e -x
-        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
        cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
        # Test ORT with the latest ONNX release.
        export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER)
@@ -71,34 +76,35 @@ jobs:
        rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
        python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl

-  - task: CmdLine@2
+
+  - task: PythonScript@0
    displayName: 'Run Release unit tests'
    inputs:
-      script: |
-        cd /tmp
-        python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Release --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path ""
+      scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py
+      workingDirectory: $(Build.BinariesDirectory)/Release
+      arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Release --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path ""

  - task: CmdLine@2
    displayName: 'Install Debug python package'
    inputs:
      script: |
+        set -e -x
        rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11
-        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+        python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
        python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl

-  - task: CmdLine@2
+  - task: PythonScript@0
    displayName: 'Run Debug unit tests'
    inputs:
      scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py
      arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Debug --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path ""
-      workingDirectory: /tmp
+      workingDirectory: $(Build.BinariesDirectory)/Debug

-  - task: CmdLine@2
+  - task: PythonScript@0
    displayName: 'Symbolic shape infer'
    inputs:
-      script: |
-        cd $(Build.BinariesDirectory)/Release
-        python3 $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py
+      scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py
+      workingDirectory: $(Build.BinariesDirectory)/Release

  - task: PublishTestResults@2
    displayName: 'Publish unit test results'
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml
index 512cb806ed67b..e3eb972f69c16 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml
@@ -3,7 +3,7 @@ jobs:
  timeoutInMinutes: 120
  workspace:
    clean: all
-  pool: Linux-CPU-2019
+  pool: onnxruntime-training-linux-ext-custom-ops
  steps:
  - checkout: self
    clean: true
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
index 609bf0f72dce7..c6fd4c8d469c4 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
@@ -341,7 +341,7 @@ stages:
            rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
            sudo rm -f /build /onnxruntime_src
            sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src
-            python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+            python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
            cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
            # Test ORT with the latest ONNX release.
            export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER)
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
index f0ea9431953ad..9fad69a341fec 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -116,7 +116,7 @@ stages:
          rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
          sudo rm -f /build /onnxruntime_src
          sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src
-          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
          cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
          # Test ORT with the latest ONNX release.
          export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER)
@@ -245,7 +245,7 @@ stages:
          rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
          sudo rm -f /build /onnxruntime_src
          sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src
-          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+          python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
          cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt
          # Test ORT with the latest ONNX release.
          export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER)
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
index 90f5546e0cbd7..45f4997380bb6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
@@ -85,7 +85,7 @@ jobs:
        displayName: 'API Documentation Check and generate'

    - script: |
-       python -m pip install -q setuptools wheel numpy six
+       python -m pip install -q setuptools wheel numpy
      workingDirectory: '$(Build.BinariesDirectory)'
      displayName: 'Install python modules'

@@ -173,7 +173,7 @@ jobs:
    - ${{ if eq(parameters.EnablePython, true) }}:
      - powershell: |
-         python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+         python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
         Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
index 1931114aa9c8c..e698956db2ccc 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml
@@ -98,7 +98,7 @@ jobs:
        displayName: 'API Documentation Check and generate'

    - script: |
-       python -m pip install -q setuptools wheel numpy six
+       python -m pip install -q setuptools wheel numpy
      workingDirectory: '$(Build.BinariesDirectory)'
      displayName: 'Install python modules'

@@ -193,7 +193,7 @@ jobs:
    - ${{ if eq(parameters.EnablePython, true) }}:
      - powershell: |
-         python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq
+         python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq
         Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
index adc9b03a0eb06..31671520e4b00 100644
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
@@ -7,4 +7,3 @@ git+http://github.com/onnx/onnx.git@be76ca7148396176784ba8733133b9fb1186ea0d#egg
 protobuf
 sympy==1.1.1
 flatbuffers
-six
diff --git a/tools/python/PythonTools.md b/tools/python/PythonTools.md
index 6f752ad2257b9..49a4ac6a337d1 100644
--- a/tools/python/PythonTools.md
+++ b/tools/python/PythonTools.md
@@ -133,7 +133,7 @@ optional arguments:
 image_to_pb:
   image_to_pb specific options

-  --resize RESIZE       Provide the shape as comma separated values to resize the image to. e.g. --shape 200,200
+  --resize RESIZE       Provide the height and width to resize to as comma separated values. e.g. --shape 200,300 will resize to height 200 and width 300.
   --channels_last       Transpose image from channels first to channels last.
   --add_batch_dim       Prepend a batch dimension with value of 1 to the shape. i.e.
                        convert from CHW to NCHW
diff --git a/tools/python/onnx_test_data_utils.py b/tools/python/onnx_test_data_utils.py
index 2121d579aec3b..e4e4c54aa12d6 100644
--- a/tools/python/onnx_test_data_utils.py
+++ b/tools/python/onnx_test_data_utils.py
@@ -52,7 +52,26 @@ def image_to_numpy(filename, shape, channels_last, add_batch_dim):
     img = PIL.Image.open(filename)
     if shape:
-        img = img.resize(shape, PIL.Image.ANTIALIAS)
+        w, h = img.size
+        new_w = shape[1]
+        new_h = shape[0]
+
+        # use the dimension that needs to shrink the least to resize to an image where that dimension matches the
+        # target size.
+        w_ratio = new_w / w
+        h_ratio = new_h / h
+        ratio = w_ratio if w_ratio > h_ratio else h_ratio
+        interim_w = int(w * ratio)
+        interim_h = int(h * ratio)
+        img = img.resize((interim_w, interim_h), PIL.Image.ANTIALIAS)
+
+        # center crop to the final target size
+        left = (interim_w - new_w) / 2
+        top = (interim_h - new_h) / 2
+        right = (interim_w + new_w) / 2
+        bottom = (interim_h + new_h) / 2
+        img = img.crop((left, top, right, bottom))
+
     img_as_np = np.array(img).astype(np.float32)
     if not channels_last:
         # HWC to CHW
@@ -110,8 +129,8 @@ def get_arg_parser():
     image_to_pb_group = parser.add_argument_group('image_to_pb', 'image_to_pb specific options')
     image_to_pb_group.add_argument('--resize', default=None, type=lambda s: [int(item) for item in s.split(',')],
-                                   help='Provide the shape as comma separated values to resize the image to.'
-                                        ' e.g. --shape 200,200')
+                                   help='Provide the height and width to resize to as comma separated values.'
+                                        ' e.g. --shape 200,300 will resize to height 200 and width 300.')
     image_to_pb_group.add_argument('--channels_last', action='store_true',
                                    help='Transpose image from channels first to channels last.')
     image_to_pb_group.add_argument('--add_batch_dim', action='store_true',
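As a standalone illustration of the new image_to_numpy behaviour above, the sketch below repeats the same "scale by the larger ratio, then center crop" steps outside the tool. The helper name and its arguments are invented for the example; only the arithmetic mirrors the patched code, and Pillow is assumed to be installed.

# illustrative re-statement of the aspect-preserving resize + center crop added above
import numpy as np
import PIL.Image

def resize_and_center_crop(filename, target_h, target_w):
    img = PIL.Image.open(filename)
    w, h = img.size
    # scale by whichever ratio is larger so both dimensions cover the target (aspect ratio preserved)
    ratio = max(target_w / w, target_h / h)
    interim_w, interim_h = int(w * ratio), int(h * ratio)
    img = img.resize((interim_w, interim_h), PIL.Image.ANTIALIAS)
    # center crop the interim image down to the exact target size
    left = (interim_w - target_w) / 2
    top = (interim_h - target_h) / 2
    img = img.crop((left, top, left + target_w, top + target_h))
    return np.array(img).astype(np.float32)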
diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp
index c2aec18038829..4e872a46c42a4 100644
--- a/winml/test/api/LearningModelSessionAPITest.cpp
+++ b/winml/test/api/LearningModelSessionAPITest.cpp
@@ -476,10 +476,13 @@ static void WindowFunction(const wchar_t* window_operator_name, TensorKind kind)
 #endif

 #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS)
-static void DiscreteFourierTransform(bool is_onesided = false) {
-  std::vector<int64_t> shape = {1, 5};
-  std::vector<int64_t> output_shape = {1, 5, 2};
-  output_shape[1] = is_onesided ? (1 + (shape[1] >> 1)) : shape[1];
+static void DiscreteFourierTransform(size_t axis, bool is_onesided = false) {
+  auto axis_dim = axis + 1;
+  printf("\nDiscrete Fourier Transform [axis=%d, is_onesided=%s]\n", static_cast<int>(axis_dim), is_onesided ? "true" : "false");
+
+  std::vector<int64_t> shape = {2, 5, 8, 1};
+  std::vector<int64_t> output_shape = {2, 5, 8, 2};
+  output_shape[axis_dim] = is_onesided ? (1 + (shape[axis_dim] >> 1)) : shape[axis_dim];

   auto model =
       LearningModelBuilder::Create(13)
@@ -487,6 +490,7 @@ static void DiscreteFourierTransform(bool is_onesided = false) {
           .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Spectra", TensorKind::Float, output_shape))
           .Operators().Add(Operator(L"DFT", MS_EXPERIMENTAL_DOMAIN)
                                .SetInput(L"input", L"Input.Signal")
+                               .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)}))
                                .SetAttribute(L"onesided", TensorInt64Bit::CreateFromArray({}, {is_onesided}))
                                .SetOutput(L"output", L"Output.Spectra"))
           .CreateModel();
@@ -495,19 +499,38 @@ static void DiscreteFourierTransform(bool is_onesided = false) {
   LearningModelBinding binding(session);

   // Populate binding
-  binding.Bind(L"Input.Signal", TensorFloat::CreateFromArray(shape, {1, 2, 3, 4, 5}));
+  binding.Bind(
+      L"Input.Signal",
+      TensorFloat::CreateFromArray(
+          shape,
+          {1, 2, 3, 4, 5, 6, 7, 8,
+           1, 2, 3, 4, 5, 6, 7, 8,
+           1, 2, 3, 4, 5, 6, 7, 8,
+           1, 2, 3, 4, 5, 6, 7, 8,
+           1, 2, 3, 4, 5, 6, 7, 8,
+
+           2, 4, 6, 8, 10, 12, 14, 16,
+           2, 4, 6, 8, 10, 12, 14, 16,
+           2, 4, 6, 8, 10, 12, 14, 16,
+           2, 4, 6, 8, 10, 12, 14, 16,
+           2, 4, 6, 8, 10, 12, 14, 16,
+          }));

   // Evaluate
   auto result = session.Evaluate(binding, L"");

-  // Check results
-  printf("Output.Spectra\n");
-  auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>();
-  auto y_ivv = y_tensor.GetAsVectorView();
-  for (int i = 0; i < output_shape[0] * output_shape[1] * 2; i += 2) {
-    printf("(%f + %fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1));
-  }
-  printf("\n");
+  // // Check results
+  // printf("Output.Spectra\n");
+  // auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>();
+  // auto y_ivv = y_tensor.GetAsVectorView();
+  // for (uint32_t i = 0; i < y_ivv.Size(); i+=2) {
+  //   auto format_size = 16 * (!is_onesided || axis == 0) + 10 * (is_onesided && axis == 1);
+  //   if (i % format_size == 0 && i != 0) {
+  //     printf("\n");
+  //   }
+  //   printf("(%.2f + %.2fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1));
+  // }
+  // printf("\n");
 }
 #endif

@@ -612,20 +635,20 @@ static void STFT(size_t batch_size, size_t signal_size, size_t dft_size,
     printf("%f, ", window_ivv.GetAt(i));
   }
   printf("\n");
-  printf("Output.STFT\n");
-  // Check results
-  auto y_tensor = result.Outputs().Lookup(L"Output.STFT").as<TensorFloat>();
-  auto y_ivv = y_tensor.GetAsVectorView();
-  auto size = y_ivv.Size();
-  WINML_EXPECT_EQUAL(size, n_dfts * output_shape[2] * 2);
-  for (size_t dft_idx = 0; dft_idx < n_dfts; dft_idx++) {
-    for (size_t i = 0; INT64(i) < output_shape[2]; i++) {
-      auto real_idx = static_cast<uint32_t>((i * 2) + (2 * dft_idx * output_shape[2]));
-      printf("(%d, %f , %fi), ", static_cast<int>(i), y_ivv.GetAt(real_idx), y_ivv.GetAt(real_idx + 1));
-    }
-  }
-
-  printf("\n");
+  //printf("Output.STFT\n");
+  //// Check results
+  //auto y_tensor = result.Outputs().Lookup(L"Output.STFT").as<TensorFloat>();
+  //auto y_ivv = y_tensor.GetAsVectorView();
+  //auto size = y_ivv.Size();
+  //WINML_EXPECT_EQUAL(size, n_dfts * output_shape[2] * 2);
+  //for (size_t dft_idx = 0; dft_idx < n_dfts; dft_idx++) {
+  //  for (size_t i = 0; INT64(i) < output_shape[2]; i++) {
+  //    auto real_idx = static_cast<uint32_t>((i * 2) + (2 * dft_idx * output_shape[2]));
+  //    printf("(%d, %f , %fi), ", static_cast<int>(i), y_ivv.GetAt(real_idx), y_ivv.GetAt(real_idx + 1));
+  //  }
+  //}
+  //
+  //printf("\n");
 }
 #endif

@@ -913,45 +936,88 @@ static void ModelBuilding_ConstantMatmul() {
 static void ModelBuilding_DiscreteFourierTransform() {
 #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS)
-  DiscreteFourierTransform(false /*onesided*/);
-  DiscreteFourierTransform(true /*onesided*/);
+  DiscreteFourierTransform(0, false /*onesided*/);
+  DiscreteFourierTransform(0, true /*onesided*/);
+  DiscreteFourierTransform(1, false /*onesided*/);
+  DiscreteFourierTransform(1, true /*onesided*/);
+
 #endif
 }

-static void ModelBuilding_DiscreteFourierTransformInverseIdentity() {
 #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS)
-  std::vector<int64_t> shape = {1, 5};
-  std::vector<int64_t> output_shape = {1, shape[1], 2};
+static void DiscreteFourierTransformInverse(size_t axis) {
+  std::vector<int64_t> shape = {2, 5, 8, 1};
+  std::vector<int64_t> output_shape = {2, 5, 8, 2};

   auto model =
       LearningModelBuilder::Create(13)
           .Inputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Input.TimeSignal", TensorKind::Float, shape))
           .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Spectra", TensorKind::Float, output_shape))
+          .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Inverse", TensorKind::Float, output_shape))
           .Operators().Add(Operator(L"DFT", MS_EXPERIMENTAL_DOMAIN)
                                .SetInput(L"input", L"Input.TimeSignal")
-                               .SetOutput(L"output", L"DFTOutput"))
-          .Operators().Add(Operator(L"IDFT", MS_EXPERIMENTAL_DOMAIN)
-                               .SetInput(L"input", L"DFTOutput")
+                               .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)}))
                                .SetOutput(L"output", L"Output.Spectra"))
+          .Operators().Add(Operator(L"IDFT", MS_EXPERIMENTAL_DOMAIN)
+                               .SetInput(L"input", L"Output.Spectra")
+                               .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)}))
+                               .SetOutput(L"output", L"Output.Inverse"))
           .CreateModel();

   LearningModelSession session(model);
   LearningModelBinding binding(session);

+  auto input_vector =
+      std::vector<float>{
+          1, 2, 3, 4, 5, 6, 7, 8,
+          1, 2, 3, 4, 5, 6, 7, 8,
+          1, 2, 3, 4, 5, 6, 7, 8,
+          1, 2, 3, 4, 5, 6, 7, 8,
+          1, 2, 3, 4, 5, 6, 7, 8,
+
+          2, 4, 6, 8, 10, 12, 14, 16,
+          2, 4, 6, 8, 10, 12, 14, 16,
+          2, 4, 6, 8, 10, 12, 14, 16,
+          2, 4, 6, 8, 10, 12, 14, 16,
+          2, 4, 6, 8, 10, 12, 14, 16,
+      };
   // Populate binding
-  binding.Bind(L"Input.TimeSignal", TensorFloat::CreateFromArray(shape, {1, 2, 3, 4, 5}));
+  binding.Bind(
+      L"Input.TimeSignal",
+      TensorFloat::CreateFromArray(
+          shape,
+          input_vector));

   // Evaluate
   auto result = session.Evaluate(binding, L"");
-
+
   // Check results
-  printf("Output.Spectra\n");
-  auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>();
+  auto y_tensor = result.Outputs().Lookup(L"Output.Inverse").as<TensorFloat>();
   auto y_ivv = y_tensor.GetAsVectorView();
-  for (int i = 0; i < output_shape[0] * output_shape[1] * 2; i += 2) {
-    printf("(%f + %fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1));
+  for (uint32_t i = 0; i < y_ivv.Size(); i += 2) {
+    WINML_EXPECT_TRUE(abs(y_ivv.GetAt(i) - input_vector[i / 2]) < .001);
+    WINML_EXPECT_TRUE(abs(y_ivv.GetAt(i + 1) - 0) < .001);
   }
-  printf("\n");
+
+  //printf("Output.Spectra\n");
+  //auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>();
+  //auto y_ivv = y_tensor.GetAsVectorView();
+  //for (uint32_t i = 0; i < y_ivv.Size(); i+=2) {
+  //  auto format_size = 16;
+  //  if (i % format_size == 0 && i != 0) {
+  //    printf("\n");
+  //  }
+  //  printf("(%.2f + %.2fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1));
+  //}
+  //printf("\n");
+
+}
+#endif
+
+static void ModelBuilding_DiscreteFourierTransformInverseIdentity() {
+#if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS)
+  DiscreteFourierTransformInverse(0);
+  DiscreteFourierTransformInverse(1);
 #endif
 }
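The two properties these WinML tests rely on can be sanity-checked with numpy; the snippet below is illustrative only and is not part of the test code. It checks that a one-sided DFT over an axis of length N keeps 1 + (N >> 1) bins, and that applying the inverse transform after the forward transform reproduces the input to within the same 1e-3 tolerance used above.

# illustrative numpy check of the one-sided length formula and the DFT/IDFT round trip
import numpy as np

# same 2 x 5 x 8 ramp data used in the test bindings (imaginary part implicitly zero)
x = np.tile(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), (2, 5, 1))
x[1] *= 2

# one-sided spectrum along the last axis has 1 + (N >> 1) bins (5 for N = 8)
onesided = np.fft.rfft(x, axis=-1)
assert onesided.shape[-1] == 1 + (x.shape[-1] >> 1)

# IDFT(DFT(x)) recovers x: real part matches, imaginary part is ~0, within 1e-3
roundtrip = np.fft.ifft(np.fft.fft(x, axis=1), axis=1)
assert np.allclose(roundtrip.real, x, atol=1e-3)
assert np.allclose(roundtrip.imag, 0, atol=1e-3)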