diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index b2da6ed5f19e1..ba7c4eaf58b9e 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -27,14 +27,13 @@ jobs: - name: Set vars id: vars run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - - name: Check outputs - run: echo ${{ steps.vars.outputs.sha_short }} - uses: actions/checkout@v2 with: ref: gh-pages clean: false - name: Move API docs into target area run: | + ls docs/api rm -rf docs/api/python mv build/docs/inference/html docs/api/python - name: Create Pull Request diff --git a/.gitignore b/.gitignore index 7c3f4b8ecf5ff..d27dedbc2b7ce 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ onnxruntime_profile*.json /csharp/**/bin/ /csharp/Directory.Build.props docs/python/inference/*.onnx +*.onnx onnxprofile_profile_test_*.json /csharp/packages /csharp/src/Microsoft.ML.OnnxRuntime/targets/**/*.targets diff --git a/CODEOWNERS b/CODEOWNERS index 1b331ce95a66c..43a1eda4115d5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,20 +1,21 @@ # Python frontend owners -orttraining/*.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/orttraining/python/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/orttraining/test/python/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -orttraining/pytorch_frontend_examples/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/python/training/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_ort_trainer.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -onnxruntime/test/python/onnxruntime_test_training_unittest_utils.py @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre -samples/python/training/** @thiagocrepaldi @tlh20 @liqunfu @baijumeswani @SherlockNoMad @xadupre +orttraining/*.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/orttraining/python/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/orttraining/test/python/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +orttraining/pytorch_frontend_examples/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/python/training/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_ort_trainer.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +onnxruntime/test/python/onnxruntime_test_training_unittest_utils.py @thiagocrepaldi @tlh20 @baijumeswani @xadupre +samples/python/training/** @thiagocrepaldi @tlh20 @baijumeswani @xadupre # Mobile -/onnxruntime/test/testdata/kernel_def_hashes/ @skottmckay @gwang-msft @YUNQIUGUO @edgchen1 -/onnxruntime/core/framework/kernel_def_hash_helpers.* @skottmckay @gwang-msft @YUNQIUGUO @edgchen1 +/onnxruntime/test/testdata/kernel_def_hashes/ @skottmckay @YUNQIUGUO @edgchen1 
+/onnxruntime/core/framework/kernel_def_hash_helpers.* @skottmckay @YUNQIUGUO @edgchen1 # Contrib Ops onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn onnxruntime/core/graph/contrib_ops/nchwc_schema_defs.cc @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn -onnxruntime/core/graph/contrib_ops/quantization_defs.* @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn \ No newline at end of file +onnxruntime/core/graph/contrib_ops/quantization_defs.* @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn +onnxruntime/core/mlas/** @zhanghuanrong @chenfucn @yufenglee @yihonglyu @snnn \ No newline at end of file diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index e043e07ea13b4..379ff6921c568 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -46,7 +46,7 @@ "component": { "type": "git", "git": { - "commitHash": "d721d320bd2f66d342d24b71600fe1f5e222e952", + "commitHash": "ffd5f70370642c909222f9a4cae8400023dacbdc", "repositoryUrl": "https://github.com/apache/tvm.git" }, "comments": "needed for TVM EP" diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake index c1ee5fdde51b7..3f425a0938e2f 100644 --- a/cmake/external/tvm.cmake +++ b/cmake/external/tvm.cmake @@ -4,7 +4,7 @@ if (onnxruntime_USE_TVM) FetchContent_Declare( tvm GIT_REPOSITORY https://github.com/apache/tvm.git - GIT_TAG d721d320bd2f66d342d24b71600fe1f5e222e952 + GIT_TAG ffd5f70370642c909222f9a4cae8400023dacbdc ) FetchContent_GetProperties(tvm) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 64068a03a09f1..2331b21de7480 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -213,8 +213,13 @@ install(TARGETS onnxruntime set_target_properties(onnxruntime PROPERTIES FOLDER "ONNXRuntime") -if (WINDOWS_STORE) - target_link_options(onnxruntime PRIVATE /DELAYLOAD:api-ms-win-core-libraryloader-l1-2-1.dll) +if (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) + # Workaround STL bug https://github.com/microsoft/STL/issues/434#issuecomment-921321254 + # Note that the workaround makes std::system_error crash before Windows 10 + + # The linker warns "LNK4199: /DELAYLOAD:api-ms-win-core-heapl2-1-0.dll ignored; no imports found from api-ms-win-core-heapl2-1-0.dll" + # when you're not using imports directly, even though the import exists in the STL and the DLL would have been linked without DELAYLOAD + target_link_options(onnxruntime PRIVATE /DELAYLOAD:api-ms-win-core-heapl2-1-0.dll /ignore:4199) endif() if (winml_is_inbox) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 9590050dfe9d2..d32db62ca56d0 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -206,7 +206,7 @@ endif() if (ARM64 OR ARM OR X86 OR X64 OR X86_64) - if(WINDOWS_STORE OR (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC)) + if((WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC)) # msvc compiler report syntax error with cpuinfo arm source files # and cpuinfo does not have code for getting arm uarch info under windows else() diff --git a/cmake/onnxruntime_flatbuffers.cmake b/cmake/onnxruntime_flatbuffers.cmake index bcb196bcd8cd9..49302e92f5a66 100644 --- a/cmake/onnxruntime_flatbuffers.cmake +++ b/cmake/onnxruntime_flatbuffers.cmake @@ -21,16 +21,3 @@ set_target_properties(onnxruntime_flatbuffers PROPERTIES FOLDER "ONNXRuntime") if (FLATBUFFERS_BUILD_FLATC) 
add_dependencies(onnxruntime_flatbuffers flatc) endif() - -if (WINDOWS_STORE) - function(target_force_include target scope file) - if (MSVC) - target_compile_options(${target} ${scope} "/FI${file}") - else() - target_compile_options(${target} ${scope} -include "${file}") - endif() - endfunction() - - target_force_include(flatbuffers PRIVATE uwp_stubs.h) - target_force_include(flatc PRIVATE uwp_stubs.h) -endif() diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index b6553de172d24..0ff23ad9507fa 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1318,7 +1318,7 @@ if (onnxruntime_USE_HAILO) "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - find_package(HailoRT 4.8.1 EXACT REQUIRED) + find_package(HailoRT 4.10.0 EXACT REQUIRED) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_hailo_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_hailo ${onnxruntime_providers_hailo_cc_srcs}) diff --git a/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch b/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch index d481a14a5544f..f8e4750cbccff 100644 --- a/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch +++ b/cmake/patches/abseil/Fix_Nvidia_Build_Break.patch @@ -20,3 +20,44 @@ index 1d7d6cd..f6a7a78 100644 {allocated_storage_view.data, allocated_storage_view.capacity}); } +diff --git a/absl/copts/GENERATED_AbseilCopts.cmake b/absl/copts/GENERATED_AbseilCopts.cmake +index a4ab1aa..dfd13fd 100644 +--- a/absl/copts/GENERATED_AbseilCopts.cmake ++++ b/absl/copts/GENERATED_AbseilCopts.cmake +@@ -129,8 +129,6 @@ list(APPEND ABSL_MSVC_FLAGS + "/wd4005" + "/wd4068" + "/wd4180" +- "/wd4244" +- "/wd4267" + "/wd4503" + "/wd4800" + ) +diff --git a/absl/copts/GENERATED_copts.bzl b/absl/copts/GENERATED_copts.bzl +index a6efc98..8c4de8e 100644 +--- a/absl/copts/GENERATED_copts.bzl ++++ b/absl/copts/GENERATED_copts.bzl +@@ -130,8 +130,6 @@ ABSL_MSVC_FLAGS = [ + "/wd4005", + "/wd4068", + "/wd4180", +- "/wd4244", +- "/wd4267", + "/wd4503", + "/wd4800", + ] +diff --git a/absl/copts/copts.py b/absl/copts/copts.py +index 0d6c1ec..75fd935 100644 +--- a/absl/copts/copts.py ++++ b/absl/copts/copts.py +@@ -132,10 +132,6 @@ COPT_VARS = { + "/wd4068", # unknown pragma + # qualifier applied to function type has no meaning; ignored + "/wd4180", +- # conversion from 'type1' to 'type2', possible loss of data +- "/wd4244", +- # conversion from 'size_t' to 'type', possible loss of data +- "/wd4267", + # The decorated name was longer than the compiler limit + "/wd4503", + # forcing value to bool 'true' or 'false' (performance warning) diff --git a/cmake/store_toolchain.cmake b/cmake/store_toolchain.cmake deleted file mode 100644 index ebdb88da7752c..0000000000000 --- a/cmake/store_toolchain.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set(CMAKE_SYSTEM_NAME WindowsStore) -set(CMAKE_SYSTEM_VERSION 10.0) -if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR) - set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}) -endif() diff --git a/cmake/target_delayload.cmake b/cmake/target_delayload.cmake index c776b2529a2b9..53f252a3e71ac 100644 --- a/cmake/target_delayload.cmake +++ b/cmake/target_delayload.cmake @@ -9,9 +9,6 @@ function(target_delayload target_name) foreach(lib ${ARGN}) target_link_options(${target_name} PRIVATE /DELAYLOAD:"${lib}") endforeach() - if (WINDOWS_STORE) - target_link_libraries(${target_name} PRIVATE dloadhelper.lib) - else() - target_link_libraries(${target_name} PRIVATE delayimp.lib) - endif() + + 
target_link_libraries(${target_name} PRIVATE delayimp.lib) endfunction() diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index f7da46124b879..6e14591224886 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -190,7 +190,7 @@ set_winml_target_properties(winml_google_test_lib) set_winml_target_properties(winml_test_common) get_winml_test_api_src(${WINML_TEST_SRC_DIR} winml_test_api_src) -if (NOT WINDOWS_STORE AND NOT ${winml_is_inbox}) +if (NOT ${winml_is_inbox}) get_winml_test_api_redist_only_src(${WINML_TEST_SRC_DIR} winml_test_api_redist_only_src) endif() diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh index 586f8986086c8..28d92b73530c4 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh @@ -19,7 +19,7 @@ echo "Current NuGet package version is $CurrentOnnxRuntimeVersion" if [ $RunTestCsharp = "true" ]; then if [[ $IsMacOS == "True" || $IsMacOS == "true" ]]; then mkdir -p $BUILD_BINARIESDIRECTORY/models - ln -s $BUILD_SOURCESDIRECTORY/cmake/external/onnx/onnx/backend/test/data/node $BUILD_BINARIESDIRECTORY/models/opset14 + ln -s $BUILD_SOURCESDIRECTORY/cmake/external/onnx/onnx/backend/test/data/node $BUILD_BINARIESDIRECTORY/models/opset16 fi # Run C# tests dotnet restore $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj -s $LocalNuGetRepo -s https://api.nuget.org/v3/index.json diff --git a/docs/python/inference/api_summary.rst b/docs/python/inference/api_summary.rst index 0325cdd47fa97..12d17bafd3ee9 100644 --- a/docs/python/inference/api_summary.rst +++ b/docs/python/inference/api_summary.rst @@ -1,65 +1,107 @@ -=========== -API Summary -=========== - -Summary of public functions and classes exposed -in *ONNX Runtime*. +=== +API +=== .. contents:: :local: -OrtValue -========= +API Overview +============ -*ONNX Runtime* works with native Python data structures which are mapped into ONNX data formats : -Numpy arrays (tensors), dictionaries (maps), and a list of Numpy arrays (sequences). -The data backing these are on CPU. +*ONNX Runtime* loads and runs inference on a model in ONNX graph format, or ORT format (for memory and disk constrained environments). -*ONNX Runtime* supports a custom data structure that supports all ONNX data formats that allows users -to place the data backing these on a device, for example, on a CUDA supported device. This allows for -interesting *IOBinding* scenarios (discussed below). In addition, *ONNX Runtime* supports directly -working with *OrtValue* (s) while inferencing a model if provided as part of the input feed. +The data consumed and produced by the model can be specified and accessed in the way that best matches your scenario. + +Load and run a model +-------------------- -Below is an example showing creation of an *OrtValue* from a Numpy array while placing its backing memory -on a CUDA device: +InferenceSession is the main class of ONNX Runtime. It is used to load and run an ONNX model, +as well as specify environment and application configuration options. .. 
code-block:: python - # X is numpy array on cpu, create an OrtValue and place it on cuda device id = 0 - ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - ortvalue.device_name() # 'cuda' - ortvalue.shape() # shape of the numpy array X - ortvalue.data_type() # 'tensor(float)' - ortvalue.is_tensor() # 'True' + session = onnxruntime.InferenceSession('model.onnx') + + outputs = session.run([output names], inputs) + +ONNX and ORT format models consist of a graph of computations, modeled as operators, +and implemented as optimized operator kernels for different hardware targets. +ONNX Runtime orchestrates the execution of operator kernels via `execution providers`. +An execution provider contains the set of kernels for a specific execution target (CPU, GPU, IoT etc). +Execution provides are configured using the `providers` parameter. Kernels from different execution +providers are chosen in the priority order given in the list of providers. In the example below +if there is a kernel in the CUDA execution provider ONNX Runtime executes that on GPU. If not +the kernel is executed on CPU. + +.. code-block:: python + + session = onnxruntime.InferenceSession(model, + providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + +The list of available execution providers can be found here: `Execution Providers `_. + +Since ONNX Runtime 1.10, you must explicitly specify the execution provider for your target. +Running on CPU is the only time the API allows no explicit setting of the `provider` parameter. +In the examples that follow, the `CUDAExecutionProvider` and `CPUExecutionProvider` are used, assuming the application is running on NVIDIA GPUs. +Replace these with the execution provider specific to your environment. + +You can supply other session configurations via the `session options` parameter. For example, to enable +profiling on the session: + +.. code-block:: python + + options = onnxruntime.SessionOptions() + options.enable_profiling=True + session = onnxruntime.InferenceSession('model.onnx', sess_options=options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) + + +Data inputs and outputs +----------------------- + +The ONNX Runtime Inference Session consumes and produces data using its OrtValue class. + +Data on CPU +^^^^^^^^^^^ + +On CPU (the default), OrtValues can be mapped to and from native Python data structures: numpy arrays, dictionaries and lists of +numpy arrays. + +.. code-block:: python + + # X is numpy array on cpu + ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X) + ortvalue.device_name() # 'cpu' + ortvalue.shape() # shape of the numpy array X + ortvalue.data_type() # 'tensor(float)' + ortvalue.is_tensor() # 'True' np.array_equal(ortvalue.numpy(), X) # 'True' # ortvalue can be provided as part of the input feed to a model - ses = onnxruntime.InferenceSession('model.onnx') - res = sess.run(["Y"], {"X": ortvalue}) + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) + results = session.run(["Y"], {"X": ortvalue}) -IOBinding -========= +By default, *ONNX Runtime* always places input(s) and output(s) on CPU. Having the data on CPU +may not optimal if the input or output is consumed and produced on a device +other than CPU because it introduces data copy between CPU and the device. 
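[Editor's note — illustrative sketch, not part of the patch.] The new api_summary.rst text above introduces `InferenceSession`, the `providers` list, `SessionOptions`, and CPU-side `OrtValue`s in separate fragments. The following self-contained script stitches those same calls together; it assumes a local `model.onnx` with a single float32 input (a hypothetical file name, as in the docs) and discovers input/output names from the session metadata instead of hard-coding them.

```python
import numpy as np
import onnxruntime

# Session configuration, as described above: enable profiling and list providers
# in priority order (CUDA first, CPU as fallback).
options = onnxruntime.SessionOptions()
options.enable_profiling = True

session = onnxruntime.InferenceSession(
    'model.onnx',                      # assumed local model file
    sess_options=options,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
)

# Discover the first input's name and shape; dynamic dimensions are reported as
# strings or None, so substitute 1 for them in this sketch.
input_meta = session.get_inputs()[0]
dims = [d if isinstance(d, int) else 1 for d in input_meta.shape]
x = np.random.rand(*dims).astype(np.float32)

# An OrtValue on CPU can be passed in the input feed exactly like a numpy array.
x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x)

# Passing None as the output list returns every model output.
outputs = session.run(None, {input_meta.name: x_ortvalue})

# end_profiling() writes the profiling JSON and returns its file name.
profile_file = session.end_profiling()
```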
-By default, *ONNX Runtime* always places input(s) and output(s) on CPU, which -is not optimal if the input or output is consumed and produced on a device -other than CPU because it introduces data copy between CPU and the device. -*ONNX Runtime* provides a feature, *IO Binding*, which addresses this issue by -enabling users to specify which device to place input(s) and output(s) on. -Here are scenarios to use this feature. -(In the following code snippets, *model.onnx* is the model to execute, -*X* is the input data to feed, and *Y* is the output data.) +Data on device +^^^^^^^^^^^^^^ + +*ONNX Runtime* supports a custom data structure that supports all ONNX data formats that allows users +to place the data backing these on a device, for example, on a CUDA supported device. In ONNX Runtime, +this called `IOBinding`. -Scenario 1: +To use the `IOBinding` feature, replace `InferenceSession.run()` with `InferenceSession.run_with_iobinding()`. A graph is executed on a device other than CPU, for instance CUDA. Users can -use IOBinding to put input on CUDA as the follows. +use IOBinding to copy the data onto the GPU. .. code-block:: python # X is numpy array on cpu - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() # OnnxRuntime will copy the data over to the CUDA device if 'input' is consumed by nodes on the CUDA device io_binding.bind_cpu_input('input', X) @@ -67,37 +109,32 @@ use IOBinding to put input on CUDA as the follows. session.run_with_iobinding(io_binding) Y = io_binding.copy_outputs_to_cpu()[0] -Scenario 2: - The input data is on a device, users directly use the input. The output data is on CPU. .. code-block:: python # X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) io_binding.bind_output('output') session.run_with_iobinding(io_binding) Y = io_binding.copy_outputs_to_cpu()[0] -Scenario 3: - -The input data and output data are both on a device, users directly use the input and also place output on the device. +The input data and output data are both on a device, users directly use the input and also place output on the device. .. 
code-block:: python #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) Y_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, 'cuda', 0) # Change the shape to the actual shape of the output being bound - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) io_binding.bind_output(name='output', device_type=Y_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=Y_ortvalue.shape(), buffer_ptr=Y_ortvalue.data_ptr()) session.run_with_iobinding(io_binding) -Scenario 4: Users can request *ONNX Runtime* to allocate an output on a device. This is particularly useful for dynamic shaped outputs. Users can use the *get_outputs()* API to get access to the *OrtValue* (s) corresponding to the allocated output(s). @@ -107,7 +144,7 @@ Users can thus consume the *ONNX Runtime* allocated memory for the output as an #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_input(name='input', device_type=X_ortvalue.device_name(), device_id=0, element_type=np.float32, shape=X_ortvalue.shape(), buffer_ptr=X_ortvalue.data_ptr()) #Request ONNX Runtime to bind and allocate memory on CUDA for 'output' @@ -117,7 +154,7 @@ Users can thus consume the *ONNX Runtime* allocated memory for the output as an ort_output = io_binding.get_outputs()[0] -Scenario 5: +In addition, *ONNX Runtime* supports directly working with *OrtValue* (s) while inferencing a model if provided as part of the input feed. Users can bind *OrtValue* (s) directly. @@ -127,39 +164,52 @@ Users can bind *OrtValue* (s) directly. #X is numpy array on cpu X_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(X, 'cuda', 0) Y_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, 'cuda', 0) # Change the shape to the actual shape of the output being bound - session = onnxruntime.InferenceSession('model.onnx') + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) io_binding = session.io_binding() io_binding.bind_ortvalue_input('input', X_ortvalue) io_binding.bind_ortvalue_output('output', Y_ortvalue) session.run_with_iobinding(io_binding) -Device -====== - -The package is compiled for a specific device, GPU or CPU. -The CPU implementation includes optimizations -such as MKL (Math Kernel Libary). The following function -indicates the chosen option: -.. autofunction:: onnxruntime.get_device +You can also bind inputs and outputs directly to a PyTorch tensor. -Examples and datasets -===================== - -The package contains a few models stored in ONNX format -used in the documentation. These don't need to be downloaded -as they are installed with the package. - -.. autofunction:: onnxruntime.datasets.get_example +.. 
code-block:: python -Load and run a model -==================== + # X is a PyTorch tensor on device + session = onnxruntime.InferenceSession('model.onnx', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])) + binding = session.io_binding() + + X_tensor = X.contiguous() + + binding.bind_input( + name='X', + device_type='cuda', + device_id=0, + element_type=np.float32, + shape=tuple(x_tensor.shape), + buffer_ptr=x_tensor.data_ptr(), + ) + + ## Allocate the PyTorch tensor for the model output + Y_shape = ... # You need to specify the output PyTorch tensor shape + Y_tensor = torch.empty(Y_shape, dtype=torch.float32, device='cuda:0').contiguous() + binding.bind_output( + name='Y', + device_type='cuda', + device_id=0, + element_type=np.float32, + shape=tuple(Y_tensor.shape), + buffer_ptr=Y_tensor.data_ptr(), + ) + + session.run_with_iobinding(binding) + + +API Details +=========== -*ONNX Runtime* reads a model saved in ONNX format. -The main class *InferenceSession* wraps these functionalities -in a single place. -Main class +InferenceSession ---------- .. autoclass:: onnxruntime.InferenceSession diff --git a/hailo/README.md b/hailo/README.md index 2dd300985886c..e12b9c28e6b5a 100644 --- a/hailo/README.md +++ b/hailo/README.md @@ -5,11 +5,11 @@ Hailo ONNX Runtime integrates ONNX Runtime with HailoRT to enable Hailo-EP, prov * ONNX Runtime version 1.11.1 with Python 3.7 and above # Prerequisites -* HailoRT v4.8.1 +* HailoRT v4.10.0 # Build Instructions To build ONNXRuntime with HailoRT please follow the following steps: -* Clone ONNXRuntime-Hailo from github. +* Clone ONNXRuntime-Hailo from github * Compile ONNXRuntime with Hailo using the following command: ``` ./build.sh --use_hailo --parallel --skip_tests --enable_pybind --build_wheel --config Release @@ -17,8 +17,8 @@ To build ONNXRuntime with HailoRT please follow the following steps: # Run ONNX Runtime with HailoRT To run your ONNX model on ONNXRuntime with Hailo execution provider, follow the following steps: -1. Convert your ONNX model with DFC tool - see [Model Compilation](https://hailo.ai/developer-zone/documentation/dataflow-compiler/latest/?sp_referrer=compilation.html#for-inference-using-onnx-runtime). -2. Create the ONNXRuntime session with `"HailoExecutionProvider"` in the execution providers list, and run the ONNX model. +1. Convert your ONNX model with DFC tool - see [Model Compilation](https://hailo.ai/developer-zone/documentation/dataflow-compiler/latest/?sp_referrer=compilation.html#for-inference-using-onnx-runtime) +2. Create the ONNXRuntime session with `"HailoExecutionProvider"` in the execution providers list, and run the ONNX model ## Examples: * C++ @@ -27,9 +27,14 @@ To run your ONNX model on ONNXRuntime with Hailo execution provider, follow the The ONNX models used in these tests are located in [testdata/hailo directory](./../onnxruntime/test/testdata/hailo/). To run the tests, do the following: - 1. Compile onnxruntime with Hailo. - 2. Go to `build/Linux/Release/`. - 3. Run a test with the name `Test_Name`: `./onnxruntime_test_all --gtest_filter=HailoCustomOpTest.Test_Name`. + 1. Compile onnxruntime with Hailo + 2. Go to `build/Linux/Release/` + 3. Run a test with the name `Test_Name`: `./onnxruntime_test_all --gtest_filter=HailoCustomOpTest.Test_Name` * Python The example `hailo/examples/hailo_example.py` contains a basic inference example using onnxruntime with Hailo-EP. + The ONNX model used in this example is located in [hailo/examples/](./../examples/). + To run the example, do the following: + 1. 
Compile onnxruntime with Hailo + 2. Go to `build/Linux/Release/dist/` and install the Python wheel (for example: `pip install onnxruntime-1.11.0-cp36-cp36m-linux_x86_64.whl`) + 3. Go to `hailo/examples` and run `python hailo_example.py` diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index a389c407fc264..bf4f99571f674 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -342,52 +342,49 @@ class Node { /** Gets the number of output edges from this Node */ size_t GetOutputEdgesCount() const noexcept { return relationships_.output_edges.size(); } - /** Add an attribute to this Node with specified attribute name and value. */ - void AddAttribute(std::string attr_name, const ONNX_NAMESPACE::AttributeProto& value); - void AddAttribute(std::string attr_name, ONNX_NAMESPACE::AttributeProto&& value); + /** Adds an AttributeProto to this Node. + @remarks The attribute name is used as the key in the attribute map. */ + void AddAttributeProto(ONNX_NAMESPACE::AttributeProto value); -#define ADD_ATTR_INTERFACES(TypeName) \ - void AddAttribute(std::string attr_name, const TypeName& value); \ - void AddAttribute(std::string attr_name, \ - gsl::span values); + // keep this signature in sync with ADD_ATTR_SINGLE_INTERFACE below + /** Adds an attribute to this Node with the specified attribute name and value. */ + void AddAttribute(std::string attr_name, int64_t value); -#define ADD_ATTR_MOVE_INTERFACE(TypeName) \ - void AddAttribute(std::string attr_name, TypeName&& value); + // keep this signature in sync with ADD_ATTR_LIST_INTERFACE below + /** Adds an attribute to this Node with the specified attribute name and values. */ + void AddAttribute(std::string attr_name, gsl::span values); - void AddAttribute(std::string attr_name, std::string value); - void AddAttribute(std::string attr_name, gsl::span values); +#define ADD_ATTR_SINGLE_INTERFACE(Type) \ + void AddAttribute(std::string attr_name, Type value) - ADD_ATTR_INTERFACES(int64_t) - ADD_ATTR_INTERFACES(float) - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TensorProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::TensorProto) +#define ADD_ATTR_LIST_INTERFACE(Type) \ + void AddAttribute(std::string attr_name, gsl::span values) + +#define ADD_ATTR_INTERFACES(Type) \ + ADD_ATTR_SINGLE_INTERFACE(Type); \ + ADD_ATTR_LIST_INTERFACE(Type) + + ADD_ATTR_INTERFACES(float); + ADD_ATTR_INTERFACES(std::string); + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TensorProto); #if !defined(DISABLE_SPARSE_TENSORS) - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::SparseTensorProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::SparseTensorProto) + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::SparseTensorProto); #endif - ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TypeProto) - ADD_ATTR_MOVE_INTERFACE(ONNX_NAMESPACE::TypeProto) + ADD_ATTR_INTERFACES(ONNX_NAMESPACE::TypeProto); - void AddAttribute(std::string attr_name, const ONNX_NAMESPACE::GraphProto& value); - void AddAttribute(std::string attr_name, ONNX_NAMESPACE::GraphProto&& value); + ADD_ATTR_SINGLE_INTERFACE(ONNX_NAMESPACE::GraphProto); - // The below overloads are made so the compiler does not attempt to resolve - // C-strings with a gsl::span overloads +#undef ADD_ATTR_SINGLE_INTERFACE +#undef ADD_ATTR_LIST_INTERFACE +#undef ADD_ATTR_INTERFACES + + // The below overload is made so the compiler does not attempt to resolve + // string literals with the gsl::span overload template void AddAttribute(std::string attr_name, const char (&value)[N]) { 
this->AddAttribute(std::move(attr_name), std::string(value, N - 1)); } - template - void AddAttribute(const char (&attr_name)[M], const char (&value)[N]) { - this->AddAttribute(std::string(attr_name, M - 1), std::string(value, N - 1)); - } - - template - void AddAttribute(const char (&attr_name)[M], T&& value) { - this->AddAttribute(std::string(attr_name, M - 1), std::forward(value)); - } - /** Gets the Node's attributes. */ const NodeAttributes& GetAttributes() const noexcept { return attributes_; } diff --git a/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h b/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h index 6d2478a94881d..3dbd270cde9b6 100644 --- a/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h +++ b/include/onnxruntime/core/providers/tvm/tvm_provider_factory.h @@ -10,7 +10,7 @@ extern "C" { #endif -ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, _In_ const char* settings); +ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, _In_ const char* opt_str); #ifdef __cplusplus } diff --git a/onnxruntime/contrib_ops/cpu/signal/dft.cc b/onnxruntime/contrib_ops/cpu/signal/dft.cc index d08852b84a124..4a90243da66df 100644 --- a/onnxruntime/contrib_ops/cpu/signal/dft.cc +++ b/onnxruntime/contrib_ops/cpu/signal/dft.cc @@ -39,20 +39,16 @@ ONNX_OPERATOR_KERNEL_EX( kMSExperimentalDomain, 1, kCpuExecutionProvider, - KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), + KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T1", BuildKernelDefConstraints()) + .TypeConstraint("T2", BuildKernelDefConstraints()), STFT); static bool is_real_valued_signal(const onnxruntime::TensorShape & shape) { - // The first dimention is the batch size - // The second dimention is the signal value - return shape.NumDimensions() == 2; + return shape.NumDimensions() == 2 || shape[shape.NumDimensions() - 1] == 1; } static bool is_complex_valued_signal(const onnxruntime::TensorShape& shape) { - // The first dimention is the batch size - // The second dimention is the signal length - // The third dimention is set to 2 and represents the real and imaginary parts of the complex sample - return shape.NumDimensions() == 3 && shape[2] == 2; + return shape.NumDimensions() > 2 && shape[shape.NumDimensions() - 1] == 2; } static bool is_power_of_2(size_t size) { @@ -143,24 +139,27 @@ static T compute_angular_velocity(size_t number_of_samples, bool inverse) { } template -static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, - const Tensor* X, Tensor* Y, const Tensor* window, bool is_onesided, bool inverse, +static Status fft_radix2(OpKernelContext* /*ctx*/, + const Tensor* X, Tensor* Y, + size_t X_offset, size_t X_stride, size_t Y_offset, size_t Y_stride, int64_t axis, + const Tensor* window, bool is_onesided, bool inverse, std::vector>& V, std::vector>& temp_output) { // Get shape and significant bits const auto& X_shape = X->Shape(); - size_t number_of_samples = static_cast(X_shape[1]); + size_t number_of_samples = static_cast(X_shape[axis]); unsigned significant_bits = static_cast(log2(number_of_samples)); // Get data - auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + (batch_idx * number_of_samples); + auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + X_offset; // Get window U* window_data = nullptr; if (window) { window_data = const_cast(reinterpret_cast(window->DataRaw())); } + size_t Y_data_stride = 1; std::complex* Y_data; if 
(is_onesided) { if (temp_output.size() != number_of_samples) { @@ -168,7 +167,8 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, } Y_data = temp_output.data(); } else { - Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * number_of_samples); + Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; + Y_data_stride = Y_stride; } auto angular_velocity = compute_angular_velocity(number_of_samples, inverse); @@ -184,9 +184,9 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, for (size_t i = 0; i < number_of_samples; i++) { size_t bit_reversed_index = bit_reverse(i, significant_bits); - auto x = *(X_data + bit_reversed_index); + auto x = *(X_data + bit_reversed_index*X_stride); auto window_element = window_data ? *(window_data + bit_reversed_index) : 1; - *(Y_data + i) = std::complex(1, 0) * x * window_element; + *(Y_data + i*Y_data_stride) = std::complex(1, 0) * x * window_element; } // Run fft_radix2 @@ -199,8 +199,8 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, auto first_idx = bit_reverse(k, current_significant_bits); auto second_idx = bit_reverse(midpoint + k, current_significant_bits); for (size_t j = 0; j < number_of_samples; j += i) { - std::complex* even = (Y_data + j) + k; - std::complex* odd = (Y_data + j) + (midpoint + k); + std::complex* even = (Y_data + j*Y_data_stride) + k; + std::complex* odd = (Y_data + j*Y_data_stride) + (midpoint + k); std::complex first = *even + (V[first_idx] * *odd); std::complex second = *even + (V[second_idx] * *odd); *even = first; @@ -212,32 +212,34 @@ static Status fft_radix2(OpKernelContext* /*ctx*/, size_t batch_idx, // Scale the output if inverse if (inverse) { for (size_t i = 0; i < number_of_samples; i++) { - std::complex& val = *(Y_data + i); + std::complex& val = *(Y_data + i * Y_data_stride); val /= static_cast(number_of_samples); } } if (is_onesided) { - const auto& Y_shape = Y->Shape(); - size_t fft_output_size = static_cast(Y_shape[1]); - auto destination = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * fft_output_size); - memcpy(destination, Y_data, sizeof(std::complex) * fft_output_size); + auto destination = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; + for (size_t i = 0; i < number_of_samples; i++) { + *(destination + Y_stride * i) = *(Y_data + i); + } } return Status::OK(); } template -static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tensor* window, bool inverse) { +static Status dft_naive(const Tensor* X, Tensor* Y, + size_t X_offset, size_t X_stride, size_t Y_offset, size_t Y_stride, int64_t axis, + const Tensor* window, bool inverse) { // Get shape and significant bits const auto& X_shape = X->Shape(); - size_t number_of_samples = static_cast(X_shape[1]); + size_t number_of_samples = static_cast(X_shape[axis]); const auto& Y_shape = Y->Shape(); - size_t dft_output_size = static_cast(Y_shape[1]); + size_t dft_output_size = static_cast(Y_shape[axis]); // Get data - auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + (batch_idx * number_of_samples); - auto* Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + (batch_idx * dft_output_size); + auto* X_data = const_cast(reinterpret_cast(X->DataRaw())) + X_offset; + auto* Y_data = reinterpret_cast*>(Y->MutableDataRaw()) + Y_offset; U* window_data = nullptr; if (window) { @@ -247,14 +249,14 @@ static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tens auto angular_velocity = compute_angular_velocity(number_of_samples, inverse); for 
(size_t i = 0; i < dft_output_size; i++) { - std::complex& out = *(Y_data + i); + std::complex& out = *(Y_data + i*Y_stride); out.real(0); out.imag(0); for (size_t j = 0; j < number_of_samples; j++) { // vectorize over this loop auto exponential = std::complex(cos(i * j * angular_velocity), sin(i * j * angular_velocity)); auto window_element = window_data ? * (window_data + j) : 1; - auto element = *(X_data + j) * window_element; + auto element = *(X_data + j*X_stride) * window_element; out += exponential * element; } @@ -267,26 +269,70 @@ static Status dft_naive(size_t batch_idx, const Tensor* X, Tensor* Y, const Tens } template -static Status discrete_fourier_transform(OpKernelContext* ctx, const Tensor* X, Tensor* Y, const Tensor* window, bool is_onesided, bool inverse, +static Status discrete_fourier_transform(OpKernelContext* ctx, const Tensor* X, Tensor* Y, int64_t axis, const Tensor* window, bool is_onesided, bool inverse, std::vector>& V, std::vector>& temp_output) { // Get shape const auto& X_shape = X->Shape(); - size_t number_of_batches = static_cast(X_shape[0]); - size_t number_of_samples = static_cast(X_shape[1]); - - // radix 2 fft - for (size_t i = 0; i < number_of_batches; i++) { + const auto& Y_shape = Y->Shape(); + size_t number_of_samples = static_cast(X_shape[axis]); + + auto batch_and_signal_rank = X->Shape().NumDimensions(); + auto total_dfts = static_cast(X->Shape().Size() / X->Shape()[axis]); + + auto is_input_real = X->Shape().NumDimensions() == 2 || X->Shape()[X->Shape().NumDimensions() - 1] == 1; + auto compex_input_factor = is_input_real ? 1 : 2; + if (X->Shape().NumDimensions() > 2) + { + total_dfts /= X->Shape()[X->Shape().NumDimensions() - 1]; + batch_and_signal_rank -= 1; + } + + + + // Calculate x/y offsets/strides + for (size_t i = 0; i < total_dfts; i++) + { + size_t X_offset = 0; + size_t X_stride = X_shape.SizeFromDimension(axis+1) / compex_input_factor; + size_t cumulative_packed_stride = total_dfts; + size_t temp = i; + for (size_t r = 0; r < batch_and_signal_rank; r++) { + if (r == static_cast(axis)) + { + continue; + } + cumulative_packed_stride /= X_shape[r]; + auto index = temp / cumulative_packed_stride; + temp -= (index * cumulative_packed_stride); + X_offset += index * X_shape.SizeFromDimension(r + 1) / compex_input_factor; + } + + size_t Y_offset = 0; + size_t Y_stride = Y_shape.SizeFromDimension(axis + 1) / 2; + cumulative_packed_stride = total_dfts; + temp = i; + for (size_t r = 0; r < batch_and_signal_rank; r++) { + if (r == static_cast(axis)) + { + continue; + } + cumulative_packed_stride /= X_shape[r]; + auto index = temp / cumulative_packed_stride; + temp -= (index * cumulative_packed_stride); + Y_offset += index * Y_shape.SizeFromDimension(r + 1) / 2; + } + if (is_power_of_2(number_of_samples)) { - ORT_RETURN_IF_ERROR((fft_radix2(ctx, i, X, Y, window, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((fft_radix2(ctx, X, Y, X_offset, X_stride, Y_offset, Y_stride, axis, window, is_onesided, inverse, V, temp_output))); } else { - ORT_RETURN_IF_ERROR((dft_naive(i, X, Y, window, inverse))); + ORT_RETURN_IF_ERROR((dft_naive(X, Y, X_offset, X_stride, Y_offset, Y_stride, axis, window, inverse))); } - } + } return Status::OK(); } -static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, bool inverse) { +static Status discrete_fourier_transform(OpKernelContext* ctx, int64_t axis, bool is_onesided, bool inverse) { // Get input shape const auto* X = ctx->Input(0); const auto& X_shape = X->Shape(); @@ -295,13 
+341,21 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, // Get the DFT output size. Onesided will return only the unique values! // note: x >> 1 === std::floor(x / 2.f) - int64_t number_of_samples = static_cast(X_shape[1]); + int64_t number_of_samples = static_cast(X_shape[axis]); auto dft_output_size = is_onesided ? ((number_of_samples >> 1) + 1) : number_of_samples; // Get output shape - auto Y_shape = onnxruntime::TensorShape({X_shape[0], dft_output_size, 2}); + auto Y_shape = onnxruntime::TensorShape(X_shape); + if (X_shape.NumDimensions() == 2) + { + Y_shape = onnxruntime::TensorShape({X_shape[0], dft_output_size, 2}); + } else + { + Y_shape[Y_shape.NumDimensions() - 1] = 2; + } + Y_shape[axis] = dft_output_size; auto Y = ctx->Output(0, Y_shape); // Get data type @@ -312,9 +366,9 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, std::vector> V; std::vector> temp_output; if (is_real_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else if (is_complex_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else { ORT_THROW("Unsupported input signal shape. The signal's first dimenstion must be the batch dimension and its second dimension must be the signal length dimension. It may optionally include a 3rd dimension of size 2 for complex inputs.", data_type); } @@ -322,9 +376,9 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, std::vector> V; std::vector> temp_output; if (is_real_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else if (is_complex_valued) { - ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, nullptr, is_onesided, inverse, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform>(ctx, X, Y, axis, nullptr, is_onesided, inverse, V, temp_output))); } else { ORT_THROW("Unsupported input signal shape. The signal's first dimenstion must be the batch dimension and its second dimension must be the signal length dimension. 
It may optionally include a 3rd dimension of size 2 for complex inputs.", data_type); } @@ -336,12 +390,12 @@ static Status discrete_fourier_transform(OpKernelContext* ctx, bool is_onesided, } Status DFT::Compute(OpKernelContext* ctx) const { - ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, is_onesided_, false)); + ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, axis_ + 1, is_onesided_, false)); return Status::OK(); } Status IDFT::Compute(OpKernelContext* ctx) const { - ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, false, true)); + ORT_RETURN_IF_ERROR(discrete_fourier_transform(ctx, axis_ + 1, false, true)); return Status::OK(); } @@ -376,9 +430,9 @@ static Status short_time_fourier_transform(OpKernelContext* ctx, bool is_oneside // Get signal const auto* signal = ctx->Input(0); - const auto* window = ctx->Input(1); - const auto* frame_length_tensor = ctx->Input(2); - const auto frame_step = get_scalar_value_from_tensor(ctx->Input(3)); + const auto frame_step = get_scalar_value_from_tensor(ctx->Input(1)); + const auto* window = ctx->Input(2); + const auto* frame_length_tensor = ctx->Input(3); // Get input signal shape const auto& signal_shape = signal->Shape(); @@ -468,7 +522,7 @@ static Status short_time_fourier_transform(OpKernelContext* ctx, bool is_oneside 0); // Run individual dft - ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, &input, &output, window, is_onesided, false, V, temp_output))); + ORT_RETURN_IF_ERROR((discrete_fourier_transform(ctx, &input, &output, 1, window, is_onesided, false, V, temp_output))); } } diff --git a/onnxruntime/contrib_ops/cpu/signal/dft.h b/onnxruntime/contrib_ops/cpu/signal/dft.h index 2b04781c70f59..fc90d48fab25c 100644 --- a/onnxruntime/contrib_ops/cpu/signal/dft.h +++ b/onnxruntime/contrib_ops/cpu/signal/dft.h @@ -8,16 +8,20 @@ namespace contrib { class DFT final : public OpKernel { bool is_onesided_ = true; + int64_t axis_ = 0; public: explicit DFT(const OpKernelInfo& info) : OpKernel(info) { is_onesided_ = static_cast(info.GetAttrOrDefault("onesided", 0)); + axis_ = info.GetAttrOrDefault("axis", 0); } Status Compute(OpKernelContext* ctx) const override; }; class IDFT final : public OpKernel { + int64_t axis_ = 0; public: explicit IDFT(const OpKernelInfo& info) : OpKernel(info) { + axis_ = info.GetAttrOrDefault("axis", 0); } Status Compute(OpKernelContext* ctx) const override; }; diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 01820c34741f7..e77eadc010cfa 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -309,6 +309,7 @@ Status KernelRegistry::TryFindKernel(const Node& node, << " kernel is not supported in " << expected_provider << "." 
<< " Encountered following errors: (" << ToString(verify_kernel_def_error_strs) << ")"; + VLOGS_DEFAULT(2) << "TryFindKernel failed, Reason: " << oss.str(); return Status(common::ONNXRUNTIME, common::FAIL, oss.str()); } diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 81c44b91f5e55..ef9d66f117998 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -22,6 +22,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -762,7 +763,7 @@ Status Node::LoadFromOrtFormat(const onnxruntime::fbs::Node& fbs_node, const log subgraphs_.push_back(std::move(subgraph)); } - AddAttribute(attr_proto.name(), std::move(attr_proto)); + AddAttributeProto(std::move(attr_proto)); } } @@ -872,82 +873,46 @@ void Node::CreateSubgraph(const std::string& attr_name) { #endif // !defined(ORT_MINIMAL_BUILD) -void Node::AddAttribute(std::string attr_name, const ONNX_NAMESPACE::AttributeProto& value) { - graph_->SetGraphResolveNeeded(); - graph_->SetGraphProtoSyncNeeded(); - attributes_[std::move(attr_name)] = value; -} +void Node::AddAttributeProto(AttributeProto value) { + utils::SetNodeAttribute(std::move(value), attributes_); -void Node::AddAttribute(std::string attr_name, ONNX_NAMESPACE::AttributeProto&& value) { graph_->SetGraphResolveNeeded(); graph_->SetGraphProtoSyncNeeded(); - attributes_[std::move(attr_name)] = std::move(value); } -static void AddAttributeHelper(Node& node, std::string attr_name, - AttributeProto_AttributeType attr_type, AttributeProto&& a) { - a.set_name(attr_name); - a.set_type(attr_type); - node.AddAttribute(std::move(attr_name), std::move(a)); -} - -void Node::AddAttribute(std::string attr_name, std::string value) { - AttributeProto a; - *(a.mutable_s()) = std::move(value); - AddAttributeHelper(*this, std::move(attr_name), - AttributeProto_AttributeType::AttributeProto_AttributeType_STRING, - std::move(a)); -}; - -#define ADD_BASIC_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, const type& value) { \ - AttributeProto a; \ - a.set_##field(value); \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ - }; - -#define ADD_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, const type& value) { \ - AttributeProto a; \ - *(a.mutable_##field()) = value; \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ +#define ADD_ATTR_SINGLE_IMPL(Type) \ + void Node::AddAttribute(std::string attr_name, Type value) { \ + AttributeProto a = utils::MakeAttribute(std::move(attr_name), std::move(value)); \ + AddAttributeProto(std::move(a)); \ } -#define ADD_ATTR_MOVE_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, type&& value) { \ - AttributeProto a; \ - *(a.mutable_##field()) = std::move(value); \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ +#define ADD_ATTR_LIST_IMPL(Type) \ + void Node::AddAttribute(std::string attr_name, gsl::span values) { \ + AttributeProto a = utils::MakeAttribute(std::move(attr_name), values); \ + AddAttributeProto(std::move(a)); \ } -#define ADD_LIST_ATTR_IMPL(type, enumType, field) \ - void Node::AddAttribute(std::string attr_name, \ - gsl::span values) { \ - AttributeProto a; \ - auto* mutable_field = a.mutable_##field(); \ - for (const auto& val : 
values) { \ - *(mutable_field->Add()) = val; \ - } \ - AddAttributeHelper(*this, std::move(attr_name), enumType, std::move(a)); \ - } +#define ADD_ATTR_IMPLS(Type) \ + ADD_ATTR_SINGLE_IMPL(Type) \ + ADD_ATTR_LIST_IMPL(Type) -void Node::AddAttribute(std::string attr_name, const GraphProto& value) { - AttributeProto a; - *a.mutable_g() = value; - // Do not move attr_name as it is needed below - AddAttributeHelper(*this, attr_name, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, std::move(a)); - -#if !defined(ORT_MINIMAL_BUILD) - // subgraph is created via deserialization and not here in a minimal build - CreateSubgraph(attr_name); +ADD_ATTR_IMPLS(int64_t) +ADD_ATTR_IMPLS(float) +ADD_ATTR_IMPLS(std::string) +ADD_ATTR_IMPLS(TensorProto) +#if !defined(DISABLE_SPARSE_TENSORS) +ADD_ATTR_IMPLS(SparseTensorProto) #endif -}; +ADD_ATTR_IMPLS(TypeProto) + +#undef ADD_ATTR_SINGLE_IMPL +#undef ADD_ATTR_LIST_IMPL +#undef ADD_ATTR_IMPLS -void Node::AddAttribute(std::string attr_name, GraphProto&& value) { - AttributeProto a; - *a.mutable_g() = std::move(value); +void Node::AddAttribute(std::string attr_name, GraphProto value) { // Do not move attr_name as it is needed below - AddAttributeHelper(*this, attr_name, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, std::move(a)); + AttributeProto a = utils::MakeAttribute(attr_name, std::move(value)); + AddAttributeProto(std::move(a)); #if !defined(ORT_MINIMAL_BUILD) // subgraph is created via deserialization and not here in a minimal build @@ -955,24 +920,6 @@ void Node::AddAttribute(std::string attr_name, GraphProto&& value) { #endif }; -ADD_BASIC_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT, f) -ADD_BASIC_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INT, i) -ADD_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) -ADD_ATTR_MOVE_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) -ADD_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) -ADD_ATTR_MOVE_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) - -ADD_LIST_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOATS, floats) -ADD_LIST_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INTS, ints) -ADD_LIST_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS, strings) -ADD_LIST_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSORS, tensors) -ADD_LIST_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTOS, type_protos) -#if !defined(DISABLE_SPARSE_TENSORS) -ADD_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, sparse_tensor) -ADD_ATTR_MOVE_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, sparse_tensor) -ADD_LIST_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSORS, sparse_tensors) -#endif - #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) bool Node::ClearAttribute(const std::string& attr_name) { graph_->SetGraphResolveNeeded(); @@ -2588,8 +2535,9 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { // The attribute was not specified in the node. 
if (!attr_def.second.required) { if (utils::HasName(attr_def.second.default_value)) { + assert(attr_def.first == attr_def.second.default_value.name()); // Set default value to the node attributes. - node.AddAttribute(attr_def.first, attr_def.second.default_value); + node.AddAttributeProto(attr_def.second.default_value); } // TODO: Handle optional attribute but no default value specified in op definition. } else { diff --git a/onnxruntime/core/graph/node_attr_utils.cc b/onnxruntime/core/graph/node_attr_utils.cc new file mode 100644 index 0000000000000..120df9e5d43f3 --- /dev/null +++ b/onnxruntime/core/graph/node_attr_utils.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/graph/node_attr_utils.h" + +#include "core/common/common.h" +#include "core/framework/tensorprotoutils.h" + +using namespace ONNX_NAMESPACE; + +namespace onnxruntime::utils { + +static void SetNameAndType(std::string attr_name, AttributeProto_AttributeType attr_type, AttributeProto& a) { + a.set_name(std::move(attr_name)); + a.set_type(attr_type); +} + +#define MAKE_BASIC_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, type value) { \ + AttributeProto a; \ + a.set_##field(std::move(value)); \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +#define MAKE_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, type value) { \ + AttributeProto a; \ + *(a.mutable_##field()) = std::move(value); \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +#define MAKE_LIST_ATTR_IMPL(type, enumType, field) \ + AttributeProto MakeAttribute(std::string attr_name, gsl::span values) { \ + AttributeProto a; \ + auto* mutable_field = a.mutable_##field(); \ + for (const auto& val : values) { \ + *(mutable_field->Add()) = val; \ + } \ + SetNameAndType(std::move(attr_name), enumType, a); \ + return a; \ + } + +MAKE_BASIC_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INT, i) +MAKE_LIST_ATTR_IMPL(int64_t, AttributeProto_AttributeType::AttributeProto_AttributeType_INTS, ints) + +MAKE_BASIC_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT, f) +MAKE_LIST_ATTR_IMPL(float, AttributeProto_AttributeType::AttributeProto_AttributeType_FLOATS, floats) + +MAKE_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRING, s) +MAKE_LIST_ATTR_IMPL(std::string, AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS, strings) + +MAKE_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSOR, t) +MAKE_LIST_ATTR_IMPL(TensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TENSORS, tensors) + +#if !defined(DISABLE_SPARSE_TENSORS) +MAKE_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSOR, + sparse_tensor) +MAKE_LIST_ATTR_IMPL(SparseTensorProto, AttributeProto_AttributeType::AttributeProto_AttributeType_SPARSE_TENSORS, + sparse_tensors) +#endif + +MAKE_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTO, tp) +MAKE_LIST_ATTR_IMPL(TypeProto, AttributeProto_AttributeType::AttributeProto_AttributeType_TYPE_PROTOS, type_protos) + +MAKE_ATTR_IMPL(GraphProto, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPH, g) +MAKE_LIST_ATTR_IMPL(GraphProto, AttributeProto_AttributeType::AttributeProto_AttributeType_GRAPHS, 
graphs) + +#undef MAKE_BASIC_ATTR_IMPL +#undef MAKE_ATTR_IMPL +#undef MAKE_LIST_ATTR_IMPL + +std::pair SetNodeAttribute(AttributeProto attribute, + NodeAttributes& node_attributes) { + ORT_ENFORCE(utils::HasName(attribute), "AttributeProto must have a name."); + std::string name = attribute.name(); + return node_attributes.insert_or_assign(std::move(name), std::move(attribute)); +} + +} // namespace onnxruntime::utils diff --git a/onnxruntime/core/graph/node_attr_utils.h b/onnxruntime/core/graph/node_attr_utils.h new file mode 100644 index 0000000000000..94242e8d26404 --- /dev/null +++ b/onnxruntime/core/graph/node_attr_utils.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include + +#include "onnx/onnx_pb.h" + +#include "core/graph/basic_types.h" + +namespace onnxruntime::utils { + +// keep these signatures in sync with DECLARE_MAKE_ATTRIBUTE_FNS below +/** Creates an AttributeProto with the specified name and value. */ +ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, int64_t value); +/** Creates an AttributeProto with the specified name and values. */ +ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, gsl::span values); + +#define DECLARE_MAKE_ATTRIBUTE_FNS(type) \ + ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, type value); \ + ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, gsl::span values) + +DECLARE_MAKE_ATTRIBUTE_FNS(float); +DECLARE_MAKE_ATTRIBUTE_FNS(std::string); +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::TensorProto); +#if !defined(DISABLE_SPARSE_TENSORS) +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::SparseTensorProto); +#endif +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::TypeProto); +DECLARE_MAKE_ATTRIBUTE_FNS(ONNX_NAMESPACE::GraphProto); + +#undef DECLARE_MAKE_ATTRIBUTE_FNS + +// The below overload is made so the compiler does not attempt to resolve +// string literals with the gsl::span overload +inline ONNX_NAMESPACE::AttributeProto MakeAttribute(std::string attr_name, const char* value) { + return MakeAttribute(std::move(attr_name), std::string{value}); +} + +/** + * Sets an attribute in `node_attributes` with key `attribute.name()` and value `attribute`. + * If an attribute with the same name exists, it will be overwritten. + * @return Pair of (iterator to attribute, whether attribute was added (true) or updated (false)). 
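[Editor's note — illustrative aside, not part of the patch.] The new `utils::MakeAttribute` overloads and `SetNodeAttribute` declared in node_attr_utils.h above have a rough Python analogue in the ONNX package's `onnx.helper.make_attribute`, which likewise infers the `AttributeProto` type from the value; the dict assignment below mirrors the documented insert-or-assign behaviour. The mapping is approximate and offered only for readers who think in Python.

```python
from onnx import helper

# name -> AttributeProto, loosely analogous to onnxruntime's NodeAttributes map.
node_attributes = {}

axis_attr = helper.make_attribute("axis", 1)               # becomes an INT attribute
scales_attr = helper.make_attribute("scales", [1.0, 2.0])  # becomes a FLOATS attribute

for attr in (axis_attr, scales_attr):
    # insert-or-assign: an existing attribute with the same name is overwritten,
    # matching the SetNodeAttribute contract described above.
    node_attributes[attr.name] = attr
```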
+ */ +std::pair SetNodeAttribute(ONNX_NAMESPACE::AttributeProto attribute, + NodeAttributes& node_attributes); + +} // namespace onnxruntime::utils diff --git a/onnxruntime/core/graph/signal_ops/signal_defs.cc b/onnxruntime/core/graph/signal_ops/signal_defs.cc index 6b78bf075af5d..70720b8f85ed0 100644 --- a/onnxruntime/core/graph/signal_ops/signal_defs.cc +++ b/onnxruntime/core/graph/signal_ops/signal_defs.cc @@ -42,6 +42,24 @@ static T get_scalar_value_from_tensor(const ONNX_NAMESPACE::TensorProto* t) { } } +inline const ONNX_NAMESPACE::TensorShapeProto* getOptionalInputShape(ONNX_NAMESPACE::InferenceContext& ctx, size_t n) { + const auto* input_type = ctx.getInputType(n); + + if (input_type == nullptr) { + return nullptr; + } + + const auto value_case = input_type->value_case(); + if (value_case != ONNX_NAMESPACE::TypeProto::kTensorType && value_case != ONNX_NAMESPACE::TypeProto::kSparseTensorType) { + fail_type_inference("Attribute expected to have tensor or sparse tensor type"); + } + if (value_case == ONNX_NAMESPACE::TypeProto::kTensorType) { + return &input_type->tensor_type().shape(); + } else { + return &input_type->sparse_tensor_type().shape(); + } +} + void RegisterSignalSchemas() { MS_SIGNAL_OPERATOR_SCHEMA(DFT) .SetDomain(kMSExperimentalDomain) @@ -53,132 +71,242 @@ void RegisterSignalSchemas() { "Values can be 0 or 1.", AttributeProto::AttributeType::AttributeProto_AttributeType_INT, static_cast(0)) + .Attr("axis", + "The axis on which to perform the DFT. By default this value is set to 0, which corresponds to the first dimension after the batch index." + "This value must be less than signal_dimN, where signal_dimN is the number of dimensions in the signal.", + AttributeProto::AttributeType::AttributeProto_AttributeType_INT, + static_cast(0)) .Input(0, - "input", - "For complex input, the following shape is expected: [batch_idx][n_fft][2]" - "The final dimension represents the real and imaginary parts of the value." - "For real input, the following shape is expected: [batch_idx][n_fft]" - "The first dimension is the batch dimension.", - "T") + "input", + "For real input, the following shape is expected: [batch_idx][n_fft]." + "For complex input, the following shape is expected: [batch_idx][n_fft][2]." + "The final dimension represents the real and imaginary parts of the value." + "For real multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]." + "For complex multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]." + "The first dimension is the batch dimension.", + "T") .Output(0, "output", "The Fourier Transform of the input vector." 
- "If onesided is 1, [batch_idx][floor(n_fft/2)+1][2]" - "If onesided is 0, [batch_idx][n_fft][2]", + "If signal_dimN = 1, and onesided is 0, [batch_idx][n_fft][2]" + "If signal_dimN = 1, and onesided is 1, [batch_idx][floor(n_fft/2)+1][2]" + "If signal_dimN = 2, and onesided is 0 and axis = 0, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 0 and axis = 1, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 1 and axis = 0, [batch_idx][floor(signal_dim1/2)+1][signal_dim2][2]" + "If signal_dimN = 2, and onesided is 1 and axis = 1, [batch_idx][signal_dim1][floor(signal_dim2/2)+1][2]", "T") - .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - int64_t ndim = 1; - - bool is_onesided = true; - auto attr_proto = ctx.getAttribute("onesided"); - if (attr_proto && attr_proto->has_i()) { - is_onesided = static_cast(attr_proto->i()); - } + propagateElemTypeFromInputToOutput(ctx, 0, 0); + const int64_t batch_ndim = 1; - if (ctx.getInputType(0)->tensor_type().has_shape()) { auto& input_shape = getInputShape(ctx, 0); - ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + auto dim_size = static_cast(input_shape.dim_size()); + auto has_component_dimension = dim_size > 2; + ONNX_NAMESPACE::TensorShapeProto result_shape_proto = input_shape; + + bool is_onesided = static_cast(getAttribute(ctx, "onesided", 0)); if (is_onesided) { - auto n_fft = input_shape.dim(1).dim_value(); - result_shape.mutable_dim(1)->set_dim_value((n_fft >> 1) + 1); + // Since signal_ndim = 1, and multidimensional DFT is not supported, + // only the single signal dim (1) needs to be updated + auto n_fft = input_shape.dim(1).dim_value(); + result_shape_proto.mutable_dim(1)->set_dim_value((n_fft >> 1) + 1); } - - auto dim_size = static_cast(input_shape.dim_size()); - if (dim_size == ndim + 1) { // real input - result_shape.add_dim()->set_dim_value(2); // output is same shape, but with extra dim for 2 values (real/imaginary) - } else if (dim_size == ndim + 2) { // complex input, do nothing + + if (has_component_dimension) { + result_shape_proto.mutable_dim(static_cast(dim_size - 1))->set_dim_value(2); } else { - fail_shape_inference( - "the input_shape must [batch_idx][n_fft] for real values or [batch_idx][n_fft][2] for complex values.") + result_shape_proto.add_dim()->set_dim_value(2); } - updateOutputShape(ctx, 0, result_shape); - } + + updateOutputShape(ctx, 0, result_shape_proto); }); - ; MS_SIGNAL_OPERATOR_SCHEMA(IDFT) .SetDomain(kMSExperimentalDomain) .SinceVersion(1) .SetDoc(R"DOC(IDFT)DOC") + .Attr("axis", + "The axis on which to perform the DFT. By default this value is set to 0, which corresponds to the first dimension after the batch index." + "This value must be less than signal_dimN, where signal_dimN is the number of dimensions in the signal.", + AttributeProto::AttributeType::AttributeProto_AttributeType_INT, + static_cast(0)) .Input(0, "input", - "A complex signal of dimension signal_ndim." - "The last dimension of the tensor should be 2," - "representing the real and imaginary components of complex numbers," - "and should have at least signal_ndim + 2 dimensions." + "For real input, the following shape is expected: [batch_idx][n_fft]." 
+ "For complex input, the following shape is expected: [batch_idx][n_fft][2]." + "The final dimension represents the real and imaginary parts of the value." + "For real multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]." + "For complex multi-dimensional input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]." "The first dimension is the batch dimension.", "T") .Output(0, "output", - "The inverse fourier transform of the input vector," - "using the same format as the input.", - "T") - .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") + "The inverse discrete Fourier transform of the input. " + "If signal_dimN = 1, [batch_idx][n_fft][2]" + "If signal_dimN = 2 and axis = 0, [batch_idx][signal_dim1][signal_dim2][2]" + "If signal_dimN = 2 and axis = 1, [batch_idx][signal_dim1][signal_dim2][2]" + "For all types of input, the last dimension of the output represents the components of a complex number.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - int64_t ndim = 1; - auto attr_proto = ctx.getAttribute("signal_ndim"); - if (attr_proto && attr_proto->has_i()) { - ndim = static_cast(attr_proto->i()); - } + propagateElemTypeFromInputToOutput(ctx, 0, 0); + const int64_t batch_ndim = 1; + + auto& input_shape = getInputShape(ctx, 0); + ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + auto dim_size = static_cast(input_shape.dim_size()); + auto has_component_dimension = dim_size > 2; - auto& input_shape = getInputShape(ctx, 0); - ONNX_NAMESPACE::TensorShapeProto result_shape = input_shape; + if (has_component_dimension) { + result_shape.mutable_dim(static_cast(dim_size - 1))->set_dim_value(2); + } else { + result_shape.add_dim()->set_dim_value(2); + } - auto dim_size = static_cast(input_shape.dim_size()); - if (dim_size == ndim + 1) { // real input - result_shape.add_dim()->set_dim_value(2); // output is same shape, but with extra dim for 2 values (real/imaginary) - } else if (dim_size == ndim + 2) { // complex input, do nothing - } else { - fail_shape_inference( - "the input_shape must have 1 + signal_ndim dimensions for real inputs, or 2 + signal_ndim dimensions for complex input.") - } - - updateOutputShape(ctx, 0, result_shape); + updateOutputShape(ctx, 0, result_shape); }); MS_SIGNAL_OPERATOR_SCHEMA(STFT) .SetDomain(kMSExperimentalDomain) .SinceVersion(1) .SetDoc(R"DOC(STFT)DOC") - .Attr("onesided", - "If True (default), only values for half of the fft size are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry." - "The output tensor will return the first floor(n_fft/2) + 1 values from the DFT." - "Values can be 0 or 1.", - AttributeProto::AttributeType::AttributeProto_AttributeType_INT, - static_cast(1)) + .Attr( + "onesided", + "If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because " + "the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m,w]=X[m,n_fft-w]*. " + "Note if the input or window tensors are complex, then onesided output is not possible. 
" + "Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT)." + "When invoked with real or complex valued input, the default value is 0. " + "Values can be 0 or 1.", + AttributeProto::INT, + static_cast(0)) .Input(0, "signal", - "A complex signal of dimension signal_ndim." - "The last dimension of the tensor should be 2," - "representing the real and imaginary components of complex numbers," - "and should have at least signal_ndim + 2 dimensions." - "The first dimension is the batch dimension.", - "T1") + "Input tensor representing a real or complex valued signal. " + "For real input, the following shape is expected: [batch_size][signal_length]. " + "For complex input, the following shape is expected: [batch_size][signal_length][2], where " + "[batch_size][signal_length][0] represents the real component and [batch_size][signal_length][1] represents the imaginary component of the signal.", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) .Input(1, + "frame_step", + "The number of samples to step between successive DFTs.", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input(2, "window", - "A tensor representing the window that will be slid over the input signal.", + "A tensor representing the window that will be slid over the signal." + "The window must have rank 1 with shape: [window_shape]. " + "It's an optional value. ", "T1", - OpSchema::FormalParameterOption::Optional) - .Input(2, - "frame_length", // frame_length, fft_length, pad_mode - "Size of the fft.", - "T2", - OpSchema::FormalParameterOption::Optional) + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) .Input(3, - "frame_step", - "The number of samples to step between successive DFTs.", - "T2") + "frame_length", + "A scalar representing the size of the DFT. " + "It's an optional value.", + "T2", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) .Output(0, "output", "The inverse fourier transform of the input vector," "using the same format as the input.", "T1") - .TypeConstraint("T1", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "") - .TypeConstraint("T2", {"tensor(int64)"}, ""); + .TypeConstraint( + "T1", + {"tensor(float)", + "tensor(float16)", + "tensor(double)", + "tensor(bfloat16)"}, + "Constrain signal and output to float tensors.") + .TypeConstraint( + "T2", + {"tensor(int64)"}, + "Constrain scalar length types to int64_t.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + constexpr int64_t batch_ndim = 1; + constexpr int64_t component_ndim = 1; + + // Get inputs + auto& input_shape = getInputShape(ctx, 0); + auto frame_step = get_scalar_value_from_tensor(ctx.getInputData(1)); + const ONNX_NAMESPACE::TensorShapeProto* window_input = nullptr; + try { + window_input = getOptionalInputShape(ctx, 2); + } catch (...) { + window_input = nullptr; + } + + const ONNX_NAMESPACE::TensorShapeProto* frame_length_input = nullptr; + try { + frame_length_input = getOptionalInputShape(ctx, 3); + } catch (...) { + frame_length_input = nullptr; + } + + // Determine the size of the DFT based on the 2 optional inputs window and frame_length. One must be set. 
+ int64_t dft_size = 0; + if (window_input == nullptr && frame_length_input == nullptr) { + fail_type_inference("STFT expects to have at least one of these inputs set: [window, frame_length]."); + } else if (window_input != nullptr && window_input->dim_size() > 0 && frame_length_input != nullptr) { + if (window_input->dim_size() != 1) { + fail_type_inference("STFT's window input, must have rank = 1."); + } + auto window_length = window_input->dim(0).dim_value(); + auto frame_length = get_scalar_value_from_tensor(ctx.getInputData(3)); + if (window_length != frame_length) { + fail_type_inference("If STFT has both a window input and frame_length specified, the dimension of the window must match the frame_length specified!"); + } + dft_size = window_length; + } else if (window_input != nullptr && window_input->dim_size() > 0) { + if (window_input->dim_size() != 1) { + fail_type_inference("STFT's window input, must have rank = 1."); + } + dft_size = window_input->dim(0).dim_value(); + } else if (frame_length_input != nullptr) { + dft_size = get_scalar_value_from_tensor(ctx.getInputData(3)); + } + + bool is_onesided = static_cast(getAttribute(ctx, "onesided", 0)); + if (is_onesided) { + dft_size = is_onesided ? ((dft_size >> 1) + 1) : dft_size; + } + + auto signal_size = input_shape.dim(1).dim_value(); + auto n_dfts = static_cast(std::floor((signal_size - dft_size) / static_cast(frame_step)) + 1); + + // The output has the following shape: [batch_size][frames][dft_unique_bins][2] + ONNX_NAMESPACE::TensorShapeProto result_shape_proto; + result_shape_proto.add_dim()->set_dim_value(input_shape.dim(0).dim_value()); // batch size + result_shape_proto.add_dim()->set_dim_value(n_dfts); + result_shape_proto.add_dim()->set_dim_value(dft_size); + result_shape_proto.add_dim()->set_dim_value(2); + updateOutputShape(ctx, 0, result_shape_proto); + }); // Window Functions MS_SIGNAL_OPERATOR_SCHEMA(HannWindow) diff --git a/onnxruntime/core/optimizer/bias_dropout_fusion.cc b/onnxruntime/core/optimizer/bias_dropout_fusion.cc index e81dd40c482d5..31f800aa6f0cd 100644 --- a/onnxruntime/core/optimizer/bias_dropout_fusion.cc +++ b/onnxruntime/core/optimizer/bias_dropout_fusion.cc @@ -188,10 +188,10 @@ Status BiasDropoutFusion::ApplyImpl(Graph& graph, bool& modified, int graph_leve kMSDomain); // Get attribute "seed" from "Dropout" node if available. - NodeAttributes dropout_attrs = dropout_node.GetAttributes(); + const NodeAttributes& dropout_attrs = dropout_node.GetAttributes(); NodeAttributes::const_iterator seed = dropout_attrs.find("seed"); if (seed != dropout_attrs.end()) { - dropout_add_fusion_node.AddAttribute("seed", seed->second); + dropout_add_fusion_node.AddAttributeProto(seed->second); } // Assign provider to this new node. Provider should be same as the provider for old node. 
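For reference, here is a minimal usage sketch (not part of the patch) of the node_attr_utils helpers added above. The attribute names mirror the ones used by the fused-Conv code in the next file, which replaces its local SetStringAttribute/SetFloatsAttribute helpers with these shared utilities; the wrapper function name and the gsl::span<const float> element type are assumptions made for illustration, since the template arguments are not visible here.

#include <gsl/gsl>

#include "core/graph/basic_types.h"
#include "core/graph/node_attr_utils.h"

namespace onnxruntime {

// Illustrative only: build attributes for a hypothetical fused node.
NodeAttributes MakeExampleFusedAttributes() {
  NodeAttributes attrs;

  // String attribute via the const char* convenience overload.
  utils::SetNodeAttribute(utils::MakeAttribute("activation", "Relu"), attrs);

  // Float-list attribute; the span overload is assumed to take gsl::span<const float>.
  const float params[] = {0.2f, 0.5f};
  utils::SetNodeAttribute(utils::MakeAttribute("activation_params", gsl::make_span(params)), attrs);

  // Re-inserting with the same name overwrites the existing entry (insert_or_assign semantics).
  utils::SetNodeAttribute(utils::MakeAttribute("activation", "LeakyRelu"), attrs);

  return attrs;
}

}  // namespace onnxruntime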
diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index 4d6fdebbe49a1..a02aa309a0bd2 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -8,6 +8,7 @@ #include "core/common/inlined_containers.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/graph_utils.h" +#include "core/graph/node_attr_utils.h" #include "core/optimizer/utils.h" #include "core/optimizer/selectors_actions/actions.h" @@ -137,23 +138,6 @@ class ConvAddRelu : public NodeSelector { #endif // !defined(ORT_MINIMAL_BUILD) namespace actions { -// TODO refactor to lift common logic from Node::AddAttribute() -void SetStringAttribute(std::string name, std::string value, NodeAttributes& attributes) { - ONNX_NAMESPACE::AttributeProto a{}; - a.set_name(name); - a.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING); - a.set_s(std::move(value)); - attributes.insert_or_assign(std::move(name), std::move(a)); -}; - -void SetFloatsAttribute(std::string name, gsl::span value, NodeAttributes& attributes) { - ONNX_NAMESPACE::AttributeProto a{}; - a.set_name(name); - a.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOATS); - a.mutable_floats()->Assign(value.begin(), value.end()); - attributes.insert_or_assign(std::move(name), std::move(a)); -}; - using NTO = NodesToOptimize; class FuseConvActivation : public ReplaceWithNew { @@ -169,7 +153,7 @@ class FuseConvActivation : public ReplaceWithNew { ORT_ENFORCE(activation != nullptr, "Expected activation node."); const auto& activation_op_type = activation->OpType(); - SetStringAttribute("activation", activation_op_type, extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation", activation_op_type), extra_fused_conv_attributes); InlinedVector activation_params; if (activation_op_type == "LeakyRelu") { @@ -190,7 +174,8 @@ class FuseConvActivation : public ReplaceWithNew { } if (!activation_params.empty()) { - SetFloatsAttribute("activation_params", activation_params, extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation_params", activation_params), + extra_fused_conv_attributes); } return extra_fused_conv_attributes; @@ -215,7 +200,7 @@ class FuseConvAddRelu : public ReplaceWithNew { NodeAttributes ExtraAttributes(const RuntimeState&) const override { NodeAttributes extra_fused_conv_attributes; - SetStringAttribute("activation", "Relu", extra_fused_conv_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("activation", "Relu"), extra_fused_conv_attributes); return extra_fused_conv_attributes; } diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 9bf517226fa7f..6b9c0f897a4e4 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -38,11 +38,7 @@ static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_ kOnnxDomain); // Add attribute: "to" = 6 - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_INT32)); - node.AddAttribute("to", std::move(to)); + node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32}); node.SetExecutionProviderType(provider_type); return &cast32; @@ -525,7 +521,7 @@ static void 
CreateEmbedLayernormNode(Graph& graph, NodeAttributes ln_attrs = layer_norm_node.GetAttributes(); NodeAttributes::const_iterator epsilon = ln_attrs.find("epsilon"); if (epsilon != ln_attrs.end()) { - embed_layer_norm_node.AddAttribute("epsilon", epsilon->second); + embed_layer_norm_node.AddAttributeProto(epsilon->second); } else { embed_layer_norm_node.AddAttribute("epsilon", contrib::kDefaultEmbedLayerNormEpsilon); } diff --git a/onnxruntime/core/optimizer/gemm_activation_fusion.cc b/onnxruntime/core/optimizer/gemm_activation_fusion.cc index 21e3b40e10d1c..9c0f0a8d202b2 100644 --- a/onnxruntime/core/optimizer/gemm_activation_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_activation_fusion.cc @@ -83,7 +83,7 @@ Status GemmActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l for (const auto& attr : attrs) { AttributeProto fused_gemm_attr(attr.second); fused_gemm_attr.set_name("activation_" + attr.first); - fused_gemm.AddAttribute("activation_" + attr.first, std::move(fused_gemm_attr)); + fused_gemm.AddAttributeProto(std::move(fused_gemm_attr)); } // move output definitions and edges from act_node to fused_gemm. delete gemm_node and act_node. diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index 019b5a3949799..af8183b9e63bc 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -447,7 +447,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr continue; } bool cast_1_present = false; - int64_t cast_1_to_attr; + int64_t cast_1_to_attr{}; // check if there are Casts as input to the Pow and Div if (p_div_input == p_pow_input) { const Node* p_pow_input_node = graph_utils::GetInputNode(pow_node, 0); @@ -574,7 +574,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr // Assign provider to this new node. Provider should be same as the provider for old node. 
layer_norm_node.SetExecutionProviderType(reduce_mean_node.GetExecutionProviderType()); - if (allow_precision_change_ && p_cast_2 != nullptr) { + if (allow_precision_change_ && cast_1_present && p_cast_2 != nullptr) { ONNX_NAMESPACE::TensorProto_DataType cast_1_type = gsl::narrow_cast(cast_1_to_attr); const ONNX_NAMESPACE::TypeProto* casted_type = DataTypeImpl::TensorTypeFromONNXEnum(cast_1_type)->GetTypeProto(); NodeArg* LN_output = &graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("layer_norm_out"), casted_type); diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index ad8fb13b56a49..a81ca67052db0 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -248,7 +248,7 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le NodeAttributes ln_attrs = ln_node.GetAttributes(); NodeAttributes::const_iterator epsilon = ln_attrs.find("epsilon"); if (epsilon != ln_attrs.end()) { - skip_layer_norm_node.AddAttribute("epsilon", epsilon->second); + skip_layer_norm_node.AddAttributeProto(epsilon->second); } else { skip_layer_norm_node.AddAttribute("epsilon", contrib::kDefaultSkipLayerNormEpsilon); } diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc index 26e99574b1871..a4efbf6a90e74 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc @@ -326,7 +326,7 @@ void ApiNode::CopyAttributes(const api::NodeRef& node) { const ApiNode& ort_node = static_cast(node); const NodeAttributes& attributes = ort_node.node_.GetAttributes(); for (const auto& pair : attributes) { - node_.AddAttribute(pair.first, pair.second); + node_.AddAttributeProto(pair.second); } } diff --git a/onnxruntime/core/platform/posix/ort_mutex.cc b/onnxruntime/core/platform/posix/ort_mutex.cc index 8a5d41eb36080..e124ce168085f 100644 --- a/onnxruntime/core/platform/posix/ort_mutex.cc +++ b/onnxruntime/core/platform/posix/ort_mutex.cc @@ -5,7 +5,6 @@ #include "core/platform/ort_mutex.h" #include #include -#include #include namespace onnxruntime { diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 1a3818b758def..1a1f30fdf97a5 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -16,7 +16,6 @@ limitations under the License. 
#include "core/platform/env.h" -#include #include #include diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 81cdb6d5577f3..f9faae787ac4a 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -202,7 +202,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, // Transpose021 has a specialized Transpose3DImpl kernel dim3 grid_size, block_size; - if (CanDoTranspose3D(prop, new_rank, new_input_dims, new_permutations, grid_size, block_size)) { + if (CanDoTranspose3D(prop, static_cast(new_rank), new_input_dims, new_permutations, grid_size, block_size)) { TensorPitches new_input_strides(new_input_dims); return Transpose3DImpl(stream, element_size, ToConstSpan(new_input_dims), ToConstSpan(new_input_strides), input.DataRaw(), output.MutableDataRaw(), output.Shape().Size(), grid_size, block_size); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 0b77a2012ef14..3d79e126a2e7a 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -7,44 +7,52 @@ namespace onnxruntime { namespace cuda { -constexpr unsigned int TILE_DIM = 16; +constexpr unsigned int NUM_ELE_PER_THREAD = 4; -template -__global__ void Transpose3DKernel(const TArray input_shape, - const TArray input_strides, +template +__global__ void Transpose3DKernel(const TArray input_shape, const TArray input_strides, const T* input_data, T* output_data) { - __shared__ T tile[TILE_DIM * (TILE_DIM + 1)]; + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; int x = blockIdx.x * TILE_DIM + threadIdx.x; int y = blockIdx.y * TILE_DIM + threadIdx.y; - tile[threadIdx.y * TILE_DIM + threadIdx.x] = input_data[blockIdx.z * input_strides[0] + y * input_shape[2] + x]; +#pragma unroll + for (unsigned int i = 0; i < TILE_DIM; i += (TILE_DIM / NUM_ELE_PER_THREAD)) { + tile[threadIdx.y + i][threadIdx.x] = input_data[blockIdx.z * input_strides[0] + (y + i) * input_shape[2] + x]; + } __syncthreads(); x = blockIdx.y * TILE_DIM + threadIdx.x; y = blockIdx.x * TILE_DIM + threadIdx.y; - output_data[blockIdx.z * input_strides[0] + y * input_shape[1] + x] = tile[threadIdx.x * TILE_DIM + threadIdx.y]; +#pragma unroll + for (unsigned int i = 0; i < TILE_DIM; i += (TILE_DIM / NUM_ELE_PER_THREAD)) { + output_data[blockIdx.z * input_strides[0] + (y + i) * input_shape[1] + x] = tile[threadIdx.x][threadIdx.y + i]; + } } -bool CanDoTranspose3D(const cudaDeviceProp& prop, - int32_t rank, - const gsl::span& input_dims, - const gsl::span& permutations, - dim3& grid_size, dim3& block_size) { - if (rank == 3 && - // permutation is done in the last two dimensions. - permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2) && - // the last two dimensions are aligned with TILE_DIM. - input_dims[rank - 2] % TILE_DIM == 0 && input_dims[rank - 1] % TILE_DIM == 0) { - int grid_size_x = static_cast(input_dims[2] / TILE_DIM); - int grid_size_y = static_cast(input_dims[1] / TILE_DIM); +bool CanDoTranspose3D(const cudaDeviceProp& prop, size_t rank, const gsl::span& input_dims, + const gsl::span& permutations, dim3& grid_size, dim3& block_size) { + // Permutation is done in the last two dimensions and the last two dimensions are aligned with TILE_DIM. 
+ if (rank == 3 && permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2)) { + unsigned int tile_dim = 0; + if (input_dims[rank - 2] % 32 == 0 && input_dims[rank - 1] % 32 == 0) { + tile_dim = 32; + } else if (input_dims[rank - 2] % 16 == 0 && input_dims[rank - 1] % 16 == 0) { + tile_dim = 16; + } else { + return false; + } + + int grid_size_x = static_cast(input_dims[2] / tile_dim); + int grid_size_y = static_cast(input_dims[1] / tile_dim); int grid_size_z = static_cast(input_dims[0]); - if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] && grid_size_z <= prop.maxGridSize[2]) { - block_size = dim3(TILE_DIM, TILE_DIM); - grid_size = dim3(static_cast(grid_size_x), - static_cast(grid_size_y), + if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] && + grid_size_z <= prop.maxGridSize[2]) { + block_size = dim3(tile_dim, tile_dim / NUM_ELE_PER_THREAD); + grid_size = dim3(static_cast(grid_size_x), static_cast(grid_size_y), static_cast(grid_size_z)); return true; } else { @@ -54,34 +62,28 @@ bool CanDoTranspose3D(const cudaDeviceProp& prop, return false; } -Status Transpose3DImpl(cudaStream_t stream, size_t element_size, - const TArray& input_shape, const TArray& input_strides, - const void* input_data, void* output_data, int64_t N, const dim3& grid_size, const dim3& block_size) { +#define CALL_TRANSPOSE_3D(type, tile_dim) \ + Transpose3DKernel<<>>( \ + input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), \ + reinterpret_cast::MappedType*>(output_data)) + +#define HANDLE_TRANSPOSE_3D_TILE_DIM(type) \ + case sizeof(type): { \ + if (block_size.x == 32) { \ + CALL_TRANSPOSE_3D(type, 32); \ + } else { \ + CALL_TRANSPOSE_3D(type, 16); \ + } \ + } break + +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, + const TArray& input_strides, const void* input_data, void* output_data, int64_t N, + const dim3& grid_size, const dim3& block_size) { switch (element_size) { - case sizeof(int8_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int16_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int32_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; - case sizeof(int64_t): - Transpose3DKernel<<>>( - input_shape, input_strides, - reinterpret_cast::MappedType*>(input_data), - reinterpret_cast::MappedType*>(output_data)); - break; + HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int16_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int32_t); + HANDLE_TRANSPOSE_3D_TILE_DIM(int64_t); default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. 
Element size was ", element_size); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h index 4e4d7d8bcabab..96d2686170e1d 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { bool CanDoTranspose3D(const cudaDeviceProp& prop, - int32_t rank, const gsl::span& input_dims, const gsl::span& permutations, + size_t rank, const gsl::span& input_dims, const gsl::span& permutations, dim3& grid_size, dim3& block_size); Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp index b029b56bb1b43..54223e450f925 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphTransformer.cpp @@ -214,7 +214,7 @@ namespace Dml // Change the name of the attribute to its fused node version std::string fusedAttributeName = Dml::FusionHelpers::GetFusedAttributeName(attribute.first); attribute.second.set_name(fusedAttributeName); - node.AddAttribute(fusedAttributeName, attribute.second); + node.AddAttributeProto(attribute.second); } } } diff --git a/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc b/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc deleted file mode 100644 index f066c6023ebf2..0000000000000 --- a/onnxruntime/core/providers/hailo/hailo_global_vdevice.cc +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright (c) 2022 Hailo Technologies Ltd. All rights reserved. - * Distributed under the MIT license (https://opensource.org/licenses/MIT) - **/ - -#include "core/providers/shared_library/provider_api.h" -#include "hailo_global_vdevice.h" - -namespace onnxruntime { - -std::mutex GlobalVDevice::m_mutex; - -GlobalVDevice& GlobalVDevice::get_instance() -{ - static GlobalVDevice instance; - return instance; -} - -std::shared_ptr GlobalVDevice::get_vdevice() -{ - std::lock_guard lock(m_mutex); - if (!m_vdevice) { - m_vdevice = create_vdevice(); - } - - return m_vdevice; -} - -void GlobalVDevice::release() -{ - std::lock_guard lock(m_mutex); - if (m_vdevice.use_count() <= 1) { - m_vdevice.reset(); - } -} - -std::shared_ptr GlobalVDevice::create_vdevice() -{ - hailo_vdevice_params_t params; - auto status = hailo_init_vdevice_params(¶ms); - HAILO_ORT_ENFORCE(HAILO_SUCCESS == status, "Failed init vdevice_params, status = ", status); - params.scheduling_algorithm = HAILO_SCHEDULING_ALGORITHM_ROUND_ROBIN; - - auto vdevice = VDevice::create(params); - HAILO_CHECK_EXPECTED(vdevice, "Creating VDevice failed"); - return vdevice.release(); -} - -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/hailo/hailo_global_vdevice.h b/onnxruntime/core/providers/hailo/hailo_global_vdevice.h deleted file mode 100644 index df8f81cd1410c..0000000000000 --- a/onnxruntime/core/providers/hailo/hailo_global_vdevice.h +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright (c) 2022 Hailo Technologies Ltd. All rights reserved. 
- * Distributed under the MIT license (https://opensource.org/licenses/MIT) - **/ - -#pragma once -#include "hailo/hailort.hpp" -#include "utils.h" - -namespace onnxruntime { - -using hailort::VDevice; - -class GlobalVDevice { -public: - static GlobalVDevice& get_instance(); - std::shared_ptr get_vdevice(); - void release(); - - static std::mutex m_mutex; - -private: - GlobalVDevice() : m_vdevice() {} - GlobalVDevice(GlobalVDevice const&) = delete; - void operator=(GlobalVDevice const&) = delete; - - std::shared_ptr create_vdevice(); - - std::shared_ptr m_vdevice; -}; - -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/hailo/hailo_op.cc b/onnxruntime/core/providers/hailo/hailo_op.cc index dbe603b13ce9d..65b8e8906b92a 100644 --- a/onnxruntime/core/providers/hailo/hailo_op.cc +++ b/onnxruntime/core/providers/hailo/hailo_op.cc @@ -6,7 +6,6 @@ #include "core/providers/shared_library/provider_api.h" #include "hailo_op.h" #include "utils.h" -#include "hailo_global_vdevice.h" #include #include @@ -50,7 +49,16 @@ HailoKernel::HailoKernel(const OpKernelInfo& info) : OpKernel(info), m_mutex() HAILO_ORT_ENFORCE(status.IsOK(), "attribute '", OUTPUT_ORDER_ATTRIBUTE, "' is not set"); m_hef = create_hef_from_memory(binary_hef.c_str(), binary_hef.length()); - m_vdevice = GlobalVDevice::get_instance().get_vdevice(); + + hailo_vdevice_params_t params; + auto hailo_status = hailo_init_vdevice_params(¶ms); + HAILO_ORT_ENFORCE(HAILO_SUCCESS == hailo_status, "Failed init vdevice_params, status = ", hailo_status); + params.scheduling_algorithm = HAILO_SCHEDULING_ALGORITHM_ROUND_ROBIN; + params.group_id = "SHARED"; + auto expected_vdevice = VDevice::create(params); + HAILO_CHECK_EXPECTED(expected_vdevice, "Failed to create VDevice"); + m_vdevice = std::move(expected_vdevice.value()); + m_network_group = configure_network_group(*m_vdevice.get()); auto output_nodes = info.node().OutputDefs(); @@ -64,7 +72,6 @@ HailoKernel::~HailoKernel() m_network_group.reset(); m_vdevice.reset(); m_hef.reset(); - GlobalVDevice::get_instance().release(); } std::unique_ptr HailoKernel::create_hef_from_memory(const void* binary_hef, size_t size) diff --git a/onnxruntime/core/providers/tvm/custom_logging.cc b/onnxruntime/core/providers/tvm/custom_logging.cc index 08053e456aed9..3140683825da6 100644 --- a/onnxruntime/core/providers/tvm/custom_logging.cc +++ b/onnxruntime/core/providers/tvm/custom_logging.cc @@ -11,31 +11,33 @@ #include #include + // TODO(agladyshev): Make conditional choice of sep for Windows and UNIX std::string GetFileName(const std::string& file_path, char sep = '/') { - return {std::next(file_path.begin(), file_path.find_last_of(sep) + 1), - file_path.end()}; + return {std::next(file_path.begin(), file_path.find_last_of(sep) + 1), + file_path.end()}; } std::string GetTimedLogMessage(const std::string& file, int lineno, const std::string& message) { - std::stringstream sstream; - std::string file_name = GetFileName(file); - std::time_t t = std::time(nullptr); - sstream << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "][TVM] " - << file_name << ":" << lineno << ": " + message; - return sstream.str(); + std::stringstream sstream; + std::string file_name = GetFileName(file); + std::time_t t = std::time(nullptr); + sstream << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "][TVM] " + << file_name << ":" << lineno << ": " + message; + return sstream.str(); } namespace tvm { namespace runtime { namespace detail { - void LogFatalImpl(const std::string& file, 
int lineno, const std::string& message) { - throw std::runtime_error(GetTimedLogMessage(file, lineno, message)); - } + void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + throw std::runtime_error(GetTimedLogMessage(file, lineno, message)); + } + + void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << GetTimedLogMessage(file, lineno, message) << std::endl; + } - void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { - std::cerr << GetTimedLogMessage(file, lineno, message) << std::endl; - } } // namespace detail } // namespace runtime } // namespace tvm diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.cc b/onnxruntime/core/providers/tvm/tvm_allocator.cc index 4fadff5c57b9a..ef06e1f59a94a 100644 --- a/onnxruntime/core/providers/tvm/tvm_allocator.cc +++ b/onnxruntime/core/providers/tvm/tvm_allocator.cc @@ -8,6 +8,7 @@ namespace onnxruntime { +namespace tvm { void* TVMAllocator::Alloc(size_t size) { void* p = nullptr; @@ -24,4 +25,5 @@ void TVMAllocator::Free(void* p) { TVMDeviceFreeDataSpace(ctx, p); } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.h b/onnxruntime/core/providers/tvm/tvm_allocator.h index 11854024879c9..50a13f890ac86 100644 --- a/onnxruntime/core/providers/tvm/tvm_allocator.h +++ b/onnxruntime/core/providers/tvm/tvm_allocator.h @@ -7,7 +7,9 @@ #include "core/framework/allocator.h" #include "tvm_common.h" + namespace onnxruntime { +namespace tvm { #define TVM_ALLOC_ALIGN 128 @@ -22,14 +24,14 @@ class TVMAllocator : public IAllocator { : IAllocator(info) { switch (info.device.Type()) { case OrtDevice::CPU: - ctx = {kDLCPU, info.device.Id()}; - break; + ctx = {kDLCPU, info.device.Id()}; + break; case OrtDevice::GPU: - ctx = {kDLVulkan, info.device.Id()}; - break; + ctx = {kDLVulkan, info.device.Id()}; + break; default: - ORT_NOT_IMPLEMENTED("Unsupported device"); - break; + ORT_NOT_IMPLEMENTED("Unsupported device"); + break; } } @@ -38,5 +40,7 @@ class TVMAllocator : public IAllocator { DLDevice ctx; }; -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime + #endif // TVM_ALLOCATOR diff --git a/onnxruntime/core/providers/tvm/tvm_api.cc b/onnxruntime/core/providers/tvm/tvm_api.cc index f225e06528c7c..ff61c6c43d33e 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.cc +++ b/onnxruntime/core/providers/tvm/tvm_api.cc @@ -17,16 +17,9 @@ using TvmPackedFunc = ::tvm::PackedFunc; TvmModule TVMCompile(const std::string& onnx_txt, const std::string& model_path, - const std::string& executor, - const std::string& target, - const std::string& target_host, - int opt_level, + const TvmEPOptions& options, int opset, - bool freeze_params, - const std::vector>& input_shapes, - bool nhwc, - const std::string& tuning_logfile, - const std::string& tuning_type) + const TVMTensorShapes& input_shapes) { ::tvm::Array shapes; for (size_t i = 0; i < input_shapes.size(); ++i) @@ -41,19 +34,18 @@ TvmModule TVMCompile(const std::string& onnx_txt, const TvmPackedFunc* compile = ::tvm::runtime::Registry::Get("tvm_onnx_import_and_compile"); ORT_ENFORCE(compile != nullptr, "Unable to retrieve 'tvm_onnx_import_and_compile'."); - TvmModule mod = (*compile)( - TVMByteArray{onnx_txt.data(), onnx_txt.size()}, - model_path, - executor, - target, - target_host, - opt_level, - opset, - freeze_params, - shapes, - nhwc, - tuning_logfile, - tuning_type); + TvmModule mod = 
(*compile)(TVMByteArray{onnx_txt.data(), onnx_txt.size()}, + model_path, + options.executor, + options.target, + options.target_host, + options.opt_level, + opset, + options.freeze_weights, + shapes, + options.to_nhwc, + options.tuning_file_path, + options.tuning_type); ORT_ENFORCE(mod.get() != nullptr, "Compiled TVM Module is nullptr!"); return mod; } @@ -108,20 +100,19 @@ void TVM_VM_GetOutputs(TvmModule& mod, } void TVMGetOutputShapes(TvmModule& mod, - size_t num_outputs, - std::vector>& output_shapes) + TVMTensorShapes& output_shapes) { - output_shapes.clear(); + size_t size = output_shapes.size(); TvmPackedFunc get_output = mod.GetFunction("get_output", false); - for (size_t i = 0; i < num_outputs; ++i) { + for (size_t i = 0; i < size; ++i) { ::tvm::runtime::NDArray output_array = get_output(i); ::tvm::runtime::ShapeTuple shape_tuple = output_array.Shape(); size_t dims_num = shape_tuple.size(); - std::vector dims; + TensorShapeVector dims; for (size_t j = 0; j < dims_num; ++j) { dims.push_back(int64_t(shape_tuple[j])); } - output_shapes.push_back(dims); + output_shapes[i] = dims; } } diff --git a/onnxruntime/core/providers/tvm/tvm_api.h b/onnxruntime/core/providers/tvm/tvm_api.h index 291da438618a9..77241def6e745 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.h +++ b/onnxruntime/core/providers/tvm/tvm_api.h @@ -9,31 +9,27 @@ #include "tvm_common.h" #include "tvm_defaults.h" +#include "tvm_ep_options.h" + namespace onnxruntime { namespace tvm { - TvmModule TVMCompile(const std::string& onnx_txt, - const std::string& model_path, - const std::string& executor, - const std::string& target, - const std::string& target_host, - int opt_level, - int opset, - bool freeze_params, - const std::vector>& input_shapes, - bool nhwc = false, - const std::string& tuning_logfile = "", - const std::string& tuning_type = std::string(onnxruntime::tvm::default_tuning_type)); - void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); - void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); - void TVMGetOutputs(TvmModule& mod, std::vector& outputs); - void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); - void TVMGetOutputShapes(TvmModule& mod, - size_t num_outputs, - std::vector>& output_shapes); - void TVMRun(TvmModule& mod); - void TVM_VM_Run(TvmModule& mod); + + TvmModule TVMCompile(const std::string& onnx_txt, + const std::string& model_path, + const TvmEPOptions& options, + int opset, + const TVMTensorShapes& input_shapes); + void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); + void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); + void TVMGetOutputs(TvmModule& mod, std::vector& outputs); + void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); + void TVMGetOutputShapes(TvmModule& mod, + TVMTensorShapes& output_shapes); + void TVMRun(TvmModule& mod); + void TVM_VM_Run(TvmModule& mod); + } // namespace tvm } // namespace onnxruntime -#endif // TVM_API_H \ No newline at end of file +#endif // TVM_API_H diff --git a/onnxruntime/core/providers/tvm/tvm_common.h b/onnxruntime/core/providers/tvm/tvm_common.h index 5b3a0c4dea9ec..17120eb2a2afe 100644 --- a/onnxruntime/core/providers/tvm/tvm_common.h +++ b/onnxruntime/core/providers/tvm/tvm_common.h @@ -4,10 +4,20 @@ #ifndef TVM_COMMON_H #define TVM_COMMON_H +#include +#include + #include #include #include -using TvmModule = tvm::runtime::Module; + +namespace onnxruntime { +namespace tvm { + +using TvmModule = ::tvm::runtime::Module; + +} // 
namespace tvm +} // namespace onnxruntime #endif // TVM_COMMON_H diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.cc b/onnxruntime/core/providers/tvm/tvm_compiler.cc new file mode 100644 index 0000000000000..dfbf182506556 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_compiler.cc @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "tvm_compiler.h" +#include "tvm_api.h" + + +namespace onnxruntime { +namespace tvm { + +TVMCompiler::TVMCompiler(std::string&& onnx_model_str, + const std::string& model_path, + int opset) : +onnx_model_str_(std::move(onnx_model_str)), +model_path_(model_path), +opset_(opset) { +} + +auto TVMCompiler::operator()(const TvmEPOptions& options, + const TVMTensorShapes& input_shapes) -> ModulePtr { + if (mod_) { + return mod_; + } + + mod_ = std::make_shared(); + *mod_ = tvm::TVMCompile(onnx_model_str_, + model_path_, + options, + opset_, + input_shapes); + onnx_model_str_.clear(); + return mod_; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.h b/onnxruntime/core/providers/tvm/tvm_compiler.h new file mode 100644 index 0000000000000..057ed058fd4c4 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_compiler.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef TVM_COMPILER_H +#define TVM_COMPILER_H + +#include +#include + +#include "tvm_common.h" +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +class TVMCompiler { + using ModulePtr = std::shared_ptr; +public: + TVMCompiler() = delete; + ~TVMCompiler() = default; + + TVMCompiler(std::string&& onnx_model_str, + const std::string& model_path, + int opset); + + ModulePtr operator()(const TvmEPOptions& options, + const TVMTensorShapes& input_shapes); + +private: + ModulePtr mod_; + std::string onnx_model_str_; + std::string model_path_; + int opset_; +}; + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_COMPILER_H diff --git a/onnxruntime/core/providers/tvm/tvm_defaults.h b/onnxruntime/core/providers/tvm/tvm_defaults.h index e7928a1941176..030a4ea05d56e 100644 --- a/onnxruntime/core/providers/tvm/tvm_defaults.h +++ b/onnxruntime/core/providers/tvm/tvm_defaults.h @@ -11,7 +11,7 @@ constexpr const char* default_executor_type = "vm"; constexpr const char* vm_executor_type = "vm"; constexpr const char* graph_executor_type = "graph"; -constexpr const char* default_target_str = "cpu"; +constexpr const char* default_target_str = "llvm"; constexpr const char* llvm_target_str = "llvm"; constexpr const char* cpu_target_str = "cpu"; diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.cc b/onnxruntime/core/providers/tvm/tvm_ep_options.cc new file mode 100644 index 0000000000000..6e2a077835d64 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.cc @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
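+// Usage sketch (values are illustrative): TvmEPOptionsHelper::FromOptionsString below accepts a
+// comma-separated list of key:value pairs, e.g.
+//   "executor: vm, target: llvm, opt_level: 3, input_names: x y, input_shapes: [1 3 224 224] [1 10]"
+// input_names is space-separated, input_shapes carries one bracketed dimension group per input name, and
+// any key outside provider_option_names::valid_keys is rejected with ORT_NOT_IMPLEMENTED.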
+ +#include +#include + +#include "core/common/common.h" +#include "core/common/cpuid_info.h" +#include "core/framework/provider_options_utils.h" + +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +namespace provider_option_names { +constexpr const char* kExecutor = "executor"; +constexpr const char* kTarget = "target"; +constexpr const char* kTargetHost = "target_host"; +constexpr const char* kOptLevel = "opt_level"; +constexpr const char* kFreezeWeights = "freeze_weights"; +constexpr const char* kToNHWC = "to_nhwc"; +constexpr const char* kTuningFilePath = "tuning_file_path"; +constexpr const char* kTuningType = "tuning_type"; +constexpr const char* kInputNames = "input_names"; +constexpr const char* kInputShapes = "input_shapes"; + +static const std::unordered_set valid_keys { + std::string{kExecutor}, + std::string{kTarget}, + std::string{kTargetHost}, + std::string{kOptLevel}, + std::string{kFreezeWeights}, + std::string{kToNHWC}, + std::string{kTuningFilePath}, + std::string{kTuningType}, + std::string{kInputNames}, + std::string{kInputShapes} +}; + +} // namespace provider_option_names + +size_t split(const std::string &src, std::vector &dst, char ch) { + dst.clear(); + + size_t pos = src.find( ch ); + size_t initialPos = 0; + while( pos != std::string::npos ) { + dst.push_back( src.substr( initialPos, pos - initialPos ) ); + initialPos = pos + 1; + + pos = src.find( ch, initialPos ); + } + dst.push_back( src.substr( initialPos, std::min( pos, src.size() ) - initialPos + 1 ) ); + + return dst.size(); +} + +TvmEPOptions TvmEPOptionsHelper::FromOptionsString(const char* opt_str) { + std::string settings{opt_str}; + ProviderOptions options; + if (!settings.empty()) { + const std::string& str = settings; + + // tokenize settings + std::regex reg("\\s*,\\s*"); + std::sregex_token_iterator iter(str.begin(), str.end(), reg, -1); + std::sregex_token_iterator iter_end; + std::vector pairs(iter, iter_end); + + ORT_ENFORCE(pairs.size() > 0); + + for(const auto& pair : pairs) { + auto pos_colon = pair.find(':'); + ORT_ENFORCE(pos_colon != std::string::npos, "Invalid key value pair."); + std::string key = pair.substr(0, pos_colon); + std::string value = pair.substr(pos_colon + 1); + + // trim leading and trailing spaces from key/value + key = whitespace_trimming(key); + value = whitespace_trimming(value); + + // Check keys of obtained options + if (tvm::provider_option_names::valid_keys.count(key) == 0) { + ORT_NOT_IMPLEMENTED("TvmOptions: unknown option (", key, ")"); + } + + options[key] = value; + } + } + + return TvmEPOptionsHelper::FromProviderOptions(options); +} + +std::string TvmEPOptionsHelper::whitespace_trimming(const std::string& str) { + const std::string WHITESPACE = " \n\r\t\f\v"; + size_t start = str.find_first_not_of(WHITESPACE); + if (start == std::string::npos) { + return ""; + } else { + size_t end = str.find_last_not_of(WHITESPACE); + ORT_ENFORCE(end != std::string::npos); + return str.substr(start, end + 1); + } +} + +TvmEPOptions TvmEPOptionsHelper::FromProviderOptions(const ProviderOptions& pr_options) { + TvmEPOptions options{}; + + ORT_THROW_IF_ERROR( + ProviderOptionsParser{} + .AddAssignmentToReference(tvm::provider_option_names::kExecutor, options.executor) + .AddAssignmentToReference(tvm::provider_option_names::kTarget, options.target) + .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, options.target_host) + .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, options.opt_level) + 
.AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, options.freeze_weights) + .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, options.to_nhwc) + .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, options.tuning_file_path) + .AddAssignmentToReference(tvm::provider_option_names::kTuningType, options.tuning_type) + .AddAssignmentToReference(tvm::provider_option_names::kInputNames, options.input_names_str) + .AddAssignmentToReference(tvm::provider_option_names::kInputShapes, options.input_shapes_str) + .Parse(pr_options)); + + optionsPostprocess(options); + + return options; +} + +void TvmEPOptionsHelper::optionsPostprocess(TvmEPOptions& options) { + setInputShapes(options); + targetPostprocess(options.target); + targetHostPostprocess(options.target, options.target_host); + optLevelPostprocess(options.opt_level); +} + +bool TvmEPOptionsHelper::checkCPUTarget(const std::string& target) { + bool check = target.find("llvm") != std::string::npos; + return check; +} + +bool TvmEPOptionsHelper::checkGPUTarget(const std::string& target) { + bool check = ( + target.find("cuda") != std::string::npos || + target.find("opencl") != std::string::npos || + target.find("metal") != std::string::npos || + target.find("vulkan") != std::string::npos + ); + return check; +} + +void TvmEPOptionsHelper::setInputShapes(TvmEPOptions& options) { + if (options.input_names_str.empty() && options.input_shapes_str.empty()) + return; + ORT_ENFORCE(!options.input_names_str.empty() && !options.input_shapes_str.empty(), + "Both provider options \"input_names\" and \"input_shapes\" should be empty or full"); + + std::vector name_set; + std::string trimmed_names = whitespace_trimming(options.input_names_str); + size_t inp_tensors_num = split(trimmed_names, name_set, ' '); + ORT_ENFORCE(inp_tensors_num, "There is no any input tensor names!"); + + std::string trimmed_shapes = whitespace_trimming(options.input_shapes_str); + size_t end_pos = trimmed_shapes.find_last_of(']'); + ORT_ENFORCE(end_pos != std::string::npos, "Invalid string for input shapes. Symbol ] is not found"); + ORT_ENFORCE(end_pos == (trimmed_shapes.size() - 1), + "Invalid string for input shapes. 
Symbol ] should be last after whitespace trimming"); + + std::vector shape_set; + split(trimmed_shapes, shape_set, ']'); + shape_set.pop_back(); + ORT_ENFORCE( shape_set.size() == inp_tensors_num, + "Number of shapes is not the same as number of input tensor names"); + + for (size_t i = 0; i < inp_tensors_num; ++i) { + size_t pos = shape_set[i].find('['); + ORT_ENFORCE(pos != std::string::npos, "There is no symbol [ as pair for ]"); + std::string numbers = shape_set[i].substr(pos + 1); + std::vector number_set; + ORT_ENFORCE(split(numbers, number_set, ' '), "There is no any number between [ and ] symbols"); + + TensorShapeVector dims; + for(const auto& number : number_set) { + dims.push_back(std::stoi(number)); + } + + options.input_shapes[name_set[i]] = dims; + } +} + +void TvmEPOptionsHelper::targetPostprocess(std::string& target) { + if(target == tvm::cpu_target_str || + target == tvm::llvm_target_str) { + ProcessCPUTarget(target); + } else if(target == tvm::gpu_target_str) { + ProcessGPUTarget(); + } else if(target.empty()) { + ORT_NOT_IMPLEMENTED("target option is empty!"); + } else { + // TODO(vvchernov): extend mechanism of auto-definition of target + // target is gotten from option set up by client + } +} + +void TvmEPOptionsHelper::ProcessCPUTarget(std::string& target) { + const auto& cpu_id_info = CPUIDInfo::GetCPUIDInfo(); + // auto detect from CPU ID + if (cpu_id_info.HasAVX512Skylake()) { + target = tvm::cpu_targets::LLVM_TARGET_SKYLAKE_AVX512; + } else if (cpu_id_info.HasAVX512f()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX512; + } else if (cpu_id_info.HasAVX2()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX2; + } else if (cpu_id_info.HasAVX()) { + target = tvm::cpu_targets::LLVM_TARGET_AVX; + } else { + // TODO(vvchernov): extend mechanism of auto-definition of cpu target + target = tvm::llvm_target_str; + } +} + +void TvmEPOptionsHelper::ProcessGPUTarget() { + ORT_NOT_IMPLEMENTED("GPU target auto-defenition is not implemented now!"); +} + +void TvmEPOptionsHelper::targetHostPostprocess(const std::string& target, std::string& target_host) { + if((target_host == tvm::cpu_target_str || + target_host == tvm::llvm_target_str) && + target_host != target) { + target_host = target; + } else if (target_host.empty()) { + target_host = target; + } else { + // TODO(vvchernov): extend mechanism of auto-definition of target host + // target host is gotten from option set up by client + } +} + +void TvmEPOptionsHelper::optLevelPostprocess(unsigned int& opt_level) { + if(opt_level < 1) { + opt_level = tvm::default_opt_level; + } +} + +std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options) { + out << "TVM EP options:\n" << + "executor type: " << options.executor << "\n" << + "target: " << options.target << "\n" << + "target_host: " << options.target_host << "\n" << + "opt level: " << options.opt_level << "\n" << + "freeze weights: " << options.freeze_weights << "\n" << + "tuning file path: " << options.tuning_file_path << "\n" << + "tuning type: " << options.tuning_type << "\n" << + "convert layout to NHWC: " << options.to_nhwc << "\n" << + "input tensor names: " << options.input_names_str << "\n" << + "input tensor shapes: " << options.input_shapes_str; + return out; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.h b/onnxruntime/core/providers/tvm/tvm_ep_options.h similarity index 52% rename from onnxruntime/core/providers/tvm/tvm_execution_provider_info.h rename to 
onnxruntime/core/providers/tvm/tvm_ep_options.h index b3c8932ab686f..7918b37a6bca4 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.h +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.h @@ -1,17 +1,20 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#ifndef TVM_EXECUTION_PROVIDER_INFO_H -#define TVM_EXECUTION_PROVIDER_INFO_H +#ifndef TVM_EXECUTION_PROVIDER_OPTIONS_H +#define TVM_EXECUTION_PROVIDER_OPTIONS_H #include #include #include +#include #include "core/framework/provider_options.h" +#include "core/framework/tensor_shape.h" #include "tvm_defaults.h" + namespace onnxruntime { namespace tvm { @@ -22,12 +25,13 @@ const std::string LLVM_TARGET_AVX2 = "llvm -mcpu=core-avx2"; const std::string LLVM_TARGET_SKYLAKE_AVX512 = "llvm -mcpu=skylake-avx512"; const std::string LLVM_TARGET_AVX512 = "llvm -mcpu=skylake-avx512"; } // namespace cpu_targets -} // namespace tvm -using TVMInputShapes = std::unordered_map>; +using TVMTensorShapes = std::vector; +using TVMInputShapes = std::unordered_map; +using InputsInfoMap = std::unordered_map; // Information needed to construct an TVM execution provider. -struct TvmExecutionProviderInfo { +struct TvmEPOptions { std::string executor{tvm::default_executor_type}; std::string target{tvm::default_target_str}; std::string target_host{tvm::default_target_str}; @@ -39,12 +43,31 @@ struct TvmExecutionProviderInfo { std::string input_names_str{""}; std::string input_shapes_str{""}; TVMInputShapes input_shapes{}; + TVMTensorShapes output_shapes{}; +}; + +std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options); +class TvmEPOptionsHelper { +public: + static TvmEPOptions FromOptionsString(const char* options); + static TvmEPOptions FromProviderOptions(const ProviderOptions& options); static std::string whitespace_trimming(const std::string& str); - static TvmExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); - static TvmExecutionProviderInfo FromOptionsString(const char* options); + + static bool checkCPUTarget(const std::string& target); + static bool checkGPUTarget(const std::string& target); + +private: + static void optionsPostprocess(TvmEPOptions& options); + static void setInputShapes(TvmEPOptions& options); + static void targetPostprocess(std::string& target); + static void ProcessCPUTarget(std::string& target); + static void ProcessGPUTarget(); + static void targetHostPostprocess(const std::string& target, std::string& target_host); + static void optLevelPostprocess(unsigned int& opt_level); }; +} // namespace tvm } // namespace onnxruntime -#endif // TVM_EXECUTION_PROVIDER_INFO_H +#endif // TVM_EXECUTION_PROVIDER_OPTIONS_H diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc b/onnxruntime/core/providers/tvm/tvm_execution_provider.cc index cafd1561f0a5f..12eae2262c435 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc +++ b/onnxruntime/core/providers/tvm/tvm_execution_provider.cc @@ -10,7 +10,6 @@ #include "core/framework/compute_capability.h" #include "core/platform/env.h" #include "core/graph/model.h" -#include "core/common/cpuid_info.h" #include "tvm_execution_provider.h" #include "xpu_data_transfer.h" @@ -22,239 +21,19 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { +namespace tvm { // Information to construct kernel function state. 
struct TVMFuncState { AllocateFunc allocate_func = nullptr; DestroyFunc release_func = nullptr; AllocatorHandle allocator = nullptr; - TvmModule* module = nullptr; - std::function>& input_shapes)> compiler = nullptr; + std::shared_ptr compiler = nullptr; }; -class TVMRunner { - public: - using TVMTensorShape = std::vector; - using TVMTensorShapes = std::vector; - using InputsInfoMap = std::map; - using ORTGraphNodes = std::vector; - - TVMRunner() = delete; - ~TVMRunner() = default; - - TVMRunner(TvmExecutionProvider* ep, - const std::string& name, - const Graph& graph) : - use_vm_(ep->info_.executor == "vm") { - // Extract input shapes - const ORTGraphNodes& all_nodes = graph.GetInputsIncludingInitializers(); - TVMTensorShapes input_shapes; - size_t indx = 0; - if (ep->info_.freeze_weights) { - for (const auto* node : all_nodes) { - const auto& node_name = node->Name(); - if(!graph.IsInitializedTensor(node_name)) { - TVMTensorShape ishape; - if(!ep->info_.input_shapes.empty() && - ep->info_.input_shapes.count(node_name)) { - ishape = ep->info_.input_shapes[node_name]; - inputs_info_[indx] = ishape; - update_output_shapes_ = true; - } else { - getTensorInfo(*node->Shape(), ishape, indx); - } - input_shapes.emplace_back(ishape); - } - ++indx; - } - } else { - for (const auto* node : all_nodes) { - const auto& node_name = node->Name(); - TVMTensorShape ishape; - if(!ep->info_.input_shapes.empty() && - ep->info_.input_shapes.count(node_name)) { - ishape = ep->info_.input_shapes[node_name]; - inputs_info_[indx++] = ishape; - update_output_shapes_ = true; - } else { - getTensorInfo(*node->Shape(), ishape, indx++); - } - if(!graph.IsInitializedTensor(node_name)) { - input_shapes.emplace_back(ishape); - } - } - } - - // Get module from tvm - mod_ = ep->CompileFunc(name, input_shapes); - - // Prepare draft for output tvm tensors - const ORTGraphNodes& ort_outputs_info = graph.GetOutputs(); - size_t num_outputs = ort_outputs_info.size(); - - if (update_output_shapes_) { - if (!use_vm_) { - tvm::TVMGetOutputShapes(*mod_, num_outputs, output_shapes_); - } - } else { - for (auto i = 0u; i < num_outputs; i++) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(*ort_outputs_info[i]->Shape()); - int dims = ort_shape.NumDimensions(); - - TVMTensorShape oshape(dims); - for (int j = 0; j < dims; ++j) { - oshape[j] = int64_t(ort_shape[j]); - } - output_shapes_.emplace_back(oshape); - } - } - - for (auto i = 0u; i < num_outputs; i++) { - DLTensor t; - // Draft for tensor, correct data is defined during inference - t.strides = nullptr; - t.byte_offset = 0; - t.data = nullptr; - if (!(use_vm_ && update_output_shapes_)) { - t.ndim = output_shapes_[i].size(); - t.shape = output_shapes_[i].data(); - } else { - t.ndim = 0; - t.shape = nullptr; - } - - tensors_outputs_.push_back(t); - } - } - - common::Status operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { - Ort::CustomOpApi ort{*api}; - - size_t num = inputs_info_.size(); - std::vector inds(num); - std::vector dl_tensors_inputs(num); - size_t counter = 0u; - for (auto& info : inputs_info_) { - // TODO(vvchernov): decomposition declaration only available with -std=c++1z or -std=gnu++1z - auto& i = info.first; - auto& shape = info.second; - const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i); - ORT_ENFORCE(input_tensor->IsTensor()); - const Tensor& tensor = input_tensor->Get(); - const OrtDevice& device = tensor.Location().device; - auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); - 
auto tensor_type = ort.GetTensorElementType(tensor_info); - if (!update_output_shapes_) { - std::vector ort_shape = ort.GetTensorShape(tensor_info); - ORT_ENFORCE(compare_shapes(shape, ort_shape)); - } - ort.ReleaseTensorTypeAndShapeInfo(tensor_info); - - DLTensor t; - t.device = GetDLDevice(device); - t.dtype = GetDataType(tensor_type); - t.strides = nullptr; - t.byte_offset = 0; - t.data = const_cast(ort.GetTensorData(input_tensor)); - t.ndim = shape.size(); - t.shape = shape.data(); - dl_tensors_inputs[counter] = t; - inds[counter++] = i; - } - if (use_vm_) { - tvm::TVM_VM_SetInputs(*mod_, inds, dl_tensors_inputs); - // Infer once for calculating of output shapes - if(!probe_infer_) { - tvm::TVM_VM_Run(*mod_); - size_t num_outputs = tensors_outputs_.size(); - tvm::TVMGetOutputShapes(*mod_, num_outputs, output_shapes_); - for (size_t i = 0; i < num_outputs; ++i) { - tensors_outputs_[i].ndim = output_shapes_[i].size(); - tensors_outputs_[i].shape = output_shapes_[i].data(); - } - probe_infer_ = true; - } - } else { - tvm::TVMSetInputs(*mod_, inds, dl_tensors_inputs); - } - - size_t num_outputs = tensors_outputs_.size(); - for (auto i = 0u; i < num_outputs; i++) { - //setup output tensor property - OrtValue* output_tensor = ort.KernelContext_GetOutput(context, - i, - output_shapes_[i].data(), - output_shapes_[i].size()); - ORT_ENFORCE(output_tensor->IsTensor()); - const Tensor& tensor = output_tensor->Get(); - const OrtDevice& device = tensor.Location().device; - auto tensor_info = ort.GetTensorTypeAndShape(output_tensor); - auto tensor_type = ort.GetTensorElementType(tensor_info); - ort.ReleaseTensorTypeAndShapeInfo(tensor_info); - - tensors_outputs_[i].device = GetDLDevice(device); - tensors_outputs_[i].dtype = GetDataType(tensor_type); - tensors_outputs_[i].data = ort.GetTensorMutableData(output_tensor); - } - - if (use_vm_) { - tvm::TVM_VM_Run(*mod_); - tvm::TVM_VM_GetOutputs(*mod_, tensors_outputs_); - } else { - tvm::TVMRun(*mod_); - tvm::TVMGetOutputs(*mod_, tensors_outputs_); - } - - return Status::OK(); - } - private: - void getTensorInfo(const TensorShapeProto& shape_proto, - TVMTensorShape& ishape, - size_t indx) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); - int dims = ort_shape.NumDimensions(); - - ishape.resize(dims); - for (int j = 0; j < dims; ++j) { - int64_t dim = int64_t(ort_shape[j]); - ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). 
" + - "Please use provider options to setup input_names and input_shapes"); - ishape[j] = dim; - } - inputs_info_[indx] = ishape; - } - - bool compare_shapes(const TVMTensorShape& shape1, const TVMTensorShape& shape2) { - size_t size = shape1.size(); - if (shape2.size() == size) { - for (size_t i = 0; i < size; ++i) { - if(shape1[i] != shape2[i]) { - return false; - } - } - } else { - return false; - } - - return true; - } - - private: - TvmModule* mod_; - bool use_vm_ = true; - bool probe_infer_ = false; - InputsInfoMap inputs_info_{}; - bool update_output_shapes_ = false; - TVMTensorShapes output_shapes_; - std::vector tensors_outputs_; -}; - -TvmExecutionProvider::TvmExecutionProvider(const TvmExecutionProviderInfo& info) +TvmExecutionProvider::TvmExecutionProvider(const TvmEPOptions& options) : IExecutionProvider{kTvmExecutionProvider}, - info_{info} { - ProcessInfo(); - + options_{options} { AllocatorCreationInfo default_memory_info = {[](int) { return std::make_unique(); }, @@ -273,10 +52,6 @@ TvmExecutionProvider::TvmExecutionProvider(const TvmExecutionProviderInfo& info) TvmExecutionProvider::~TvmExecutionProvider() {} -AllocatorPtr TvmExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { - return allocator_; -} - std::vector> TvmExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const std::vector& /*kernel_registries*/) const { @@ -327,8 +102,8 @@ TvmExecutionProvider::GetCapability(const GraphViewer& graph_viewer, } common::Status TvmExecutionProvider::Compile(const std::vector& nodes, - std::vector& node_compute_funcs) { - PrintProviderOptions(); + std::vector& node_compute_funcs) { + printOptions(); for (auto* fused_node : nodes) { auto func_body = fused_node->GetFunctionBody(); if (!func_body) @@ -345,32 +120,28 @@ common::Status TvmExecutionProvider::Compile(const std::vector& nodes, opset->set_domain(kOnnxDomain); opset->set_version(node_graph.DomainToVersionMap().at(kOnnxDomain)); - std::string string_buf; - model_proto.SerializeToString(&string_buf); - buffers_[func_name] = string_buf; - opsets_[func_name] = int(opset->version()); - model_paths_[func_name] = fused_node->ModelPath().ToPathString();; + std::string onnx_model_str; + model_proto.SerializeToString(&onnx_model_str); + compilers_[func_name] = std::make_shared(std::move(onnx_model_str), + fused_node->ModelPath().ToPathString(), + int(opset->version())); + InputsInfoMap all_input_shapes; + auto mod = compileModel(func_name, node_graph, all_input_shapes); + + std::vector output_tensors; + prepareOutputTensors(mod, output_tensors, node_graph.GetOutputs().size()); + + runners_[func_name] = std::make_shared(options_, mod, all_input_shapes, output_tensors); if (dump_subgraphs_) { - std::fstream dump("/tmp/" + fused_node->Name() + ".onnx", + std::fstream dump("/tmp/" + func_name + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary); model_proto.SerializeToOstream(&dump); } - NodeComputeInfo compute_info; - compute_info.create_state_func = std::bind(&TvmExecutionProvider::CreateStateFunc, - this, - std::placeholders::_1, - std::placeholders::_2); - - compute_info.release_state_func = [](FunctionState state) { - if (state) - delete static_cast(state); - }; // TODO(vvchernov): implement ops checking and mechanism of gracefully passing the responsibility to other EPs // if the checking fails due to unsupported op(s) - runners_[func_name] = std::make_shared(this, func_name, node_graph); - compute_info.compute_func = *runners_[func_name].get(); + NodeComputeInfo compute_info = 
prepareComputeInfo(func_name); node_compute_funcs.push_back(compute_info); } @@ -378,182 +149,156 @@ common::Status TvmExecutionProvider::Compile(const std::vector& nodes, } std::unique_ptr TvmExecutionProvider::GetDataTransfer() const { - if (GPUTargetCheck()) { - return std::make_unique(); - } else if (info_.target.find("llvm") != std::string::npos) { - return std::make_unique(); + //TODO(vvchernov): target or target host? + if (TvmEPOptionsHelper::checkGPUTarget(options_.target)) { + return std::make_unique(); + } else if (TvmEPOptionsHelper::checkCPUTarget(options_.target)) { + return std::make_unique(); } else { - ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", info_.target); + ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", options_.target); } } -bool TvmExecutionProvider::GPUTargetCheck() const { - //TODO(vvchernov): target or target host? - bool check = ( - info_.target.find("cuda") != std::string::npos || - info_.target.find("opencl") != std::string::npos || - info_.target.find("metal") != std::string::npos || - info_.target.find("vulkan") != std::string::npos - ); - return check; +AllocatorPtr TvmExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { + return allocator_; } -size_t TvmExecutionProvider::split(const std::string &txt, std::vector &strs, char ch) const { - size_t pos = txt.find( ch ); - size_t initialPos = 0; - strs.clear(); +void TvmExecutionProvider::printOptions() { + LOGS(*GetLogger(), INFO) << options_; +} - while( pos != std::string::npos ) { - strs.push_back( txt.substr( initialPos, pos - initialPos ) ); - initialPos = pos + 1; +std::shared_ptr TvmExecutionProvider::compileModel(const std::string& func_name, + const Graph& graph, + InputsInfoMap& all_input_shapes) { + all_input_shapes.clear(); - pos = txt.find( ch, initialPos ); - } + TVMTensorShapes input_shapes; + if (options_.freeze_weights) { + setInputShapesForFreezedNN(graph, input_shapes, all_input_shapes); + } else { + setInputShapesForUnfreezedNN(graph, input_shapes, all_input_shapes); + } - strs.push_back( txt.substr( initialPos, std::min( pos, txt.size() ) - initialPos + 1 ) ); + std::shared_ptr mod = compilers_[func_name]->operator()(options_, input_shapes); - return strs.size(); + return mod; } -void TvmExecutionProvider::ProcessInfo() { - if(!info_.input_shapes_str.empty()) { - ORT_ENFORCE(!info_.input_names_str.empty(), - "Please insert input tensor names. Input shapes only is invalid case"); - // Parse strings and set to input_shapes map - std::vector tmp_strs; - std::vector names_strs; - - std::string names_str = TvmExecutionProviderInfo::whitespace_trimming(info_.input_names_str); - std::string shapes_str = TvmExecutionProviderInfo::whitespace_trimming(info_.input_shapes_str); - - ORT_ENFORCE(split(names_str, names_strs, ' '), "There is no any input tensor names!"); - size_t inp_tensors_num = names_strs.size(); - - size_t end_pos = shapes_str.find_last_of(']'); - ORT_ENFORCE(end_pos != std::string::npos, "Invalid string for input shapes. Symbol ] is not found"); - ORT_ENFORCE(end_pos == (shapes_str.size() - 1), - "Invalid string for input shapes. 
Symbol ] should be last after whitespace trimming"); - split(shapes_str, tmp_strs, ']'); - tmp_strs.pop_back(); - ORT_ENFORCE( tmp_strs.size() == inp_tensors_num, - "Number of shapes is not the same as number of input tensor names"); - for (size_t i = 0; i < inp_tensors_num; ++i) { - size_t pos = tmp_strs[i].find('['); - ORT_ENFORCE(pos != std::string::npos, "There is no symbol [ as pair for ]"); - std::string nums_str = tmp_strs[i].substr(pos + 1); - std::vector nums_strs; - ORT_ENFORCE(split(nums_str, nums_strs, ' '), "There is no any numbers between [ and ] symbols"); - std::vector dims; - for(const auto& num_str : nums_strs) { - dims.push_back(std::stoi(num_str)); - } - - info_.input_shapes[names_strs[i]] = dims; +void TvmExecutionProvider::setInputShapesForFreezedNN(const Graph& graph, + TVMTensorShapes& input_shapes, + InputsInfoMap& all_input_shapes) { + const std::vector& all_nodes = graph.GetInputsIncludingInitializers(); + + size_t indx = 0; + for (const auto* node : all_nodes) { + if(!graph.IsInitializedTensor(node->Name())) { + TensorShapeVector shape = getInputShape(node); + all_input_shapes[indx++] = shape; + input_shapes.emplace_back(shape); } } +} - if(info_.target == tvm::cpu_target_str || - info_.target == tvm::llvm_target_str) { - ProcessCPUTarget(); - } else if(info_.target == tvm::gpu_target_str) { - ProcessGPUTarget(); - } else if(info_.target.empty()) { - ORT_NOT_IMPLEMENTED("target option is empty!"); - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target - // target is gotten from option set up by client +void TvmExecutionProvider::setInputShapesForUnfreezedNN(const Graph& graph, + TVMTensorShapes& input_shapes, + InputsInfoMap& all_input_shapes) { + const std::vector& all_nodes = graph.GetInputsIncludingInitializers(); + + size_t indx = 0; + for (const auto* node : all_nodes) { + TensorShapeVector shape = getInputShape(node); + all_input_shapes[indx++] = shape; + if(!graph.IsInitializedTensor(node->Name())) { + input_shapes.emplace_back(shape); + } } +} - if((info_.target_host == tvm::cpu_target_str || - info_.target_host == tvm::llvm_target_str) && - info_.target_host != info_.target) { - info_.target_host = info_.target; - } else if (info_.target_host.empty()) { - info_.target_host = info_.target; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target host - // target host is gotten from option set up by client - } +TensorShapeVector TvmExecutionProvider::getInputShape(const NodeArg* node) { + TensorShapeVector shape; + const auto& node_name = node->Name(); + if(!options_.input_shapes.empty() && + options_.input_shapes.count(node_name)) { + shape = options_.input_shapes[node_name]; + } else { + shape = convertTensorShape(*node->Shape()); + } - if(info_.opt_level < 1) { - info_.opt_level = tvm::default_opt_level; - } + return shape; } -void TvmExecutionProvider::ProcessCPUTarget() { - const auto& cpu_id_info = CPUIDInfo::GetCPUIDInfo(); - // auto detect from CPU ID - if (cpu_id_info.HasAVX512Skylake()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_SKYLAKE_AVX512; - } else if (cpu_id_info.HasAVX512f()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX512; - } else if (cpu_id_info.HasAVX2()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX2; - } else if (cpu_id_info.HasAVX()) { - info_.target = tvm::cpu_targets::LLVM_TARGET_AVX; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of cpu target - info_.target = tvm::llvm_target_str; +TensorShapeVector 
TvmExecutionProvider::convertTensorShape(const TensorShapeProto& shape_proto) { + TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); + size_t dims = ort_shape.NumDimensions(); + + TensorShapeVector shape(dims); + for (size_t j = 0; j < dims; ++j) { + int64_t dim = int64_t(ort_shape[j]); + ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). " + + "Please use provider options to setup input_names and input_shapes"); + shape[j] = dim; } + + return shape; } -void TvmExecutionProvider::ProcessGPUTarget() { - ORT_NOT_IMPLEMENTED("GPU target auto-defenition is not implemented now!"); +void TvmExecutionProvider::prepareOutputTensors(const std::shared_ptr& mod, + std::vector& output_tensors, + size_t num) { + ORT_ENFORCE(mod != nullptr, "TVM module is not compiled"); + output_tensors.clear(); + options_.output_shapes.clear(); + options_.output_shapes.resize(num); + + if (options_.executor != "vm") { + tvm::TVMGetOutputShapes(*mod, options_.output_shapes); + } + + for (auto& output_shape : options_.output_shapes) { + DLTensor t; + // Draft for tensor, correct data is defined during inference + t.strides = nullptr; + t.byte_offset = 0; + t.data = nullptr; + if (options_.executor == "vm") { + t.ndim = 0; + t.shape = nullptr; + } else { + t.ndim = output_shape.size(); + t.shape = output_shape.data(); + } + + output_tensors.push_back(t); + } } -void TvmExecutionProvider::PrintProviderOptions() const { - LOGS(*GetLogger(), INFO) << "TVM EP options:\n" << - "executor type: " << info_.executor << "\n" << - "target: " << info_.target << "\n" << - "target_host: " << info_.target_host << "\n" << - "opt level: " << info_.opt_level << "\n" << - "freeze weights: " << info_.freeze_weights << "\n" << - "tuning file path: " << info_.tuning_file_path << "\n" << - "tuning type: " << info_.tuning_type << "\n" << - "convert layout to NHWC: " << info_.to_nhwc << "\n" << - "input tensor names: " << info_.input_names_str << "\n" << - "input tensor shapes: " << info_.input_shapes_str; +NodeComputeInfo TvmExecutionProvider::prepareComputeInfo(const std::string& func_name) { + NodeComputeInfo compute_info; + compute_info.create_state_func = std::bind(&TvmExecutionProvider::createStateFunc, + this, + std::placeholders::_1, + std::placeholders::_2); + + compute_info.release_state_func = [](FunctionState state) { + if (state) + delete static_cast(state); + }; + + compute_info.compute_func = *runners_[func_name].get(); + + return compute_info; } -int TvmExecutionProvider::CreateStateFunc(ComputeContext* context, FunctionState* state) { +int TvmExecutionProvider::createStateFunc(ComputeContext* context, FunctionState* state) { auto* state_ptr = new TVMFuncState(); *state_ptr = {context->allocate_func, - context->release_func, - context->allocator_handle, - nullptr, - std::bind(&TvmExecutionProvider::CompileFunc, - this, - std::placeholders::_1, - std::placeholders::_2)}; + context->release_func, + context->allocator_handle, + compilers_[context->node_name]}; + // TODO(vvchernov): Who and when release state? 
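convertTensorShape above refuses symbolic or unknown dimensions and points users at the input_names/input_shapes provider options instead. A rough pre-flight version of the same check can be done from Python with the onnx package; the helper name below is illustrative and not part of the patch.

import onnx

def find_dynamic_inputs(model_path):
    """Return graph inputs whose dimensions are not known positive integers."""
    model = onnx.load(model_path)
    initializers = {init.name for init in model.graph.initializer}
    dynamic = {}
    for inp in model.graph.input:
        if inp.name in initializers:
            continue  # frozen weights are not runtime inputs
        dims = [d.dim_value if d.dim_value > 0 else (d.dim_param or None)
                for d in inp.type.tensor_type.shape.dim]
        if not all(isinstance(d, int) for d in dims):
            dynamic[inp.name] = dims
    return dynamic

# Inputs reported here would need explicit input_names/input_shapes
# provider options before the TVM EP can compile the model, e.g.:
#   print(find_dynamic_inputs("model.onnx"))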
*state = state_ptr; return 0; } -TvmModule* TvmExecutionProvider::CompileFunc(std::string func_name, - const TVMTensorShapes& input_shapes) { - if (modules_.count(func_name)) { - return modules_[func_name].get(); - } - - TvmModule mod_f = tvm::TVMCompile(buffers_[func_name], - model_paths_[func_name], - info_.executor, - info_.target, - info_.target_host, - info_.opt_level, - opsets_[func_name], - info_.freeze_weights, - input_shapes, - info_.to_nhwc, - info_.tuning_file_path, - info_.tuning_type); - auto module_ptr = std::make_shared(); - *module_ptr = mod_f; - modules_[func_name] = module_ptr; - // Release memory after module generation - buffers_.erase(func_name); - opsets_.erase(func_name); - return modules_[func_name].get(); -} - +} // namespace tvm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_execution_provider.h index 6a5d2a2b4c6ac..9d891ee292976 100644 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.h +++ b/onnxruntime/core/providers/tvm/tvm_execution_provider.h @@ -13,28 +13,27 @@ #include "core/framework/execution_provider.h" #include "core/platform/ort_mutex.h" -#include "tvm_common.h" -#include "tvm_execution_provider_info.h" +#include "tvm_compiler.h" +#include "tvm_runner.h" -namespace onnxruntime { +namespace onnxruntime { + class Graph; + class NodeArg; namespace tvm { + namespace env_vars { static const std::string kDumpSubgraphs = "ORT_TVM_DUMP_SUBGRAPHS"; } // namespace env_vars -} // namespace tvm - -class TVMRunner; class TvmExecutionProvider : public IExecutionProvider { - friend TVMRunner; + using Compiler = tvm::TVMCompiler; + using Compilers = std::unordered_map>; + using Runner = tvm::TVMRunner; + using Runners = std::unordered_map>; - using TVMTensorShape = std::vector; - using TVMTensorShapes = std::vector; - using TVMRunners = std::unordered_map>; - using TVMModules = std::unordered_map>; public: - explicit TvmExecutionProvider(const TvmExecutionProviderInfo& info); + explicit TvmExecutionProvider(const TvmEPOptions& options); virtual ~TvmExecutionProvider(); std::vector> @@ -47,27 +46,27 @@ class TvmExecutionProvider : public IExecutionProvider { AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const override; private: - bool GPUTargetCheck() const; - size_t split(const std::string &txt, std::vector &strs, char ch) const; - void ProcessInfo(); - void ProcessCPUTarget(); - void ProcessGPUTarget(); - void PrintProviderOptions() const; - // Bindings for compute info - int CreateStateFunc(ComputeContext*, FunctionState*); - TvmModule* CompileFunc(std::string func_name, const TVMTensorShapes& input_shapes); + void printOptions(); + std::shared_ptr compileModel(const std::string& func_name, + const Graph& graph, + InputsInfoMap& inputs_info); + void setInputShapesForFreezedNN(const Graph& graph, TVMTensorShapes& input_shapes, InputsInfoMap& all_input_shapes); + void setInputShapesForUnfreezedNN(const Graph& graph, TVMTensorShapes& input_shapes, InputsInfoMap& all_input_shapes); + TensorShapeVector getInputShape(const NodeArg* node); + TensorShapeVector convertTensorShape(const ONNX_NAMESPACE::TensorShapeProto& shape_proto); + void prepareOutputTensors(const std::shared_ptr& mod, std::vector& output_tensors, size_t num); + NodeComputeInfo prepareComputeInfo(const std::string& func_name); + int createStateFunc(ComputeContext*, FunctionState*); private: - TVMRunners runners_; - std::unordered_map buffers_; - std::unordered_map opsets_; - std::unordered_map 
model_paths_; + TvmEPOptions options_; + Compilers compilers_; + Runners runners_; bool dump_subgraphs_ = false; OrtMutex tvm_mu_; AllocatorPtr allocator_; - TvmExecutionProviderInfo info_; - TVMModules modules_; }; +} // namespace tvm } // namespace onnxruntime #endif // TVM_EXECUTION_PROVIDER_H diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc b/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc deleted file mode 100644 index 2bb2d1d6923e3..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider_info.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include - -#include "core/common/common.h" -#include "core/framework/provider_options_utils.h" - -#include "tvm_execution_provider_info.h" - - -namespace onnxruntime { -namespace tvm { -namespace provider_option_names { -constexpr const char* kExecutor = "executor"; -constexpr const char* kTarget = "target"; -constexpr const char* kTargetHost = "target_host"; -constexpr const char* kOptLevel = "opt_level"; -constexpr const char* kFreezeWeights = "freeze_weights"; -constexpr const char* kToNHWC = "to_nhwc"; -constexpr const char* kTuningFilePath = "tuning_file_path"; -constexpr const char* kTuningType = "tuning_type"; -constexpr const char* kInputNames = "input_names"; -constexpr const char* kInputShapes = "input_shapes"; - -static const std::unordered_set valid_keys { - std::string{kExecutor}, - std::string{kTarget}, - std::string{kTargetHost}, - std::string{kOptLevel}, - std::string{kFreezeWeights}, - std::string{kToNHWC}, - std::string{kTuningFilePath}, - std::string{kTuningType}, - std::string{kInputNames}, - std::string{kInputShapes} -}; - -} // namespace provider_option_names -} // namespace tvm - -std::string TvmExecutionProviderInfo::whitespace_trimming(const std::string& str) { - const std::string WHITESPACE = " \n\r\t\f\v"; - size_t start = str.find_first_not_of(WHITESPACE); - if (start == std::string::npos) { - return ""; - } else { - size_t end = str.find_last_not_of(WHITESPACE); - ORT_ENFORCE(end != std::string::npos); - return str.substr(start, end + 1); - } -} - -TvmExecutionProviderInfo TvmExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { - TvmExecutionProviderInfo info{}; - - ORT_THROW_IF_ERROR( - ProviderOptionsParser{} - .AddAssignmentToReference(tvm::provider_option_names::kExecutor, info.executor) - .AddAssignmentToReference(tvm::provider_option_names::kTarget, info.target) - .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, info.target_host) - .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, info.opt_level) - .AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, info.freeze_weights) - .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, info.to_nhwc) - .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, info.tuning_file_path) - .AddAssignmentToReference(tvm::provider_option_names::kTuningType, info.tuning_type) - .AddAssignmentToReference(tvm::provider_option_names::kInputNames, info.input_names_str) - .AddAssignmentToReference(tvm::provider_option_names::kInputShapes, info.input_shapes_str) - .Parse(options)); - - return info; -} - -TvmExecutionProviderInfo TvmExecutionProviderInfo::FromOptionsString(const char* opt_str) { - std::string settings{opt_str}; - ProviderOptions options; - if (!settings.empty()) { - const std::string& str = settings; - - // tokenize settings 
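The option names wired up through AddAssignmentToReference above form the complete public surface of the TVM EP options, and anything outside that set is rejected with ORT_NOT_IMPLEMENTED. When assembling provider options by hand it can help to validate keys the same way; a small Python sketch, with the key list copied from the provider sources and the helper name purely illustrative:

# Option keys recognized by the TVM EP parser (copied from the provider sources).
VALID_TVM_EP_KEYS = {
    "executor", "target", "target_host", "opt_level", "freeze_weights",
    "to_nhwc", "tuning_file_path", "tuning_type", "input_names", "input_shapes",
}

def validate_tvm_options(options):
    unknown = set(options) - VALID_TVM_EP_KEYS
    if unknown:
        raise ValueError(f"Unknown TVM EP option(s): {sorted(unknown)}")
    return options

validate_tvm_options({"executor": "vm", "target": "llvm", "opt_level": "3"})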
- std::regex reg("\\s*,\\s*"); - std::sregex_token_iterator iter(str.begin(), str.end(), reg, -1); - std::sregex_token_iterator iter_end; - std::vector pairs(iter, iter_end); - - ORT_ENFORCE(pairs.size() > 0); - - for(const auto& pair : pairs) { - auto pos_colon = pair.find(':'); - ORT_ENFORCE(pos_colon != std::string::npos, "Invalid key value pair."); - std::string key = pair.substr(0, pos_colon); - std::string value = pair.substr(pos_colon + 1); - - // trim leading and trailing spaces from key/value - key = whitespace_trimming(key); - value = whitespace_trimming(value); - - // Check keys of obtained options - if (tvm::provider_option_names::valid_keys.count(key) == 0) { - ORT_NOT_IMPLEMENTED("TvmOptions: unknown option (", key, ")"); - } - - options[key] = value; - } - } - - return FromProviderOptions(options); -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc b/onnxruntime/core/providers/tvm/tvm_provider_factory.cc index b63077e3b311c..bcfeb637bd461 100644 --- a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc +++ b/onnxruntime/core/providers/tvm/tvm_provider_factory.cc @@ -13,32 +13,31 @@ namespace onnxruntime { struct TvmProviderFactory : IExecutionProviderFactory { - TvmProviderFactory(const TvmExecutionProviderInfo& info) : info_{info} {} + TvmProviderFactory(const tvm::TvmEPOptions& options) : options_{options} {} ~TvmProviderFactory() = default; std::unique_ptr CreateProvider() override { - return std::make_unique(info_); + return std::make_unique(options_); } - private: - TvmExecutionProviderInfo info_; +private: + tvm::TvmEPOptions options_; }; -std::shared_ptr CreateExecutionProviderFactory_Tvm(const char* settings) { - TvmExecutionProviderInfo info = TvmExecutionProviderInfo::FromOptionsString(settings); - return std::make_shared(info); +std::shared_ptr CreateExecutionProviderFactory_Tvm(const char* opt_str) { + tvm::TvmEPOptions options = tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); + return std::make_shared(options); } -std::shared_ptr CreateExecutionProviderFactory_Tvm(const TvmExecutionProviderInfo& info) -{ - return std::make_shared(info); +std::shared_ptr CreateExecutionProviderFactory_Tvm(const tvm::TvmEPOptions& options) { + return std::make_shared(options); } } // namespace onnxruntime ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tvm, _In_ OrtSessionOptions* options, - _In_ const char* settings) { - onnxruntime::TvmExecutionProviderInfo info = onnxruntime::TvmExecutionProviderInfo::FromOptionsString(settings); - options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Tvm(info)); + _In_ const char* opt_str) { + onnxruntime::tvm::TvmEPOptions tvm_options = onnxruntime::tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); + options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Tvm(tvm_options)); return nullptr; } diff --git a/onnxruntime/core/providers/tvm/tvm_runner.cc b/onnxruntime/core/providers/tvm/tvm_runner.cc new file mode 100644 index 0000000000000..117ecea680ea5 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner.cc @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
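Both CreateExecutionProviderFactory_Tvm overloads ultimately rely on the same option-string grammar: pairs separated by commas, each pair split on the first ':' and trimmed of surrounding whitespace. A compact Python rendering of that grammar, for reference only (the real parsing is the C++ shown above):

import re

def parse_tvm_options_string(opt_str):
    """Parse "key1: value1, key2: value2" into a dict, trimming whitespace."""
    options = {}
    if not opt_str.strip():
        return options
    for pair in re.split(r"\s*,\s*", opt_str.strip()):
        key, sep, value = pair.partition(":")
        if not sep:
            raise ValueError(f"Invalid key value pair: {pair!r}")
        options[key.strip()] = value.strip()
    return options

assert parse_tvm_options_string("executor: vm, target: llvm") == {
    "executor": "vm", "target": "llvm"}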
+ +#include "core/graph/model.h" +#include "core/framework/tensorprotoutils.h" + +#include "tvm_runner.h" + + +using namespace ONNX_NAMESPACE; +namespace onnxruntime { +namespace tvm { + +TVMRunner::TVMRunner(const TvmEPOptions& options, + const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const std::vector& output_tensors) { + runner_ = getTVMRunnerImpl(mod, options, inputs_info, output_tensors); +} + +common::Status TVMRunner::operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { + return runner_->run(api, context); +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner.h b/onnxruntime/core/providers/tvm/tvm_runner.h new file mode 100644 index 0000000000000..85d37ccec1042 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef TVM_RUNNER_H +#define TVM_RUNNER_H + +#include +#include + +#include "tvm_runner_impl.h" + + +namespace onnxruntime { +namespace tvm { + +class TVMRunner { +public: + TVMRunner() = delete; + virtual ~TVMRunner() = default; + + TVMRunner(const TvmEPOptions& options, + const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const std::vector& output_tensor); + + common::Status operator()(FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context); + +private: + std::shared_ptr runner_; +}; + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_TVM_RUNNER_H diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc new file mode 100644 index 0000000000000..bade84b6803f3 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc @@ -0,0 +1,165 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/framework/tensorprotoutils.h" + +#include "tvm_runner_impl.h" +#include "tvm_utils.h" +#include "tvm_api.h" + + +namespace onnxruntime { +namespace tvm { + +/* ------------------------------------ RunnerImplFactory ----------------------------- */ + +std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, + const TvmEPOptions& options, + const InputsInfoMap& inputs_info, + const std::vector output_tensors) { + const std::string& name = options.executor; + if (name == "graph") { + return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + } else if (name == "vm") { + return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + } + return nullptr; +} + +/* ------------------------------------ RunnerImpl ------------------------------------ */ + +RunnerImpl::RunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + mod_(mod), + inputs_info_(inputs_info), + output_shapes_(output_shapes), + output_tensors_(output_tensors) { +} + +void RunnerImpl::convert_input_tensors2dl_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context, + std::vector& dst, + std::vector& dst_inds) { + size_t num = inputs_info_.size(); + dst.reserve(num); + dst_inds.reserve(num); + for (auto& info : inputs_info_) { + // TODO(vvchernov): decomposition declaration only available with -std=c++1z or -std=gnu++1z + auto& i = info.first; + auto& shape = info.second; + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i); + ORT_ENFORCE(input_tensor->IsTensor()); + const Tensor& tensor = input_tensor->Get(); + const OrtDevice& device = tensor.Location().device; + auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); + auto tensor_type = ort.GetTensorElementType(tensor_info); + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + + DLTensor t; + t.device = GetDLDevice(device); + t.dtype = GetDataType(tensor_type); + t.strides = nullptr; + t.byte_offset = 0; + t.data = const_cast(ort.GetTensorData(input_tensor)); + t.ndim = shape.size(); + t.shape = shape.data(); + dst.emplace_back(t); + dst_inds.push_back(i); + } +} + +void RunnerImpl::add_device_type_data2output_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context) { + size_t num_outputs = output_tensors_.size(); + for (auto i = 0u; i < num_outputs; i++) { + //setup output tensor property + OrtValue* output_tensor = ort.KernelContext_GetOutput(context, + i, + output_shapes_[i].data(), + output_shapes_[i].size()); + ORT_ENFORCE(output_tensor->IsTensor()); + const Tensor& tensor = output_tensor->Get(); + const OrtDevice& device = tensor.Location().device; + auto tensor_info = ort.GetTensorTypeAndShape(output_tensor); + auto tensor_type = ort.GetTensorElementType(tensor_info); + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + + output_tensors_[i].device = GetDLDevice(device); + output_tensors_[i].dtype = GetDataType(tensor_type); + output_tensors_[i].data = ort.GetTensorMutableData(output_tensor); + } +} + +/* ------------------------------------ GERunnerImpl ------------------------------------ */ + +GERunnerImpl::GERunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { +} + +void GERunnerImpl::set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) { + std::vector inds; + std::vector dl_tensors_inputs; + 
convert_input_tensors2dl_tensors(ort, context, dl_tensors_inputs, inds); + + tvm::TVMSetInputs(*mod_, inds, dl_tensors_inputs); +} + +void GERunnerImpl::connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) { + add_device_type_data2output_tensors(ort, context); +} + +void GERunnerImpl::run_and_get_output() { + tvm::TVMRun(*mod_); + tvm::TVMGetOutputs(*mod_, output_tensors_); +} + +/* ------------------------------------ VMRunnerImpl ------------------------------------ */ + +VMRunnerImpl::VMRunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector output_tensors) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { +} + +void VMRunnerImpl::set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) { + std::vector inds; + std::vector dl_tensors_inputs; + convert_input_tensors2dl_tensors(ort, context, dl_tensors_inputs, inds); + + tvm::TVM_VM_SetInputs(*mod_, inds, dl_tensors_inputs); +} + +void VMRunnerImpl::connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) { + if(!probe_infer_) { + infer_once_to_get_output_shapes(); + } + + add_device_type_data2output_tensors(ort, context); +} + +void VMRunnerImpl::run_and_get_output() { + tvm::TVM_VM_Run(*mod_); + tvm::TVM_VM_GetOutputs(*mod_, output_tensors_); +} + +void VMRunnerImpl::infer_once_to_get_output_shapes() { + tvm::TVM_VM_Run(*mod_); + size_t num_outputs = output_tensors_.size(); + // TODO(vvchernov): check it + output_shapes_.resize(num_outputs); + tvm::TVMGetOutputShapes(*mod_, output_shapes_); + for (size_t i = 0; i < num_outputs; ++i) { + output_tensors_[i].ndim = output_shapes_[i].size(); + output_tensors_[i].shape = output_shapes_[i].data(); + } + probe_infer_ = true; +} + +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.h b/onnxruntime/core/providers/tvm/tvm_runner_impl.h new file mode 100644 index 0000000000000..e9104859c78e6 --- /dev/null +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.h @@ -0,0 +1,104 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
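To summarize the split introduced in tvm_runner_impl.cc: RunnerImpl fixes the run sequence (bind inputs, connect ORT output buffers, execute), GERunnerImpl assumes output shapes are known after compilation, and VMRunnerImpl performs a single probe inference to discover them. A schematic Python rendering of that structure, with stub bodies standing in for the TVM API calls (illustrative only, not the C++ implementation):

from abc import ABC, abstractmethod

class RunnerImpl(ABC):
    """Shared run sequence, mirroring RunnerImpl::run."""
    def __init__(self, module, inputs_info, output_shapes):
        self.module = module            # stands in for the compiled TvmModule
        self.inputs_info = inputs_info  # {input index: static shape}
        self.output_shapes = output_shapes

    def run(self, context):
        self.set_input(context)
        self.connect_output_tensors(context)
        self.run_and_get_output()

    @abstractmethod
    def set_input(self, context): ...
    @abstractmethod
    def connect_output_tensors(self, context): ...
    @abstractmethod
    def run_and_get_output(self): ...

class GraphExecutorRunner(RunnerImpl):
    """Graph executor: output shapes are known once the module is built."""
    def set_input(self, context):
        pass  # would call TVMSetInputs(module, indices, dl_tensors)
    def connect_output_tensors(self, context):
        pass  # output buffers already sized from output_shapes
    def run_and_get_output(self):
        pass  # would call TVMRun / TVMGetOutputs

class VMRunner(RunnerImpl):
    """Relay VM: one probe inference discovers the output shapes."""
    def __init__(self, module, inputs_info, output_shapes):
        super().__init__(module, inputs_info, output_shapes)
        self._probed = False
    def set_input(self, context):
        pass  # would call TVM_VM_SetInputs(module, indices, dl_tensors)
    def connect_output_tensors(self, context):
        if not self._probed:
            self.run_and_get_output()  # probe run fills output_shapes once
            self._probed = True
    def run_and_get_output(self):
        pass  # would call TVM_VM_Run / TVM_VM_GetOutputs

def get_runner_impl(executor, module, inputs_info, output_shapes):
    # Mirrors getTVMRunnerImpl: "graph" or "vm" selects the implementation.
    impls = {"graph": GraphExecutorRunner, "vm": VMRunner}
    return impls[executor](module, inputs_info, output_shapes)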
+ +#ifndef TVM_RUNNER_IMPL_H +#define TVM_RUNNER_IMPL_H + +#include +#include +#include + +#include "core/framework/func_api.h" +#include "core/session/onnxruntime_cxx_api.h" + +#include "tvm_common.h" +#include "tvm_ep_options.h" + + +namespace onnxruntime { +namespace tvm { + +class RunnerImpl { +public: + RunnerImpl() = delete; + RunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~RunnerImpl() = default; + + virtual common::Status run(const OrtCustomOpApi* api, OrtKernelContext* context) { + Ort::CustomOpApi ort{*api}; + + set_input(ort, context); + connect_output_tensors2ort(ort, context); + run_and_get_output(); + + return Status::OK(); + } + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) = 0; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) = 0; + virtual void run_and_get_output() = 0; + +protected: + void convert_input_tensors2dl_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context, + std::vector& dst, + std::vector& dst_inds); + void add_device_type_data2output_tensors(Ort::CustomOpApi& ort, + OrtKernelContext* context); + +protected: + std::shared_ptr mod_; + InputsInfoMap inputs_info_; + TVMTensorShapes output_shapes_; + std::vector output_tensors_; +}; + + +class GERunnerImpl : public RunnerImpl { +public: + GERunnerImpl() = delete; + GERunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~GERunnerImpl() = default; + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void run_and_get_output() override final; +}; + + +class VMRunnerImpl : public RunnerImpl { +public: + VMRunnerImpl() = delete; + VMRunnerImpl(const std::shared_ptr& mod, + const InputsInfoMap& inputs_info, + const TVMTensorShapes output_shapes, + const std::vector tensors_outputs); + virtual ~VMRunnerImpl() = default; + + virtual void set_input(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void connect_output_tensors2ort(Ort::CustomOpApi& ort, OrtKernelContext* context) override final; + virtual void run_and_get_output() override final; + +private: + void infer_once_to_get_output_shapes(); + +private: + bool probe_infer_ = false; +}; + + +std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, + const TvmEPOptions& options, + const InputsInfoMap& inputs_info, + const std::vector output_tensors); + +} // namespace tvm +} // namespace onnxruntime + +#endif // TVM_TVM_RUNNER_IMPL_H diff --git a/onnxruntime/core/providers/tvm/tvm_utils.h b/onnxruntime/core/providers/tvm/tvm_utils.h index ab0e8da5652f9..9471afb135578 100644 --- a/onnxruntime/core/providers/tvm/tvm_utils.h +++ b/onnxruntime/core/providers/tvm/tvm_utils.h @@ -10,7 +10,9 @@ #include "core/framework/ortdevice.h" #include "core/common/common.h" + namespace onnxruntime { +namespace tvm { inline DLDataType GetDataType(ONNXTensorElementDataType type) { if (type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { @@ -50,6 +52,7 @@ inline DLDevice GetDLDevice(const OrtDevice& device) { return context; } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime #endif // TVM_UTILS_H diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc 
b/onnxruntime/core/providers/tvm/xpu_data_transfer.cc index 4efb171dda849..5247382566aad 100644 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc +++ b/onnxruntime/core/providers/tvm/xpu_data_transfer.cc @@ -6,7 +6,10 @@ #include "xpu_data_transfer.h" #include "tvm_utils.h" + namespace onnxruntime { +namespace tvm { + XPUDataTransfer::XPUDataTransfer() { } @@ -14,8 +17,8 @@ XPUDataTransfer::~XPUDataTransfer() { } bool XPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return (src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU) || - (src_device.Type() == OrtDevice::GPU || dst_device.Type() == OrtDevice::GPU); + return (src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU) || + (src_device.Type() == OrtDevice::GPU || dst_device.Type() == OrtDevice::GPU); } common::Status XPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int _exec_queue_id) const { @@ -27,11 +30,11 @@ common::Status XPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int _ const OrtDevice& dst_device = dst.Location().device; if ((src_device.Type() == OrtDevice::CPU) && (dst_device.Type() == OrtDevice::CPU)) { - if (src_data == dst_data) { - // no need copying as both pointers are referring to same piece of memory. - return Status::OK(); - } - memcpy(dst_data, src_data, bytes); + if (src_data == dst_data) { + // no need copying as both pointers are referring to same piece of memory. + return Status::OK(); + } + memcpy(dst_data, src_data, bytes); } else { DLTensor tvm_src, tvm_dst; DLDataType dl_type{kDLInt, 8, 1}; @@ -80,4 +83,5 @@ common::Status TvmCPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, in return Status::OK(); } -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.h b/onnxruntime/core/providers/tvm/xpu_data_transfer.h index f07c11794390a..0b38f71baa22e 100644 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.h +++ b/onnxruntime/core/providers/tvm/xpu_data_transfer.h @@ -7,10 +7,12 @@ #include "core/framework/data_transfer.h" #include "tvm_common.h" + namespace onnxruntime { +namespace tvm { class XPUDataTransfer : public IDataTransfer { - public: +public: XPUDataTransfer(); ~XPUDataTransfer(); @@ -23,7 +25,7 @@ class XPUDataTransfer : public IDataTransfer { }; class TvmCPUDataTransfer : public IDataTransfer { - public: +public: TvmCPUDataTransfer() = default; // Dampen MSVC warning about not fully overriding CopyTensor using IDataTransfer::CopyTensor; @@ -31,5 +33,7 @@ class TvmCPUDataTransfer : public IDataTransfer { common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; }; -} // namespace onnxruntime +} // namespace tvm +} // namespace onnxruntime + #endif // XPU_DATA_TRANSFER diff --git a/onnxruntime/python/tools/microbench/attention.py b/onnxruntime/python/tools/microbench/attention.py new file mode 100644 index 0000000000000..bc9daae4455c5 --- /dev/null +++ b/onnxruntime/python/tools/microbench/attention.py @@ -0,0 +1,57 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + batch_size: int + seq_len: int + hidden_size: int + length: int + data_type: type + + +class BenchmarkAttention(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + weight = np.random.rand(op_param.hidden_size, op_param.length).astype(op_param.data_type) + bias = np.random.rand(op_param.length).astype(op_param.data_type) + mask_index = np.random.rand(op_param.batch_size, op_param.seq_len).astype(np.int32) + output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index} + outputs = {"return_val": output_data} + return inputs, outputs + + def create_cases(self): + model = "models/attention_fp16.onnx" if self.args.precision == "fp16" else "models/attention_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-base + op_param = OpParam(1, 384, 768, 768 * 3, data_type) + self.add_case(op_param, model) + + def case_profile(cls, op_param, time): + profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkAttention(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py index cb8e5f57c20f0..86fa98e153146 100644 --- a/onnxruntime/python/tools/microbench/benchmark.py +++ b/onnxruntime/python/tools/microbench/benchmark.py @@ -1,63 +1,94 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + from abc import ABC, abstractmethod from argparse import ArgumentParser -import time +import logging import numpy import onnxruntime as ort +import time import torch +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + def numpy_type(torch_type): type_map = {torch.float32: numpy.float32, - torch.float16: numpy.float16} + torch.float16: numpy.float16, + torch.int32: numpy.int32} return type_map[torch_type] def add_arguments(parser: ArgumentParser): - parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use") - parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use") - parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling') + parser.add_argument("--provider", required=False, type=str, + choices=["cuda", "rocm", "cpu", None], default=None, + help=("Execution provider to use. 
By default, a " + "provider is selected in the priority order " + "(cuda|rocm, cpu) depending on availability.")) + parser.add_argument("--precision", required=False, type=str, + choices=["fp16", "fp32"], default="fp16", + help="Number format to use") + parser.add_argument('--profiling', required=False, type=bool, + default=False, help='If enable profiling') + + +def provider_name(name): + provider_map = {"cuda": "CUDAExecutionProvider", + "rocm": "ROCMExecutionProvider", + "cpu": "CPUExecutionProvider"} + return provider_map[name] + + +def get_default_provider(): + if "CUDAExecutionProvider" in ort.get_available_providers(): + return "CUDAExecutionProvider" + if "ROCMExecutionProvider" in ort.get_available_providers(): + return "ROCMExecutionProvider" + return "CPUExecutionProvider" class Benchmark: def __init__(self, model, inputs, outputs, args): - self.provider = args.provider + self.provider = (get_default_provider() if args.provider == None + else provider_name(args.provider)) + logger.info(f"Execution provider: {self.provider}") self.profiling = args.profiling self.model = model + logger.info(f"Model: {self.model}") self.inputs = inputs self.outputs = outputs def create_input_output_tensors(self): - device = "cuda" - input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()} - output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()} + on_gpu = (self.provider == "CUDAExecutionProvider" + or self.provider == "ROCMExecutionProvider") + device = "cuda" if on_gpu else "cpu" + input_tensors = {name: torch.from_numpy(array).to(device) + for name, array in self.inputs.items()} + output_tensors = {name: torch.from_numpy(array).to(device) + for name, array in self.outputs.items()} return input_tensors, output_tensors @classmethod def create_io_binding(cls, sess, input_tensors, output_tensors): io_binding = sess.io_binding() for name, tensor in input_tensors.items(): - io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + io_binding.bind_input(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) for name, tensor in output_tensors.items(): - io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + io_binding.bind_output(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) return io_binding def create_session(self): sess_opt = ort.SessionOptions() sess_opt.enable_profiling = self.profiling - if self.provider == "rocm": - execution_provider = ["ROCMExecutionProvider"] - elif self.provider == "cuda": - execution_provider = ["CUDAExecutionProvider"] - else: - raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.") - - sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider) - - if self.provider == "rocm": - assert 'ROCMExecutionProvider' in sess.get_providers() - elif self.provider == "cuda": - assert 'CUDAExecutionProvider' in sess.get_providers() - + sess = ort.InferenceSession(self.model, sess_options=sess_opt, + providers=[self.provider]) return sess def benchmark(self): diff --git a/onnxruntime/python/tools/microbench/cast.py b/onnxruntime/python/tools/microbench/cast.py new file mode 100644 index 0000000000000..d6ae83a236c85 --- /dev/null +++ b/onnxruntime/python/tools/microbench/cast.py @@ -0,0 +1,75 @@ 
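The microbenchmark driver now resolves the execution provider at runtime instead of hard-coding ROCm. A quick way to see which provider it would pick on a given machine, mirroring get_default_provider above (the CLI lines in the trailing comment are illustrative):

import onnxruntime as ort

available = ort.get_available_providers()
default = next((p for p in ("CUDAExecutionProvider", "ROCMExecutionProvider")
                if p in available), "CPUExecutionProvider")
print("default microbench provider:", default)

# An explicit choice still overrides the default, e.g.:
#   python cast.py --provider cpu --precision fp32
#   python attention.py --provider rocm --precision fp16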
+#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + x : int + y : int + m : int + n : int + input_data_type : type + output_data_type : type + + +@dataclass +class ModelParam: + token_type_ids_dim0 : int + input_ids_dim1 : int + + +class BenchmarkCast(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.x, op_param.y, op_param.m, op_param.n).astype(op_param.input_data_type) + output_data = np.random.rand(op_param.x, op_param.y, op_param.m, op_param.n).astype(op_param.output_data_type) + inputs = {"X": input_data} + outputs = {"Y": output_data} + return inputs, outputs + + def add_model_cases(self, mp, model, input_data_type, output_data_type): + self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, mp.token_type_ids_dim0, mp.input_ids_dim1, 1, input_data_type, output_data_type), model) + self.add_case(OpParam(16, mp.token_type_ids_dim0, mp.input_ids_dim1, mp.input_ids_dim1, input_data_type, output_data_type), model) + + def create_cases(self): + model = "models/cast_fp16tofp32.onnx" if self.args.precision == "fp16" else "models/cast_fp32tofp16.onnx" + input_data_type = np.float16 if self.args.precision == "fp16" else np.float32 + output_data_type = np.float32 if self.args.precision == "fp16" else np.float16 + # huggingface bert-large + self.add_case(OpParam(1, 1, 1, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 1024, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 4096, input_data_type, output_data_type), model) + self.add_case(OpParam(1, 1, 1024, 30522, input_data_type, output_data_type), model) + # huggingface bert-large with default dims + model_param = ModelParam(8, 512) + self.add_model_cases(model_param, model, input_data_type, output_data_type) + # huggingface bert-large with large input dims + model_param = ModelParam(32, 1024) + self.add_model_cases(model_param, model, input_data_type, output_data_type) + + def case_profile(cls, op_param, time): + profile = f"(x y m n input_data_type) = ({op_param.x} {op_param.y} {op_param.m} {op_param.n} {op_param.input_data_type}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkCast(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/microbench/fast_gelu.py b/onnxruntime/python/tools/microbench/fast_gelu.py index 3014cf6234644..2d50e256a0642 100644 --- a/onnxruntime/python/tools/microbench/fast_gelu.py +++ b/onnxruntime/python/tools/microbench/fast_gelu.py @@ -1,3 +1,8 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
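The cast benchmark above expects models named models/cast_fp16tofp32.onnx and models/cast_fp32tofp16.onnx with an input "X" and output "Y". If the repository generates those files elsewhere this is redundant, but a compatible fp16-to-fp32 model can be sketched with the onnx helper API as follows (the dynamic dimension names are illustrative):

import os
import onnx
from onnx import TensorProto, helper

x = helper.make_tensor_value_info("X", TensorProto.FLOAT16, ["x", "y", "m", "n"])
y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, ["x", "y", "m", "n"])
cast = helper.make_node("Cast", inputs=["X"], outputs=["Y"], to=TensorProto.FLOAT)
graph = helper.make_graph([cast], "cast_fp16tofp32", [x], [y])
model = helper.make_model(graph)
onnx.checker.check_model(model)

os.makedirs("models", exist_ok=True)
onnx.save(model, "models/cast_fp16tofp32.onnx")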
+#-------------------------------------------------------------------------- + import argparse from dataclasses import dataclass import numpy as np diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py index 8c091d97f0086..1de45ee5c75b3 100644 --- a/onnxruntime/python/tools/microbench/matmul.py +++ b/onnxruntime/python/tools/microbench/matmul.py @@ -1,3 +1,8 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + import argparse from dataclasses import dataclass import numpy as np diff --git a/onnxruntime/python/tools/microbench/skip_layer_norm.py b/onnxruntime/python/tools/microbench/skip_layer_norm.py new file mode 100644 index 0000000000000..b6f8c5f9e15e0 --- /dev/null +++ b/onnxruntime/python/tools/microbench/skip_layer_norm.py @@ -0,0 +1,59 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import argparse +from dataclasses import dataclass +import numpy as np +from benchmark import BenchmarkOp, add_arguments + + +@dataclass +class OpParam: + batch_size: int + seq_len: int + hidden_size: int + data_type: type + + +class BenchmarkSkipLayerNorm(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) + + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + skip = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + gamma = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + beta = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + bias = np.random.rand(op_param.hidden_size).astype(op_param.data_type) + output_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(op_param.data_type) + + inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias} + outputs = {"return_val": output_data} + + return inputs, outputs + + def create_cases(self): + model = "models/skip_layer_norm_fp16.onnx" if self.args.precision == "fp16" else "models/skip_layer_norm_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-large + op_param = OpParam(1, 384, 1024, data_type) + self.add_case(op_param, model) + + def case_profile(cls, op_param, time): + profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms" + return profile + + +def main(): + parser = argparse.ArgumentParser() + add_arguments(parser) + args = parser.parse_args() + bm = BenchmarkSkipLayerNorm(args) + bm.benchmark() + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index c559112028399..3165c5be8ea2c 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -608,7 +608,7 @@ def compute_percentile(self): cdf = np.cumsum(hist/total) if self.symmetric: idx_right = np.searchsorted(cdf, percentile / 100.0) - thresholds_dict[tensor] = 
(-float(hist_edges[idx_ringht]), float(hist_edges[idx_right])) + thresholds_dict[tensor] = (-float(hist_edges[idx_right]), float(hist_edges[idx_right])) else: percent_to_cut_one_side = (100.0 - percentile) / 200.0 idx_right = np.searchsorted(cdf, 1.0 - percent_to_cut_one_side) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 0d2e7feee7c13..889adf0c4531d 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -1803,6 +1803,21 @@ def _propagate_shape_and_type(self, node, input_index=0, output_index=0): vi = self.known_vi_[node.output[output_index]] vi.CopyFrom(helper.make_tensor_value_info(node.output[output_index], output_dtype, shape)) + def _is_none_dim(self, dim_value): + if type(dim_value) != str: + return False + if "unk__" not in dim_value: + return False + if dim_value in self.symbolic_dims_.keys(): + return False + return True + + def _is_shape_contains_none_dim(self, out_shape): + for out in out_shape: + if self._is_none_dim(out): + return out + return None + def _infer_impl(self, start_sympy_data=None): self.sympy_data_ = start_sympy_data or {} self.out_mp_.graph.ClearField('value_info') @@ -1956,7 +1971,8 @@ def get_prereq(node): if node.output[i_o] in self.sympy_data_: logger.debug(' Sympy Data: ' + str(self.sympy_data_[node.output[i_o]])) - if (out_shape is not None and None in out_shape) or out_type_undefined: + # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain + if (out_shape is not None and (None in out_shape or self._is_shape_contains_none_dim(out_shape))) or out_type_undefined: if self.auto_merge_: if node.op_type in [ 'Add', 'Sub', 'Mul', 'Div', 'MatMul', 'MatMulInteger', 'MatMulInteger16', 'Concat', @@ -1964,8 +1980,11 @@ def get_prereq(node): ]: shapes = [self._get_shape(node, i) for i in range(len(node.input))] if node.op_type in ['MatMul', 'MatMulInteger', 'MatMulInteger16']: - if None in out_shape: - idx = out_shape.index(None) + if None in out_shape or self._is_shape_contains_none_dim(out_shape): + if None in out_shape: + idx = out_shape.index(None) + else: + idx = out_shape.index(self._is_shape_contains_none_dim(out_shape)) dim_idx = [len(s) - len(out_shape) + idx for s in shapes] # only support auto merge for MatMul for dim < rank-2 when rank > 2 assert len(shapes[0]) > 2 and dim_idx[0] < len(shapes[0]) - 2 @@ -1978,7 +1997,7 @@ def get_prereq(node): if shapes: for idx in range(len(out_shape)): - if out_shape[idx] is not None: + if out_shape[idx] is not None and not self._is_none_dim(out_shape[idx]): continue # note that the broadcasting rule aligns from right to left # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 57741f6e1d238..ba8694da4d51e 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -34,6 +34,8 @@ python benchmark.py -e torchscript -g -p "fp16" Run ONNXRuntime and TorchScript on CPU for all models with quantization: python benchmark.py -e torchscript onnxruntime -p "int8" -o + Run OnnxRuntime with the ROCM provider and graph optimization script: + python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm It is recommended to use run_benchmark.sh to launch benchmark. 
""" @@ -51,6 +53,7 @@ from benchmark_helper import (OptimizerInfo, create_onnxruntime_session, Precision, setup_logger, get_latency_result, output_details, output_summary, output_fusion_statistics, inference_ort, inference_ort_with_io_binding, allocateOutputBuffers, ConfigModifier) +from fusion_options import FusionOptions from quantize_helper import QuantizeHelper from onnx_exporter import create_onnxruntime_input, load_pretrained_model, export_onnx_model_from_pt, export_onnx_model_from_tf @@ -71,7 +74,7 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads, batch_sizes, sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx, cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, - model_source): + model_source, args): import onnxruntime results = [] @@ -92,6 +95,9 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier ) return results + if optimizer_info == OptimizerInfo.NOOPT: + logger.warning(f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied.") + for model_name in model_names: all_input_names = MODELS[model_name][0] for num_inputs in input_counts: @@ -99,18 +105,20 @@ def run_onnxruntime(use_gpu, provider, model_names, model_class, config_modifier break input_names = all_input_names[:num_inputs] + args.model_type = MODELS[model_name][3] + fusion_options = FusionOptions.parse(args) if 'pt' in model_source: with torch.no_grad(): onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options) if 'tf' in model_source: onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf( model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics) + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options) if not is_valid_onnx_model: continue @@ -198,7 +206,7 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n for model_name in model_names: config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir) - config_modifier(config) + config_modifier.modify(config) model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class) tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -240,6 +248,7 @@ def run_pytorch(use_gpu, model_names, model_class, config_modifier, precision, n result = { "engine": "torchscript" if torchscript else "torch", "version": torch.__version__, + "providers": "NA", "device": "cuda" if use_gpu else "cpu", "optimizer": "", "precision": precision, @@ -315,7 +324,7 @@ def run_tensorflow(use_gpu, model_names, model_class, config_modifier, precision for model_name in model_names: config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir) - config_modifier(config) + 
config_modifier.modify(config) model = load_pretrained_model(model_name, config=config, @@ -373,6 +382,7 @@ def lxmert_forward(): result = { "engine": "tensorflow", "version": tf.__version__, + "providers": "NA", "device": "cuda" if use_gpu else "cpu", "optimizer": "", "precision": precision, @@ -517,6 +527,8 @@ def parse_arguments(): default=None, help="Manually set the model's layer number") + FusionOptions.add_arguments(parser) + args = parser.parse_args() return args @@ -584,7 +596,7 @@ def main(): args.test_times, args.input_counts, args.optimizer_info, args.validate_onnx, args.cache_dir, args.onnx_dir, args.verbose, args.overwrite, args.disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, - args.model_source) + args.model_source, args) except: logger.error(f"Exception", exc_info=True) diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 6b621492b2ec2..7c13ca3c8d945 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -21,6 +21,7 @@ import psutil import csv import numpy as np +import torch import random from datetime import datetime import multiprocessing @@ -36,6 +37,7 @@ class TestSetting: test_cases: int test_times: int use_gpu: bool + use_io_binding: bool provider: str intra_op_num_threads: int seed: int @@ -119,6 +121,55 @@ def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_op return session +def numpy_type(torch_type): + type_map = {torch.float32: np.float32, + torch.float16: np.float16, + torch.int32: np.int32, + torch.int64: np.longlong} + return type_map[torch_type] + +def create_input_output_tensors(inputs, outputs, device): + input_tensors = {name: torch.from_numpy(array).to(device) + for name, array in inputs.items()} + output_tensors = {name: torch.from_numpy(array).to(device) + for name, array in outputs.items()} + return input_tensors, output_tensors + +def create_io_binding(sess, input_tensors, output_tensors): + io_binding = sess.io_binding() + for name, tensor in input_tensors.items(): + io_binding.bind_input(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) + for name, tensor in output_tensors.items(): + io_binding.bind_output(name, tensor.device.type, 0, + numpy_type(tensor.dtype), tensor.shape, + tensor.data_ptr()) + return io_binding + +def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting): + results = [] + latency_list = [] + device = 'cuda' if test_setting.use_gpu else 'cpu' + for test_case_id, inputs in enumerate(all_inputs): + result = session.run(output_names, inputs) + results.append(result) + outputs = {} + for i in range(len(output_names)): + outputs[output_names[i]] = result[i] + + input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device) + io_binding = create_io_binding(session, input_tensors, output_tensors) + + # warm up once + session.run_with_iobinding(io_binding) + + start_time = timeit.default_timer() + session.run_with_iobinding(io_binding) + latency = timeit.default_timer() - start_time + latency_list.append(latency) + + return results, latency_list def onnxruntime_inference(session, all_inputs, output_names): if len(all_inputs) > 0: @@ -135,7 +186,6 @@ def onnxruntime_inference(session, all_inputs, output_names): latency_list.append(latency) return results, latency_list - def to_string(model_path, session, test_setting): sess_options = 
session.get_session_options() option = "model={},".format(os.path.basename(model_path)) @@ -159,9 +209,14 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op print("Running test:", key) all_latency_list = [] - for i in range(test_setting.test_times): - results, latency_list = onnxruntime_inference(session, all_inputs, output_names) - all_latency_list.extend(latency_list) + if test_setting.use_io_binding: + for i in range(test_setting.test_times): + results, latency_list = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting) + all_latency_list.extend(latency_list) + else: + for i in range(test_setting.test_times): + results, latency_list = onnxruntime_inference(session, all_inputs, output_names) + all_latency_list.extend(latency_list) # latency in miliseconds latency_ms = np.array(all_latency_list) * 1000 @@ -269,6 +324,9 @@ def parse_arguments(): parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU") parser.set_defaults(use_gpu=False) + parser.add_argument('--use_io_binding', required=False, action='store_true', help="use io_binding") + parser.set_defaults(use_io_binding=False) + parser.add_argument("--provider", required=False, type=str, @@ -311,7 +369,7 @@ def main(): args.opt_level) for batch_size in batch_size_set: - test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, + test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu, args.use_io_binding, args.provider, args.intra_op_num_threads, args.seed, args.verbose) print("test setting", test_setting) diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index 3823e001e0634..b5b26c0b046a7 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -280,6 +280,12 @@ def convert_float_to_float16(model, if n.name not in graph_io_to_skip: n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 value_info_list.append(n) + if n.type.HasField('sequence_type'): + if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: + if n.name not in graph_io_to_skip: + n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + value_info_list.append(n) + queue = next_level for key, value in fp32_initializers.items(): diff --git a/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py b/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py index 2570673692a05..93d42ffb65c4b 100644 --- a/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_beamsearch_helper.py @@ -17,6 +17,7 @@ from transformers import GPT2LMHeadModel, GPT2Config from benchmark_helper import Precision from gpt2_helper import Gpt2Helper, Gpt2Inputs, GPT2ModelNoPastState, MyGPT2Model, MyGPT2LMHeadModel, MyGPT2LMHeadModel_NoPadding +from torch_onnx_export_helper import torch_onnx_export logger = logging.getLogger(__name__) @@ -36,7 +37,7 @@ def create_helper(helper_type="default"): class GPT2LMHeadModel_BeamSearchStep(GPT2LMHeadModel): - """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one + """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one step beam search.""" def __init__(self, config, batch_size, beam_size): super().__init__(config) @@ -120,7 +121,7 @@ 
def forward( class GPT2LMHeadModel_ConfigurableOneStepSearch(GPT2LMHeadModel): - """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one + """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and one step beam search with configuration support.""" def __init__(self, config, @@ -628,7 +629,7 @@ def export_onnx(model, Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export( + torch_onnx_export( model, args=tuple(input_list), f=onnx_model_path, diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index d0a2b92c5f16f..cc7712e163df9 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -21,6 +21,7 @@ from fusion_utils import FusionUtils from benchmark_helper import Precision from io_binding_helper import IOBindingHelper +from torch_onnx_export_helper import torch_onnx_export logger = logging.getLogger(__name__) @@ -402,7 +403,7 @@ def export_onnx(model, Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, args=tuple(input_list), f=onnx_model_path, input_names=input_names, diff --git a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py index 3e9922bef3cfa..fb7b0adefb1fa 100644 --- a/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py +++ b/onnxruntime/python/tools/transformers/longformer/convert_longformer_to_onnx.py @@ -15,6 +15,8 @@ # # For inference of the onnx model, you will need onnxruntime-gpu 1.7.0 or above. +import sys +import os import torch import numpy as np import argparse @@ -25,6 +27,9 @@ from pathlib import Path from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from torch_onnx_export_helper import torch_onnx_export + @parse_args('v', 'v', 'v', 'v', 'v', 'v', 'v', 'i', 'i') def my_longformer_attention(g, input, weight, bias, mask, global_weight, global_bias, global_mask, num_heads, window): @@ -223,7 +228,7 @@ def export_longformer(model, onnx_model_path, export_padding): Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, example_inputs, onnx_model_path, opset_version=11, diff --git a/onnxruntime/python/tools/transformers/models/t5/past_helper.py b/onnxruntime/python/tools/transformers/models/t5/past_helper.py index 3c585c23c8058..0a9eb37be9443 100644 --- a/onnxruntime/python/tools/transformers/models/t5/past_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/past_helper.py @@ -11,7 +11,6 @@ class PastKeyValuesHelper: """ Helper functions to process past key values for encoder-decoder model""" - @staticmethod def get_past_names(num_layers, present: bool = False): past_self_names = [] diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py index 5bfd530581d6e..26e5d9733e0c8 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import List, Union +import sys +import os import logging import numpy import torch @@ -14,6 +16,9 @@ from t5_encoder import 
T5EncoderInputs from past_helper import PastKeyValuesHelper +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) @@ -21,7 +26,6 @@ class T5DecoderInit(torch.nn.Module): """ A T5 decoder with LM head to create initial past key values. This model is only called once during starting decoding. """ - def __init__(self, decoder: torch.nn.Module, lm_head: torch.nn.Module, @@ -58,7 +62,6 @@ def forward(self, decoder_input_ids: torch.Tensor, encoder_attention_mask: torch class T5Decoder(torch.nn.Module): """ A T5 decoder with LM head and past key values""" - def __init__(self, decoder, lm_head, config): super().__init__() self.decoder = decoder @@ -89,7 +92,6 @@ def forward(self, decoder_input_ids, encoder_attention_mask, encoder_hidden_stat class T5DecoderInputs: - def __init__(self, decoder_input_ids, encoder_attention_mask, encoder_hidden_states, past_key_values=None): self.decoder_input_ids: torch.LongTensor = decoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask @@ -160,7 +162,6 @@ def to_list(self) -> List: class T5DecoderHelper: - @staticmethod def export_onnx(decoder: Union[T5Decoder, T5DecoderInit], device: torch.device, @@ -250,7 +251,7 @@ def export_onnx(decoder: Union[T5Decoder, T5DecoderInit], } Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(decoder, + torch_onnx_export(decoder, args=tuple(input_list), f=onnx_model_path, export_params=True, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py index c0086896b74d3..cf0f7f97abcf3 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py @@ -5,6 +5,8 @@ # -------------------------------------------------------------------------- import random +import sys +import os from pathlib import Path from typing import List import logging @@ -13,12 +15,14 @@ from transformers import T5Config from onnxruntime import InferenceSession +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) class T5Encoder(torch.nn.Module): """ T5 encoder outputs only the last hidden state""" - def __init__(self, encoder, config: T5Config): super().__init__() self.encoder = encoder @@ -29,7 +33,6 @@ def forward(self, input_ids, attention_mask): class T5EncoderInputs: - def __init__(self, input_ids, attention_mask): self.input_ids: torch.LongTensor = input_ids self.attention_mask: torch.LongTensor = attention_mask @@ -44,7 +47,7 @@ def create_dummy(batch_size: int, sequence_length: int, vocab_size: int, sequence_length (int): sequence length vocab_size (int): vocaburary size device (torch.device): device of output tensors - + Returns: T5EncoderInputs: dummy inputs for encoder """ @@ -67,7 +70,6 @@ def to_list(self) -> List: class T5EncoderHelper: - @staticmethod def export_onnx(encoder: T5Encoder, device: torch.device, @@ -93,7 +95,7 @@ def export_onnx(encoder: T5Encoder, outputs = encoder(encoder_inputs.input_ids, encoder_inputs.attention_mask) Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(encoder, + torch_onnx_export(encoder, args=tuple(encoder_inputs.to_list()), f=onnx_model_path, export_params=True, diff --git 
a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py index 29b82cda191f6..bbfff80591fc3 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import List +import sys +import os import logging import numpy import torch @@ -15,13 +17,15 @@ from t5_decoder import T5DecoderInit from past_helper import PastKeyValuesHelper +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from torch_onnx_export_helper import torch_onnx_export + logger = logging.getLogger(__name__) class T5EncoderDecoderInit(torch.nn.Module): """ A combination of T5Encoder and T5DecoderInit. """ - def __init__(self, encoder: torch.nn.Module, decoder: torch.nn.Module, @@ -44,7 +48,6 @@ def forward(self, class T5EncoderDecoderInitInputs: - def __init__(self, encoder_input_ids, encoder_attention_mask, decoder_input_ids=None): self.encoder_input_ids: torch.LongTensor = encoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask @@ -70,7 +73,6 @@ def to_list(self) -> List: class T5EncoderDecoderInitHelper: - @staticmethod def export_onnx(model: T5EncoderDecoderInit, device: torch.device, @@ -153,7 +155,7 @@ def export_onnx(model: T5EncoderDecoderInit, dynamic_axes[name] = {0: 'batch_size', 1: num_heads, 2: sequence_length, 3: head_size} Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch.onnx.export(model, + torch_onnx_export(model, args=tuple(input_list), f=onnx_model_path, export_params=True, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index f04fa9941c45f..4bcb5d428463c 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -22,7 +22,6 @@ class T5Helper: - @staticmethod def get_onnx_path(output_dir: str, model_name_or_path: str, suffix: str = "", new_folder: bool = False) -> str: """Build onnx path diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 30d767e93076a..04228cd02e888 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -15,6 +15,7 @@ from gpt2_helper import GPT2ModelNoPastState, PRETRAINED_GPT2_MODELS, TFGPT2ModelNoPastState from quantize_helper import QuantizeHelper from huggingface_models import MODEL_CLASSES +from torch_onnx_export_helper import torch_onnx_export os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -184,13 +185,14 @@ def optimize_onnx_model_by_ort(onnx_model_path, ort_model_path, use_gpu, overwri def optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, num_attention_heads, hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite, model_fusion_statistics, - use_external_data_format): + use_external_data_format, optimization_options=None): if overwrite or not os.path.exists(optimized_model_path): Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True) from optimizer import optimize_model from fusion_options import FusionOptions - optimization_options = FusionOptions(model_type) + if optimization_options == None: + optimization_options = FusionOptions(model_type) 
optimization_options.use_raw_attention_mask(use_raw_attention_mask) if Precision.FLOAT16 == precision: optimization_options.enable_gelu_approximation = True @@ -317,7 +319,8 @@ def validate_and_optimize_onnx(model_name, onnx_model_path, example_inputs, example_outputs_flatten, - output_names=None): + output_names, + fusion_options): is_valid_onnx_model = True if validate_onnx: is_valid_onnx_model = validate_onnx_model(onnx_model_path, example_inputs, example_outputs_flatten, use_gpu, @@ -330,7 +333,7 @@ def validate_and_optimize_onnx(model_name, False, use_external_data_format) optimize_onnx_model(model_name, onnx_model_path, optimized_model_path, model_type, config.num_attention_heads, config.hidden_size, use_gpu, precision, use_raw_attention_mask, overwrite, - model_fusion_statistics, use_external_data_format) + model_fusion_statistics, use_external_data_format, fusion_options) onnx_model_path = optimized_model_path if validate_onnx: @@ -352,7 +355,7 @@ def validate_and_optimize_onnx(model_name, def export_onnx_model_from_pt(model_name, opset_version, use_external_data_format, model_type, model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics): + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options): config, model = load_pt_model(model_name, model_class, cache_dir, config_modifier) # config, model = load_pt_model_from_tf(model_name) @@ -384,7 +387,7 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) replace_torch_functions() - torch.onnx.export(model=model, + torch_onnx_export(model=model, args=tuple(example_inputs.values()), f=onnx_model_path, input_names=list(example_inputs.keys()), @@ -401,14 +404,14 @@ def export_onnx_model_from_pt(model_name, opset_version, use_external_data_forma onnx_model_file, is_valid_onnx_model, vocab_size = validate_and_optimize_onnx( model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, config, model_fusion_statistics, onnx_model_path, - example_inputs, example_outputs_flatten, None) + example_inputs, example_outputs_flatten, None, fusion_options) return onnx_model_file, is_valid_onnx_model, vocab_size, max_input_size def export_onnx_model_from_tf(model_name, opset_version, use_external_data_format, model_type, model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision, optimizer_info, - validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics): + validate_onnx, use_raw_attention_mask, overwrite, model_fusion_statistics, fusion_options): # Use CPU to export import tensorflow as tf tf.config.set_visible_devices([], 'GPU') @@ -495,6 +498,6 @@ def export_onnx_model_from_tf(model_name, opset_version, use_external_data_forma opt_onnx_model_file, onnx_model_file, is_valid_onnx_model, vocab_size = validate_and_optimize_onnx( model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu, precision, optimizer_info, validate_onnx, use_raw_attention_mask, overwrite, config, model_fusion_statistics, onnx_model_path, - example_inputs, example_outputs_flatten, output_names) + example_inputs, example_outputs_flatten, output_names, fusion_options) return opt_onnx_model_file, onnx_model_file, is_valid_onnx_model, vocab_size, 
max_input_size diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py new file mode 100644 index 0000000000000..0912ee396f20e --- /dev/null +++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -0,0 +1,68 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +import torch +TrainingMode = torch.onnx.TrainingMode +from packaging.version import Version + +def torch_onnx_export( + model, + args, + f, + export_params=True, + verbose=False, + training=TrainingMode.EVAL, + input_names=None, + output_names=None, + operator_export_type=None, + opset_version=None, + _retain_param_name=None, + do_constant_folding=True, + example_outputs=None, + strip_doc_string=None, + dynamic_axes=None, + keep_initializers_as_inputs=None, + custom_opsets=None, + enable_onnx_checker=None, + use_external_data_format=None, + export_modules_as_functions=False): + if Version(torch.__version__) >= Version("1.11.0"): + torch.onnx.export( + model=model, + args=args, + f=f, + export_params=export_params, + verbose=verbose, + training=training, + input_names=input_names, + output_names=output_names, + operator_export_type=operator_export_type, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + custom_opsets=custom_opsets, + export_modules_as_functions=export_modules_as_functions) + else: + torch.onnx.export( + model=model, + args=args, + f=f, + export_params=export_params, + verbose=verbose, + training=training, + input_names=input_names, + output_names=output_names, + operator_export_type=operator_export_type, + opset_version=opset_version, + _retain_param_name=_retain_param_name, + do_constant_folding=do_constant_folding, + example_outputs=example_outputs, + strip_doc_string=strip_doc_string, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + custom_opsets=custom_opsets, + enable_onnx_checker=enable_onnx_checker, + use_external_data_format=use_external_data_format) diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index c6c8e9a890d9d..05699be42c9de 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1412,11 +1412,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { std::vector inputs = {&node_1}; std::vector outputs = {&node_2}; auto& cast_node = graph.AddNode("cast_1", "Cast", "node 2", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } { std::vector inputs = {&node_2, &data_0}; @@ -1462,11 +1458,7 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { std::vector inputs = {&if_cond_input}; std::vector outputs = {&graph_if_input}; auto& cast_node = graph.AddNode("cast_9", "Cast", "node 2", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - 
to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } std::vector inputs = {&if_cond_input}; @@ -1600,11 +1592,7 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { std::vector inputs = {&graph_0__value_1}; std::vector outputs = {&graph_0__value_2}; auto& cast_node = graph.AddNode("graph_0__cast_0", "Cast", "cast node in main graph", inputs, outputs); - ONNX_NAMESPACE::AttributeProto to; - to.set_name("to"); - to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - cast_node.AddAttribute("to", to); + cast_node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_FLOAT}); } { std::vector inputs = {&graph_0__value_2, &input_0}; diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index a8af3e2be4b5b..9c9f3d3d2df99 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -84,18 +84,7 @@ TEST_F(ShapeInferenceTest, BasicTest) { Input("X1", type1); auto& node = Node("Cast", "X1", "Y1"); - //AttributeProto squeezed_axes; - //squeezed_axes.set_name("axes"); - //squeezed_axes.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS); - //squeezed_axes.add_ints(0); - //p_node->AddAttribute("axes", squeezed_axes); - AttributeProto cast_to; - cast_to.set_name("to"); - cast_to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT); - cast_to.set_i(ONNX_NAMESPACE::TensorProto_DataType_INT32); - //cast_to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING); - //cast_to.set_s("INT16"); - node.AddAttribute("to", cast_to); + node.AddAttribute("to", int64_t{ONNX_NAMESPACE::TensorProto_DataType_INT32}); DoShapeInference(); // check inferred shapes diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 1d658387ebcd9..96b725c75593d 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -1537,21 +1537,11 @@ TEST_F(GraphTest, AddTensorAttribute) { } void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, int64_t attr_value) { - AttributeProto attr; - attr.set_name(attr_name); - attr.set_type(AttributeProto_AttributeType_INT); - attr.set_i(attr_value); - p_node.AddAttribute(attr_name, attr); + p_node.AddAttribute(attr_name, attr_value); } void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, std::initializer_list attr_value) { - AttributeProto attr; - attr.set_name(attr_name); - attr.set_type(AttributeProto_AttributeType_INTS); - for (auto v : attr_value) { - attr.add_ints(v); - } - p_node.AddAttribute(attr_name, attr); + p_node.AddAttribute(attr_name, attr_value); } // Test that output type can be inferred for ops with a type-attribute diff --git a/onnxruntime/test/optimizer/qdq_test_utils.cc b/onnxruntime/test/optimizer/qdq_test_utils.cc index 607049917f976..9e3318dc0f79f 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.cc +++ b/onnxruntime/test/optimizer/qdq_test_utils.cc @@ -98,5 +98,36 @@ GetQDQTestCaseFn BuildQDQConcatTestCase(const std::vector>& }; } +GetQDQTestCaseFn BuildQDQConcatTestCaseUnsupportedInputScaleZp() { + return [](ModelTestBuilder& builder) { + const std::vector> input_shapes = { + {1, 
6, 36}, + {1, 6, 8}, + {1, 6, 2}, + }; + int64_t axis = 2; + + std::vector input_args; + std::vector q_input_args; + + // set unmatched input scales/zp for test purpose + input_args.push_back(builder.MakeInput(input_shapes[0], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.05f, 128)); + input_args.push_back(builder.MakeInput(input_shapes[1], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.04f, 127)); + input_args.push_back(builder.MakeInput(input_shapes[2], -1.f, 1.f)); + q_input_args.push_back(AddQDQNodePair(builder, input_args.back(), 0.03f, 126)); + + auto* concat_output = builder.MakeIntermediate(); + Node& concat_node = builder.AddNode("Concat", q_input_args, {concat_output}); + concat_node.AddAttribute("axis", axis); + + auto* q_concat_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(concat_output, 0.05f, 128, q_concat_output); + auto* output_arg = builder.MakeOutput(); + builder.AddDequantizeLinearNode(q_concat_output, 0.05f, 128, output_arg); + }; +} + } // namespace test } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index affa5baf9d1d3..2ee6abcb548f2 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -256,8 +256,9 @@ GetQDQTestCaseFn BuildQDQTransposeTestCase( } template -GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape, const int64_t& axis = -1) { - return [input_shape, axis](ModelTestBuilder& builder) { +GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape, const int64_t& axis, + float output_scales, OutputType output_zero_point) { + return [input_shape, axis, output_scales, output_zero_point](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput(input_shape, std::numeric_limits::min(), std::numeric_limits::max()); @@ -275,7 +276,7 @@ GetQDQTestCaseFn BuildQDQSoftMaxTestCase(const std::vector& input_shape softmax_node.AddAttribute("axis", axis); // add Q - builder.AddQuantizeLinearNode(softmax_output, 1.f / 256, 0, output_arg); + builder.AddQuantizeLinearNode(softmax_output, output_scales, output_zero_point, output_arg); }; } @@ -288,5 +289,7 @@ GetQDQTestCaseFn BuildQDQConcatTestCase(const std::vector>& bool has_input_int8 = false, bool has_output_int8 = false); +GetQDQTestCaseFn BuildQDQConcatTestCaseUnsupportedInputScaleZp(); + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index 0ee91ee05f763..2a7a04c122655 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -147,6 +147,7 @@ Status PerformanceRunner::Run() { << "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n" // Time between start and end of run. Less than Total time cost when running requests in parallel. 
<< "Total inference run time: " << inference_duration.count() << " s\n" + << "Number of inferences per second: " << performance_result_.time_costs.size() / inference_duration.count() << " \n" << "Avg CPU usage: " << performance_result_.average_CPU_usage << " %\n" << "Peak working set size: " << performance_result_.peak_workingset_size << " bytes" << std::endl; @@ -188,7 +189,9 @@ Status PerformanceRunner::RunParallelDuration() { count++; counter++; tpool->Schedule([this, &counter, &m, &cv]() { - session_->ThreadSafeRun(); + auto status = RunOneIteration(); + if (!status.IsOK()) + std::cerr << status.ErrorMessage(); // Simplified version of Eigen::Barrier std::lock_guard lg(m); counter--; diff --git a/onnxruntime/test/providers/cpu/controlflow/if_test.cc b/onnxruntime/test/providers/cpu/controlflow/if_test.cc index 5facccbc1e4e7..0b41549d4e320 100644 --- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc @@ -84,7 +84,7 @@ class IfOpTester : public OpTester { *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches *split_attribute->Add() = 2; - split_node.AddAttribute("split", attr_proto); + split_node.AddAttributeProto(std::move(attr_proto)); } } @@ -382,7 +382,7 @@ class IfOpTesterOnlyConstantNodesInConditionalBranches : public OpTester { then_constant_attr_tensor_proto->add_dims(1); then_constant_attr_tensor_proto->add_float_data(value); // Constant value of 10.f - then_constant_node.AddAttribute("value", then_constant_attr_proto); + then_constant_node.AddAttributeProto(std::move(then_constant_attr_proto)); auto status_then = graph_then.Resolve(); EXPECT_EQ(status_then, Status::OK()); diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index 7628c3454a7b9..c64c9a87a4237 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -802,7 +802,7 @@ TEST(Loop, Opset11WithNoVariadicInputsAndOutputs) { constant_attribute_tensor_proto->set_data_type(TensorProto_DataType_FLOAT); // float scalar *constant_attribute_tensor_proto->mutable_float_data()->Add() = 1.0f; // float scalar with value 1.0f - constant_node.AddAttribute("value", attr_proto); + constant_node.AddAttributeProto(std::move(attr_proto)); } graph.SetInputs({&iter_num_in, &cond_in}); diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc index 5bdac54702c60..dd485636a0ddd 100644 --- a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -4,7 +4,10 @@ #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) #include "core/common/logging/logging.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h" +#include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h" +#include "core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h" #include "core/session/inference_session.h" +#include "core/framework/tensorprotoutils.h" #include "test/common/tensor_op_test_utils.h" #include "test/framework/test_utils.h" #include "test/util/include/asserts.h" @@ -271,9 +274,10 @@ TEST(NnapiExecutionProviderTest, TestNoShapeInputModel) { << "No node should be taken by the NNAPI EP"; } -static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, - const char* test_description, - const 
EPVerificationParams& params = EPVerificationParams()) { +static void RunQDQModelTest( + const GetQDQTestCaseFn& build_test_case, + const char* test_description, + const EPVerificationParams& params = EPVerificationParams()) { onnxruntime::Model model(test_description, false, DefaultLoggingManager().DefaultLogger()); Graph& graph = model.MainGraph(); ModelTestBuilder helper(graph); @@ -290,15 +294,22 @@ static void RunQDQModelTest(const GetQDQTestCaseFn& build_test_case, std::make_unique(0), helper.feeds_, params); #else - ORT_UNUSED_PARAMETER(params); // test load only SessionOptions so; InferenceSessionWrapper session_object{so, GetEnvironment()}; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::make_unique(0))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); ASSERT_STATUS_OK(session_object.Initialize()); - ASSERT_GT(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) - << "Some nodes should have been taken by the NNAPI EP"; + if (params.ep_node_assignment == ExpectedEPNodeAssignment::None) { + ASSERT_EQ(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) + << "No node should have been taken by the NNAPI EP"; + } else if (params.ep_node_assignment == ExpectedEPNodeAssignment::All) { + ASSERT_EQ(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), session_object.GetGraph().NumberOfNodes()) + << "All nodes should have been taken by the NNAPI EP"; + } else { + ASSERT_GT(CountAssignedNodes(session_object.GetGraph(), kNnapiExecutionProvider), 0) + << "Some nodes should have been taken by the NNAPI EP"; + } #endif } @@ -310,7 +321,7 @@ TEST(NnapiExecutionProviderTest, TestQDQConv) { {1, 1, 5, 5} /* input_shape */, {1, 1, 3, 3} /* weights_shape */), "nnapi_qdq_test_graph_conv", - {true /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQResize) { @@ -326,7 +337,14 @@ TEST(NnapiExecutionProviderTest, TestQDQResize) { "linear" /* mode */, "asymmetric" /* coordinate_transformation_mode */), "nnapi_qdq_test_graph_resize", - {false /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::Some}); +} + +TEST(NnapiExecutionProviderTest, TestQDQResize_UnsupportedDefaultSetting) { + RunQDQModelTest(BuildQDQResizeTestCase({1, 3, 64, 64} /* input_shape */, + {1, 3, 32, 32} /* sizes_data */), + "nnapi_qdq_test_graph_resize_unsupported", + {ExpectedEPNodeAssignment::None}); } TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { @@ -336,7 +354,7 @@ TEST(NnapiExecutionProviderTest, TestQDQAveragePool) { {1, 3, 32, 32} /* input_shape */), "nnapi_qdq_test_graph_averagepool", { - true /* verify_entire_graph_use_ep */, + ExpectedEPNodeAssignment::All, 1e-2f /* fp32_abs_err */, }); } @@ -348,7 +366,7 @@ TEST(NnapiExecutionProviderTest, TestQDQAdd) { {1, 23, 13, 13} /* input_shape */, "Add" /* op_type */), "nnapi_qdq_test_graph_add", - {true /* verify_entire_graph_use_ep */}); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQMul) { @@ -360,8 +378,8 @@ TEST(NnapiExecutionProviderTest, TestQDQMul) { "Mul" /* op_type */), "nnapi_qdq_test_graph_mul", { - true /* verify_entire_graph_use_ep */, - 1e-2f /* fp32_abs_err */, + ExpectedEPNodeAssignment::All, + 1e-2f /* fp32_abs_err */ }); } @@ -371,28 +389,36 @@ TEST(NnapiExecutionProviderTest, TestQDQTranspose) { {1, 3, 32, 32} /* input_shape */, {0, 3, 1, 2} /* perms */), "nnapi_qdq_test_graph_transpose", - { - true /* verify_entire_graph_use_ep */ - }); 
+ {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQReshape) { RunQDQModelTest(BuildQDQReshapeTestCase({1, 3, 64, 64} /* input_shape */, {1, 64, 64, 3} /* reshape_shape */), "nnapi_qdq_test_graph_reshape", - { - true /* verify_entire_graph_use_ep */ - }); + {ExpectedEPNodeAssignment::All}); } TEST(NnapiExecutionProviderTest, TestQDQSoftMax) { RunQDQModelTest(BuildQDQSoftMaxTestCase( {1, 32} /* input_shape */, - static_cast(1) /* axis */), + static_cast(1) /* axis */, + 1.f / 256 /* output_scales */, + 0 /* output_zp */), "nnapi_qdq_test_graph_softmax", - { - true /* verify_entire_graph_use_ep */ - }); + {ExpectedEPNodeAssignment::All}); +} + +// This is to verify when Nnapi required scale and zero point are not satisfied +// the model can work as expected. (no nodes should be handled by Nnapi) +TEST(NnapiExecutionProviderTest, TestQDQSoftMax_UnsupportedOutputScaleAndZp) { + RunQDQModelTest(BuildQDQSoftMaxTestCase( + {1, 32} /* input_shape */, + static_cast(1) /* axis */, + 0.002f /* output_scales */, + 1 /* output_zp */), + "nnapi_qdq_test_graph_softmax_unsupported", + {ExpectedEPNodeAssignment::None}); } TEST(NnapiExecutionProviderTest, TestQDQConcat) { @@ -403,11 +429,26 @@ TEST(NnapiExecutionProviderTest, TestQDQConcat) { {1, 6, 2}, } /* input_shapes */, 2 /* axis */), - "nnapi_qdq_test_graph_concat", { - true /* verify_entire_graph_use_ep */ - }); + "nnapi_qdq_test_graph_concat", + {ExpectedEPNodeAssignment::All}); } +#if defined(__ANDROID__) +TEST(NnapiExecutionProviderTest, TestQDQConcat_UnsupportedInputScalesAndZp) { + // This is to verify all the inputs have the same scale and zp as input 0 for API 28- + // Currently, this test can only be run locally with a android emulator with API < 29 + // See https://developer.android.com/studio/run/emulator-commandline for some info on + // starting a testing android emulator in command line. (Run an android build with emulator started) + // TODO: consider to configure this and enable it to run in Android CI. + const auto* nnapi = NnApiImplementation(); + if (nnapi->nnapi_runtime_feature_level < ANEURALNETWORKS_FEATURE_LEVEL_3) { + RunQDQModelTest(BuildQDQConcatTestCaseUnsupportedInputScaleZp(), + "nnapi_qdq_test_graph_concat_unsupported", + {ExpectedEPNodeAssignment::None}); + } +} +#endif + #endif // !(ORT_MINIMAL_BUILD) TEST(NnapiExecutionProviderTest, NNAPIFlagsTest) { diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index 74bae67c39b1e..37d957f60c94a 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -317,7 +317,6 @@ class OpTester { AddData(input_data_, name, dims_var, p_values, size, is_initializer, false, dim_params); } - template void AddInput(const char* name, std::initializer_list dims, const TensorShapeVector& values, bool is_initializer = false, const std::vector* dim_params = nullptr) { @@ -500,7 +499,6 @@ class OpTester { values ? 
values->size() : 0, is_initializer, false, dim_params, 0.0f, 0.0f, true); } - template void AddOptionalTypeTensorOutput(const char* name, const DimsVariant& dims, const std::initializer_list* expected_values = nullptr, @@ -520,7 +518,6 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error, true); } - template void AddOptionalTypeSeqInput(const char* name, const SeqTensors* seq_tensors) { @@ -546,12 +543,12 @@ class OpTester { } /* - * Use this API to add an input *edge* to the node/op being tested that won't - * have any data passed into. - * Such an edge will have the qualifier OpSchema::Optional in the schema. - * This is exposed to ensure the op kernel implementations can be tested to handle - * presence/absence of such optional input edges. - */ + * Use this API to add an input *edge* to the node/op being tested that won't + * have any data passed into. + * Such an edge will have the qualifier OpSchema::Optional in the schema. + * This is exposed to ensure the op kernel implementations can be tested to handle + * presence/absence of such optional input edges. + */ template void AddOptionalInputEdge() { std::string name; // empty == input doesn't exist @@ -575,7 +572,7 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error); } - template + template void AddOutput(const char* name, std::initializer_list dims, const T* p_values, const size_t size, bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { const DimsVariant dims_var = std::vector(dims); @@ -583,7 +580,6 @@ class OpTester { sort_output, nullptr /* dim_params */, rel_error, abs_error); } - template void AddOutput(const char* name, const DimsVariant& dims, std::initializer_list expected_values, bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { @@ -712,12 +708,12 @@ class OpTester { #endif /* - * Use this API to add an output *edge* to the node/op being tested that shouldn't have any - * data produced into. - * Such an edge will have the qualifier OpSchema::Optional in the schema. - * This is exposed to ensure the op kernel implementations can be tested to handle - * presence/absence of such optional output edges. - */ + * Use this API to add an output *edge* to the node/op being tested that shouldn't have any + * data produced into. + * Such an edge will have the qualifier OpSchema::Optional in the schema. + * This is exposed to ensure the op kernel implementations can be tested to handle + * presence/absence of such optional output edges. 
+ */ template void AddOptionalOutputEdge() { std::string name; // empty == output doesn't exist @@ -786,6 +782,12 @@ class OpTester { custom_output_verifier_ = custom_output_verifier; } + void AddAttributeProto(ONNX_NAMESPACE::AttributeProto attr) { + add_attribute_funcs_.emplace_back([attr = std::move(attr)](onnxruntime::Node& node) { + node.AddAttributeProto(attr); + }); + } + template void AddAttribute(std::string name, T value) { // Generate a the proper AddAttribute call for later diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index fefd119f0c3da..34206ebbe7c87 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -36,7 +36,7 @@ void VerifyOutputs(const std::vector& fetches, const std::vector& fetches, const std::vectorsecond; + auto ranges = it->second; for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { if (it2->first == 1) { ASSERT_EQ(it2->second.first, 3); @@ -439,7 +439,7 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { // check min/max shape ranges of dynamic shape dimensions for(auto it = shape_ranges2.cbegin(); it != shape_ranges2.cend(); ++it) { - auto ranges = it->second; + auto ranges = it->second; for (auto it2 = ranges.cbegin(); it2 != ranges.cend(); ++it2) { if (it2->first == 1) { ASSERT_EQ(it2->second.first, 1); @@ -470,7 +470,7 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { * We have following test parameters: * - engine_static: engine cache enabled with non-dynamic input shape * - engine_dynamic: engine cache enabled with dynamic input shape - * - timing_static: will be added + * - timing_static: will be added * - timing_dynamic: will be added */ INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static", @@ -591,11 +591,7 @@ TEST(TensorrtExecutionProviderTest, NodeIndexMappingTest) { auto& output_arg_1 = graph.GetOrCreateNodeArg("node_1_out", &uint8_tensor); outputs.push_back(&output_arg_1); auto& cast_node = graph.AddNode("cast1", "Cast", "node 1.", inputs, outputs); - AttributeProto attr_proto; - attr_proto.set_name("to"); - attr_proto.set_type(AttributeProto_AttributeType_INT); - attr_proto.set_i(2); - cast_node.AddAttribute("to", attr_proto); + cast_node.AddAttribute("to", int64_t{2}); inputs.clear(); inputs.push_back(&output_arg_1); @@ -603,11 +599,7 @@ TEST(TensorrtExecutionProviderTest, NodeIndexMappingTest) { outputs.clear(); outputs.push_back(&output_arg_2); auto& cast_node_2 = graph.AddNode("cast2", "Cast", "node 2.", inputs, outputs); - AttributeProto attr_proto_2; - attr_proto_2.set_name("to"); - attr_proto_2.set_type(AttributeProto_AttributeType_INT); - attr_proto_2.set_i(9); - cast_node_2.AddAttribute("to", attr_proto_2); + cast_node_2.AddAttribute("to", int64_t{9}); auto& input_arg_2 = graph.GetOrCreateNodeArg("Y", &float_tensor); auto& input_arg_3 = graph.GetOrCreateNodeArg("Z", &float_tensor); diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e6e982aff1f5c..858f18c97e4da 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1068,6 +1068,33 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self): so2.log_severity_level = 1 onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=so2, providers=onnxrt.get_available_providers()) + def 
testMemoryArenaShrinkage(self): + if platform.architecture()[0] == '32bit' or 'ppc' in platform.machine() or 'powerpc' in platform.machine(): + # on x86 or ppc builds, the CPU allocator does not use an arena + print("Skipping testMemoryArenaShrinkage in 32bit or powerpc platform.") + else: + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + + sess1 = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=['CPUExecutionProvider']) + input_name = sess1.get_inputs()[0].name + + # Shrink CPU memory after execution + ro1 = onnxrt.RunOptions() + ro1.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu:0") + self.assertEqual(ro1.get_run_config_entry("memory.enable_memory_arena_shrinkage"), "cpu:0") + sess1.run([], {input_name: x}, ro1) + + available_providers = onnxrt.get_available_providers() + if 'CUDAExecutionProvider' in available_providers: + sess2 = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=available_providers) + input_name = sess2.get_inputs()[0].name + + # Shrink CPU and GPU memory after execution + ro2 = onnxrt.RunOptions() + ro2.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu:0;gpu:0") + self.assertEqual(ro2.get_run_config_entry("memory.enable_memory_arena_shrinkage"), "cpu:0;gpu:0") + sess2.run([], {input_name: x}, ro2) + def testCheckAndNormalizeProviderArgs(self): from onnxruntime.capi.onnxruntime_inference_collection import check_and_normalize_provider_args diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index 263dead104dc3..5abd6fdcffbde 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -25,21 +25,13 @@ def unique_element(lst): class TestSymbolicShapeInference(unittest.TestCase): def test_symbolic_shape_infer(self): - # skip these tests before this issue is fixed: - # https://github.com/microsoft/onnxruntime/issues/10761 - test_skip_due_to_onnx_1_11_shape_inference_change = ["GPT2", "GPT2_LM_HEAD", "test_GPT2"] - + cwd = os.getcwd() test_model_dir = os.path.join(cwd, '..', 'models') for filename in Path(test_model_dir).rglob('*.onnx'): if filename.name.startswith('.'): continue # skip some bad model files - if len(filename.parts) > 1 and \ - filename.parts[len(filename.parts) - 2] in test_skip_due_to_onnx_1_11_shape_inference_change: - print("Skip symbolic shape inference on : " + str(filename)) - continue - print("Running symbolic shape inference on : " + str(filename)) SymbolicShapeInference.infer_shapes(in_mp=onnx.load(str(filename)), auto_merge=True, diff --git a/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx b/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx index e9568e381d21b..debd5244abce5 100644 Binary files a/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx and b/onnxruntime/test/python/transformers/test_data/models/gpt2_megatron_opt.onnx differ diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h index 50859d826fa2d..e1e2e8a542325 100644 --- a/onnxruntime/test/util/include/test_utils.h +++ b/onnxruntime/test/util/include/test_utils.h @@ -15,11 +15,17 @@ class Graph; namespace test { +// If set to All: verify the entire graph is taken by ep +// If set to Some: verify that at least one node is assigned to ep +// If set to None: verify that no nodes 
are assigned to ep (typically for an expected failure path test case) +enum class ExpectedEPNodeAssignment { None, + Some, + All, }; + // struct to hold some verification params for RunAndVerifyOutputsWithEP struct EPVerificationParams { - // Verify the entire graph is taken by the EP - // if this is set to false, then will verify that at least one node is assigned to 'execution_provider' - bool verify_entire_graph_use_ep{false}; + + ExpectedEPNodeAssignment ep_node_assignment = ExpectedEPNodeAssignment::Some; // Some EP may use different rounding than ORT CPU EP, which may cause a bigger abs error than // the default of 1e-5f, especially for scenarios such as [Q -> Quantized op -> DQ] diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc index b069b08810cb8..ac7996581225f 100644 --- a/onnxruntime/test/util/test_utils.cc +++ b/onnxruntime/test/util/test_utils.cc @@ -123,9 +123,12 @@ void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id // make sure that some nodes are assigned to the EP, otherwise this test is pointless... const auto& graph2 = session_object2.GetGraph(); auto ep_nodes = CountAssignedNodes(graph2, provider_type); - if (params.verify_entire_graph_use_ep) { + if (params.ep_node_assignment == ExpectedEPNodeAssignment::All) { // Verify the entire graph is assigned to the EP ASSERT_EQ(ep_nodes, graph2.NumberOfNodes()) << "Not all nodes were assigned to " << provider_type; + } else if (params.ep_node_assignment == ExpectedEPNodeAssignment::None) { + // Check that the expected failure path is correctly handled by the EP. (only used in NNAPI EP QDQ model test case for now) + ASSERT_EQ(ep_nodes, 0) << "No nodes are supposed to be assigned to " << provider_type; } else { ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type; } diff --git a/orttraining/orttraining/eager/opgen/opgen/atenops.py b/orttraining/orttraining/eager/opgen/opgen/atenops.py index 8bb882571bdbb..1d80f2a48b9ae 100644 --- a/orttraining/orttraining/eager/opgen/opgen/atenops.py +++ b/orttraining/orttraining/eager/opgen/opgen/atenops.py @@ -8,6 +8,11 @@ from opgen.onnxops import * +import torch +from packaging import version + +TORCH_API_CHANGE_VERSION = "1.11.1" + kMSDomain = 'onnxruntime::kMSDomain' class ReluGrad(ONNXOp): @@ -79,7 +84,6 @@ def __init__(self, dY, X): 'aten::softshrink': Shrink('self', bias='lambd', lambd='lambd'), #yes, bias is set to 'lambd' 'aten::hardshrink': Shrink('self', bias=0, lambd='lambd'), 'aten::gelu' : Gelu('self'), - 'aten::gelu_backward' : GeluGrad('grad', 'self'), 'aten::max' : ReduceMax('self', keepdims=1), 'aten::min' : ReduceMin('self', keepdims=1), 'aten::_cat': Concat('tensors', 'dim'), @@ -95,6 +99,13 @@ def __init__(self, dY, X): 'aten::gt.Scalar_out' : MakeTorchFallback(), } +# The signature of gelu_backward was changed in commit 983ba5e585485ed61a0c0012ef6944f5685e3d97 (PR 61439). +# This is done to make sure it is backward and forward compatible if version.parse(torch.__version__) < version.parse(TORCH_API_CHANGE_VERSION): + hand_implemented['aten::gelu_backward'] = GeluGrad('grad', 'self') +else: + hand_implemented['aten::gelu_backward'] = GeluGrad('grad_output', 'self') + ops = {**ops, **hand_implemented} # TODO: this is a temporary allowlist for ops need type promotion # Need to enhance the support for onnx type constrains to automatically diff --git a/orttraining/orttraining/eager/ort_eager_common.h b/orttraining/orttraining/eager/ort_eager_common.h index 3de3c2d1b8266..e7f54b8d33c68 100644 ---
a/orttraining/orttraining/eager/ort_eager_common.h +++ b/orttraining/orttraining/eager/ort_eager_common.h @@ -4,4 +4,5 @@ #pragma once // include the pybind header first, it will disable linking to pythonX_d.lib on Windows in debug mode #include "python/onnxruntime_pybind_state_common.h" -#include \ No newline at end of file +#include +#include \ No newline at end of file diff --git a/orttraining/orttraining/eager/test/ort_eps_test.py b/orttraining/orttraining/eager/test/ort_eps_test.py index 9122b09b2134f..9a5c8ba32b914 100644 --- a/orttraining/orttraining/eager/test/ort_eps_test.py +++ b/orttraining/orttraining/eager/test/ort_eps_test.py @@ -120,13 +120,12 @@ def test_import_custom_eps(self): ort_device = torch_ort.device(1) assert 'My EP provider created, with device id: 0, some_option: val' in out.capturedtext - #disable the print test for now as we need to merge a PR to pytorch first. - #def test_print(self): - # x = torch.ones(1, 2) - # ort_x = x.to('ort') - # with OutputGrabber() as out: - # print(ort_x) - # assert "tensor([[1., 1.]], device='ort:0')" in out.capturedtext + def test_print(self): + x = torch.ones(1, 2) + ort_x = x.to('ort') + with OutputGrabber() as out: + print(ort_x) + assert "tensor([[1., 1.]], device='ort:0')" in out.capturedtext if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp b/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp index 30849d15768bc..9d9ac0cbb5682 100644 --- a/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp +++ b/orttraining/orttraining/models/mnist/mnist_reader/mnist_reader_common.hpp @@ -38,8 +38,7 @@ inline std::unique_ptr read_mnist_file(const std::string& path, uint32_t file.open(path, std::ios::in | std::ios::binary | std::ios::ate); if (!file) { - std::cout << "Error opening file " << path << std::endl; - std::cout << std::system_error(errno, std::system_category(), "failed to open " + path).what(); + std::cout << "Error opening file " << path << " - system error " << errno << std::endl; return {}; } diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index a721fc64a2615..cd3cd1e66c3f6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -257,22 +257,28 @@ def permute_and_reshape_tensor(g, tensor, is_lhs, rank, perm, matmul_output_axes remaining_axes = [axis for axis in range(rank) if axis not in axes_to_remove] # Calculate the new shape, use 0 or -1 if possible. 
shape_tensors = [] - all_zeros = True + before_contiguous_axes = True + last_zero_dim = -1 + has_neg_one_dim = False for axis in remaining_axes: if axis == first_matmul_output_axis: shape_tensors.append(matmul_output_numel_tensor) - all_zeros = False + before_contiguous_axes = False elif axis == first_contraction_axis: shape_tensors.append(contraction_numel_tensor) - all_zeros = False - elif all_zeros: + before_contiguous_axes = False + elif before_contiguous_axes: shape_tensors.append(g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))) + last_zero_dim = len(shape_tensors) - 1 elif axis == remaining_axes[-1]: shape_tensors.append(g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + has_neg_one_dim = True else: single_axis_shape_tensor, _, shape_tensor = get_shape_tensor_by_axes( g, tensor, shape_tensor, [axis], False) shape_tensors.append(single_axis_shape_tensor) + if not has_neg_one_dim and last_zero_dim >= 0: + shape_tensors[last_zero_dim] = g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) # Adjust the perm. perm = [axis for axis in perm if axis not in axes_to_remove] new_axis = 0 @@ -458,16 +464,22 @@ def einsum(g, equation, tensor_list): # Need to Reshape the result for the example, the new shape is [size(s), size(m)]. if len(lhs_matmul_output_axes) != 1 or len(rhs_matmul_output_axes) != 1: shape_tensors = [g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))] * len(batched_axes) + last_zero_dim = len(shape_tensors) - 1 + has_neg_one_dim = False if lhs_matmul_output_axes: if len(lhs_matmul_output_axes) == 1: shape_tensors.append(g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64))) + last_zero_dim = len(shape_tensors) - 1 else: shape_tensors.append(lhs_matmul_output_shape_tensor) if rhs_matmul_output_axes: if len(rhs_matmul_output_axes) == 1: shape_tensors.append(g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + has_neg_one_dim = True else: shape_tensors.append(rhs_matmul_output_shape_tensor) + if not has_neg_one_dim and last_zero_dim >= 0: + shape_tensors[last_zero_dim] = g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) result = reshape_tensor(g, result, shape_tensors) # Now output axes is ordered by [batched_axes, lhs_matmul_output_axes, rhs_matmut_output_axes], diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 7a1f0a07bc1f1..359f09114da70 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -311,6 +311,10 @@ def _export_model(self, *inputs, **kwargs): # Model is not re-exported when the model parameters change. This can happen when the model is a stateful model, # or the user explicitly changed model parameters after the onnx export. + # Record random states here and restore later in case any of them gets changed during the export, + # e.g., some sympy functions in symbolic_shape_infer will change Python's random state. 
+ random_states = _utils.get_random_states() + schema = _io._extract_schema( {'args': copy.copy(inputs), 'kwargs': copy.copy(kwargs)}) if self._onnx_models.exported_model and schema == self._input_info.schema and not self._original_model_has_changed: @@ -329,6 +333,9 @@ def _export_model(self, *inputs, **kwargs): self._onnx_models.exported_model = SymbolicShapeInference.infer_shapes(self._onnx_models.exported_model, auto_merge=True, guess_output_rank=True) + # Restore the recorded random states + _utils.set_random_states(random_states) + return True def _get_exported_model(self, input_schema, *inputs, **kwargs): diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index eb8d1ec3a9328..b617d574460b9 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -16,7 +16,6 @@ from ._fallback import (ORTModuleFallbackException, _FallbackPolicy, _FallbackManager) -from .torch_cpp_extensions.cpu.torch_interop_utils import clear_all_grad_fns from onnxruntime.capi import _pybind_state as C from onnxruntime.capi.onnxruntime_inference_collection import get_ort_device_type @@ -40,10 +39,6 @@ def __init__(self, model, debug_options: DebugOptions, fallback_manager: _Fallba def execution_session_run_forward(execution_session, onnx_model, device, gradient_accumulation_manager, *inputs): """Runs the forward graph on execution_session with given model inputs and device""" - # Clear all gradient functions, to avoid a deadlock issue. - # Check the called function for more detailed comments. - clear_all_grad_fns() - # TODO: Try to reuse the output buffers as some of the output tensors are same sizes, # especially the backward graph outputs. # REVIEW(codemzs): Consolidate Training Agent with InferenceAgent on C++ side to not diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index 23dfea316a3eb..534efd2804ba8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -25,6 +25,24 @@ import types import warnings from distutils.version import LooseVersion +import random +import numpy as np + +def get_random_states(): + r_state = random.getstate() + np_state = np.random.get_state() + torch_state = torch.get_rng_state() + torch_cuda_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + return r_state, np_state, torch_state, torch_cuda_state + +def set_random_states(states): + r_state, np_state, torch_state, torch_cuda_state = states + random.setstate(r_state) + np.random.set_state(np_state) + torch.set_rng_state(torch_state) + if torch_cuda_state is not None: + torch.cuda.set_rng_state(torch_cuda_state) + def _ortvalue_from_torch_tensor(torch_tensor): # TODO: Current DLPack doesn't support bool and PyTorch disables converting bool tensor to DLPack in recent commit. 
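The two helpers added to _utils.py above exist so that exporting the model does not perturb user-visible randomness: snapshot every RNG the export path may touch, run the export, then restore. A minimal sketch of that pattern (illustrative only, not part of this patch; perturb_rng is a hypothetical stand-in for the export step):

import random

import numpy as np
import torch

def perturb_rng():
    # Stand-in for an export step that consumes entropy from the global RNGs,
    # e.g. sympy calls made during symbolic shape inference.
    random.random()
    np.random.rand()
    torch.rand(1)

# Snapshot the three RNG states (the CUDA state is handled the same way when available).
states = (random.getstate(), np.random.get_state(), torch.get_rng_state())

perturb_rng()

# Restore, so random numbers drawn by user code afterwards are unaffected by the export.
random.setstate(states[0])
np.random.set_state(states[1])
torch.set_rng_state(states[2])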
diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc index bc930899f40d6..a8445bf64f99d 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/torch_interop_utils.cc @@ -114,16 +114,27 @@ void unregister_grad_fn(size_t ctx_address) PyNodeSharedPointerPool::GetInstance().UnRegisterGradFunc(ctx_address); } -// Supposed to be cleared on python program exit or before every forward run to resolve following issues: -// 1. When training program exits, PyNodeSharedPointerPool destructor is called, if grad_fns_ is not empty, +// Supposed to be cleared on python program exit to resolve the following issue: +// When the training program exits, the PyNodeSharedPointerPool destructor is called; if grad_fns_ is not empty, // PyNode::release_variables() will be called. // (https://github.com/pytorch/pytorch/blob/15532595209d2daf34d35e10f8d3d3b64966aea2/torch/csrc/autograd/python_function.cpp#L168) // The other hand, there is known issue when acquiring GIL in pybind11 destructors, there will be probabbly deadlock issue. // (https://github.com/pybind/pybind11/issues/1446) // The resolution here, we remove all maintained states before program exits. -// 2. When forward functions is called repeated without corresponding backward calls, grad functions keeps accumulating without releasing -// (happening in backward) -void clear_all_grad_fns(){ + +// A known existing issue: when forward functions are called repeatedly without corresponding backward calls, +// grad functions keep accumulating without being released, so there might be memory leaks (bound to those gradient functions). +// Ideally this usually won't happen in real training cases, so it should be fine. + +// We CANNOT explicitly clear grad functions before each forward pass to mitigate the known issue above. +// For example: +// loss1 = forward_run(inputs1) +// loss2 = forward_run(inputs2) +// loss = loss1 + loss2 +// loss.backward() +// If we clear grad functions at the beginning of the second `forward_run`, when `loss.backward()` runs, +// the backward path of `loss1` will fail to run PythonOpGrad ops (if there are any).
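The pattern the comment above protects is plain PyTorch usage in which several forward passes feed a single backward pass, so autograd state created by the first forward must still be alive when backward() runs. A minimal sketch with no ORTModule involved (illustrative only, not part of this patch):

import torch

model = torch.nn.Linear(4, 1)

# Two forward runs before a single backward run.
loss1 = model(torch.randn(2, 4)).sum()
loss2 = model(torch.randn(2, 4)).sum()

# backward() walks both graphs; clearing the saved grad functions between the
# two forward runs would break the backward path of loss1.
(loss1 + loss2).backward()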
+void clear_all_grad_fns() { PyNodeSharedPointerPool::GetInstance().ClearAll(); } diff --git a/orttraining/orttraining/test/gradient/allreduce_op_test.cc b/orttraining/orttraining/test/gradient/allreduce_op_test.cc index 1528ac6f5c705..9378c751a32cb 100644 --- a/orttraining/orttraining/test/gradient/allreduce_op_test.cc +++ b/orttraining/orttraining/test/gradient/allreduce_op_test.cc @@ -43,7 +43,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestReduceTwoTensors) { allreduce_test.AddOutput("G_new1", {3}, output_grad); allreduce_test.AddOutput("G_new2", {3}, output_grad); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -86,7 +86,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestReduceTwoTensorsFP16) { allreduce_test.AddOutput("G_new1", {3}, output_grad_half); allreduce_test.AddOutput("G_new2", {3}, output_grad_half); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -112,7 +112,7 @@ TEST(AllreduceTest, CPUAdasumAllreduceTestFailTensorCountMismatch) { allreduce_test.AddOutput("G_new1", {3}, {5.6301f, 6.5235f, 7.4169f}); allreduce_test.AddOutput("G_new2", {3}, {5.6301f, 6.5235f, 7.4169f}); - allreduce_test.AddAttribute("reduce_algo", static_cast(0)); + allreduce_test.AddAttribute("reduce_algo", int64_t{0}); std::vector> providers; providers.push_back(DefaultCpuExecutionProvider()); @@ -224,18 +224,8 @@ void build_optimizer_node(Graph& graph, auto& optimizer_node = graph.AddNode(input_gradient->Name() + "_adam_optimizer", "AdamOptimizer", "Adam optimizer.", optimizer_inputs, optimizer_outputs, nullptr /*attributes*/, kMSDomain); - ONNX_NAMESPACE::AttributeProto bias_correction_attribute, weight_decay_mode_attribute; - - bias_correction_attribute.set_name("do_bias_correction"); - bias_correction_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - bias_correction_attribute.set_i(0); - - weight_decay_mode_attribute.set_name("weight_decay_mode"); - weight_decay_mode_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - weight_decay_mode_attribute.set_i(0); - - optimizer_node.AddAttribute("do_bias_correction", bias_correction_attribute); - optimizer_node.AddAttribute("weight_decay_mode", weight_decay_mode_attribute); + optimizer_node.AddAttribute("do_bias_correction", int64_t{0}); + optimizer_node.AddAttribute("weight_decay_mode", int64_t{0}); } using AllreduceGraphConfigVector = std::vector(config[i]) + "_scaled_grad", "MixedPrecisionScale", "scale grad", scale_grad_inputs, {&scale_grad_output_arg}, nullptr /*attributes*/, kMSDomain); - ONNX_NAMESPACE::AttributeProto scale_attribute; - scale_attribute.set_name("to"); - scale_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - scale_attribute.set_i(static_cast(element_type)); - scaled_grad_node.AddAttribute("to", scale_attribute); + scaled_grad_node.AddAttribute("to", int64_t{element_type}); } // Set inputs of next node to be outputs of scale node. 
inputs.clear(); @@ -359,19 +340,9 @@ void build_allreduce_graph(Graph& graph, AllreduceGraphConfigVector& config, auto& allreduce_node = graph.AddNode("node_allreduce", allreduce_op_name, "node allreduce.", inputs, allreduce_outputs, nullptr /*attributes*/, kMSDomain); if (adasum_reduce_type != training::AdasumReductionType::None) { - // Attribute - ONNX_NAMESPACE::AttributeProto adasum_reduction_type_attribute; - adasum_reduction_type_attribute.set_name("reduce_algo"); - adasum_reduction_type_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - adasum_reduction_type_attribute.set_i(static_cast(adasum_reduce_type)); - allreduce_node.AddAttribute("reduce_algo", adasum_reduction_type_attribute); + allreduce_node.AddAttribute("reduce_algo", static_cast(adasum_reduce_type)); } else { - // Attribute - ONNX_NAMESPACE::AttributeProto group_type_attribute; - group_type_attribute.set_name("group_type"); - group_type_attribute.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); - group_type_attribute.set_i(0 /*data parallel*/); - allreduce_node.AddAttribute("group_type", group_type_attribute); + allreduce_node.AddAttribute("group_type", int64_t{0} /*data parallel*/); } if (build_optimizer) { diff --git a/orttraining/orttraining/test/gradient/gradient_checker.cc b/orttraining/orttraining/test/gradient/gradient_checker.cc index 3013f2686a9ea..417fc2861e36c 100644 --- a/orttraining/orttraining/test/gradient/gradient_checker.cc +++ b/orttraining/orttraining/test/gradient/gradient_checker.cc @@ -263,7 +263,7 @@ inline Status GradientChecker::InitOpTesterWithGraph( } // Currently only allows setting int attributes to zero. TODO: Expand this for (auto attr : attributes) { - op_session.AddAttribute(attr.name(), attr); + op_session.AddAttributeProto(attr); } // build graph diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index b190a219e604f..aea0ed2fef134 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -215,88 +215,84 @@ def assert_values_are_close(input, other, rtol=1e-05, atol=1e-06): def enable_custom_autograd_function(module): enable_custom_autograd_support() -def run_with_pytorch_on_device(device, model, input_list, label_input, is_eval_mode=False): - with torch.no_grad(): - model = copy.deepcopy(model).to(device) +def _run_model_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): if is_eval_mode: model.eval() else: model.train() - with torch.no_grad(): - inputs_on_device = [input_.to(device) for input_ in input_list] - for i, val in enumerate(input_list): - if val.requires_grad: - inputs_on_device[i].requires_grad_() - target = label_input.to(device) - - output = model(*inputs_on_device) - forward_outputs = [output] + def generate_inputs(input_list_, label_input_): + with torch.no_grad(): + inputs_on_device = [input_.to(device) for input_ in input_list_] + for i, val in enumerate(input_list_): + if val.requires_grad: + inputs_on_device[i].requires_grad_() + with torch.no_grad(): + target = label_input_.to(device) + return inputs_on_device, target + + inputs_on_device1, target1 = generate_inputs(input_list, label_input) + if run_forward_twice is True: + inputs_on_device2, target2 = generate_inputs(input_list, label_input) + + output1 = model(*inputs_on_device1) + if run_forward_twice is True: + output2 = 
model(*inputs_on_device2) + + forward_outputs = [output1] grad_outputs = [] if not is_eval_mode: criterion = torch.nn.MSELoss() - loss = criterion(output, target) + loss = criterion(output1, target1) + + if run_forward_twice is True: + loss += criterion(output2, target2) + loss.backward() for name, param in model.named_parameters(): if param.requires_grad: grad_outputs.append(param.grad) return forward_outputs, grad_outputs -def run_with_ort_on_device(device, model, input_list, label_input, is_eval_mode=False): +def run_with_pytorch_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): + with torch.no_grad(): + model = copy.deepcopy(model).to(device) + + return _run_model_on_device(device, model, input_list, label_input, is_eval_mode, run_forward_twice) + +def run_with_ort_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): with torch.no_grad(): model = copy.deepcopy(model) model.to(device) enable_custom_autograd_function(model) model = ORTModule(model) - if is_eval_mode: - model.eval() - else: - model.train() - - with torch.no_grad(): - inputs_on_device = [input_.to(device) for input_ in input_list] - for i, val in enumerate(input_list): - if val.requires_grad: - inputs_on_device[i].requires_grad_() - - target = label_input.to(device) - output = model(*inputs_on_device) - forward_outputs = [output] - grad_outputs = [] - if not is_eval_mode: - criterion = torch.nn.MSELoss() - loss = criterion(output, target) - loss.backward() - for name, param in model.named_parameters(): - if param.requires_grad: - grad_outputs.append(param.grad) - return forward_outputs, grad_outputs + return _run_model_on_device(device, model, input_list, label_input, is_eval_mode, run_forward_twice) def compare_tensor_list(val_list_a, val_list_b): for val_a, val_b in zip(val_list_a, val_list_b): assert_values_are_close(val_a, val_b, atol=1e-7, rtol=1e-6) def run_training_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, - ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): + run_forward_twice=False, ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): cpu = torch.device("cpu") def cpu_barrier_func(): pass run_training_test_on_device_and_compare( cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cpu_barrier_func, - ignore_grad_compare, expected_outputs, expected_grads) + run_forward_twice, ignore_grad_compare, expected_outputs, expected_grads) def cuda_barrier_func(): torch.cuda.synchronize() cuda = torch.device('cuda:0') run_training_test_on_device_and_compare( cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cuda_barrier_func, - ignore_grad_compare, expected_outputs, expected_grads) + run_forward_twice, ignore_grad_compare, expected_outputs, expected_grads) def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, barrier_func, - ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): + run_forward_twice=False, ignore_grad_compare=False, expected_outputs=[], expected_grads=[]): repeats = 16 for i in range(repeats): m = pt_model_builder_func() @@ -307,11 +303,11 @@ def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_mo x_ort = copy.deepcopy(x) outputs, grads = run_with_pytorch_on_device( - device, m, [x], pt_model_label_input) + device, m, [x], pt_model_label_input, run_forward_twice=run_forward_twice) 
barrier_func() outputs_ort, grads_ort = run_with_ort_on_device( - device, m_ort, [x_ort], pt_model_label_input) + device, m_ort, [x_ort], pt_model_label_input, run_forward_twice=run_forward_twice) barrier_func() val_list_a = [o.detach().cpu() for o in outputs if o is not None] @@ -330,14 +326,16 @@ def run_training_test_on_device_and_compare(device, pt_model_builder_func, pt_mo if len(expected_grads) > 0: compare_tensor_list(val_list_a, expected_grads) -def run_evaluate_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input): +def run_evaluate_test_and_compare(pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + run_forward_twice=False): cpu = torch.device("cpu") def cpu_barrier_func(): pass run_evaluate_test_on_device_and_compare( - cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cpu_barrier_func) + cpu, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + cpu_barrier_func, run_forward_twice=run_forward_twice) def cuda_barrier_func(): torch.cuda.synchronize() @@ -345,9 +343,11 @@ def cuda_barrier_func(): cuda = torch.device('cuda:0') run_evaluate_test_on_device_and_compare( - cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, cuda_barrier_func) + cuda, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, + cuda_barrier_func, run_forward_twice=run_forward_twice) -def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, pt_model_label_input, barrier_func): +def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_model_inputs_generator, + pt_model_label_input, barrier_func, run_forward_twice=False): repeats = 16 for i in range(repeats): m = pt_model_builder_func() @@ -357,11 +357,11 @@ def run_evaluate_test_on_device_and_compare(device, pt_model_builder_func, pt_mo x_ort = copy.deepcopy(x) outputs, grads = run_with_pytorch_on_device( - device, m, [x], pt_model_label_input, is_eval_mode=True) + device, m, [x], pt_model_label_input, is_eval_mode=True, run_forward_twice=run_forward_twice) barrier_func() outputs_ort, grads_ort = run_with_ort_on_device( - device, m_ort, [x_ort], pt_model_label_input, is_eval_mode=True) + device, m_ort, [x_ort], pt_model_label_input, is_eval_mode=True, run_forward_twice=run_forward_twice) barrier_func() val_list_a = [o.detach().cpu() for o in outputs if o is not None] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 0ff12f9040394..acd1b0b6c32b4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -1134,8 +1134,16 @@ def run_step(model, input): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction) _test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad) -@pytest.mark.parametrize("equation", ["s,se->se", "se,sc->sec", "se,se->s", "sec,sm->ecm", - "sec,ecm->sm", "ks,ksm->sm", "kes,ems->mek", "kes,ksm->ms"]) +# In PyTorch 1.11.0, there is issue during reduce node shape handling for exporter, so any sub-graph that +# contains ReduceProd will fail to run, for example, "sec,sm->ecm", "sec,ecm->sm". +# Currently skip these cases and test_gradient_correctness_einsum_2, +# will enable these tests again once the issue in PyTorch is fixed. 
+skip_torch_1_11 = pytest.mark.skipif(LooseVersion(torch.__version__) >= LooseVersion('1.11.0'), reason="PyTorch 1.11 incompatible") +@pytest.mark.parametrize("equation", [ + "s,se->se", "se,sc->sec", "se,se->s", "ks,ksm->sm", "kes,ems->mek", "kes,ksm->ms", + pytest.param("sec,sm->ecm", marks=[skip_torch_1_11]), + pytest.param("sec,ecm->sm", marks=[skip_torch_1_11]) +]) def test_gradient_correctness_einsum(equation): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): @@ -1183,6 +1191,7 @@ def run_step(model, input_left, input_right): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-3) _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, atol=1e-3, rtol=1e-3) +@skip_torch_1_11 def test_gradient_correctness_einsum_2(): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): @@ -4818,3 +4827,43 @@ def run_step(model, x): _test_helpers.assert_values_are_close(pt_loss, ort_loss) _test_helpers.assert_values_are_close(pt_x.grad, ort_x.grad) assert ortmodule_module.ONNX_OPSET_VERSION == DEFAULT_OPSET + + +def test_random_states_unchanged_for_ortmodule(): + import numpy + + os.environ['ORTMODULE_FALLBACK_RETRY'] = 'False' + + class NeuralNetSlice(torch.nn.Module): + def __init__(self): + super(NeuralNetSlice, self).__init__() + self.dim = 32 + + def forward(self, x): + # This slice operation will call sympy.Min() when exporting, which will change Python's random state + return x[:self.dim, :] + + def random_state_equal(a, b): + assert type(a) == type(b) + if isinstance(a, tuple): + assert len(a) == len(b) + return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b)]) + if isinstance(a, numpy.ndarray): + return numpy.array_equal(a, b) + if isinstance(a, torch.Tensor): + return torch.equal(a, b) + return a == b + + model = NeuralNetSlice() + x = torch.randn(16, 16) + + ori_random_states = _utils.get_random_states() + + ort_model = ORTModule(model) + ort_model(x) + + new_random_states = _utils.get_random_states() + + assert random_state_equal(ori_random_states, new_random_states) + + del os.environ['ORTMODULE_FALLBACK_RETRY'] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py index 3b2e6bc6a38f6..a3f118380c6af 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py @@ -138,6 +138,61 @@ def input_generator(): run_training_test_and_compare(model_builder, input_generator, label_input) +def test_GeLU_multiple_forward_runs(): + @torch.jit.script + def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + @torch.jit.script + def bias_gelu_backward(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + + class GeLUFunction3(torch.autograd.Function): + @staticmethod + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_backward(grad_output, bias, input) + return tmp, tmp + + class GeLUModel(torch.nn.Module): + def __init__(self, output_size): + super(GeLUModel, self).__init__() + self.relu = GeLUFunction3.apply 
+ self.bias = Parameter(torch.empty( + output_size, + device=torch.cuda.current_device(), + dtype=torch.float)) + + with torch.no_grad(): + self.bias.uniform_() + + def forward(self, model_input): + out = self.relu(model_input, self.bias) + return out + + output_size = 1024 + + def model_builder(): + return GeLUModel(output_size) + + def input_generator(): + return torch.randn(output_size, dtype=torch.float) + + # generate a label that have same shape as forward output. + label_input = torch.ones([output_size]) + + run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) + def test_MegatronF(): # MegatronGFunction is tested in distributed test files. class MegatronFFunction(torch.autograd.Function): diff --git a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc index 767ad6051d2cb..5ac1bba684d76 100644 --- a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc +++ b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc @@ -86,7 +86,7 @@ std::unique_ptr CreateOpTester(const onnxruntime::training::OpDef& op_def, int opset_version) { auto test = std::make_unique(op_def.type.c_str(), opset_version, op_def.domain.c_str()); for (auto attr : attributes) - test->AddAttribute(attr.name(), attr); + test->AddAttributeProto(attr); auto input_index = 0; for (auto& data : input_data) { diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 11efeb6d896f6..c7a95638ea622 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -1,10 +1,16 @@ +# Known Limits +# 1. Anchors are not supported in GHA +# https://github.community/t/support-for-yaml-anchors/16128/90 +# 2. 
Nested virtualization isn't supported in Azure Pipelines + https://developercommunity.visualstudio.com/t/enable-nested-virtualization-on-azure-pipelines/466384 + jobs: -- job: Android_CI - pool: - vmImage: 'macOS-11' - timeoutInMinutes: 180 +- job: Build_CPU_EP + pool: Linux-CPU-2019 + timeoutInMinutes: 30 steps: - # Onnx has no 3.9 python package available yet, need to use python 3.8 to avoid build onnx package + # Onnx has no 3.9 python package available yet, need to use python 3.8 + # to avoid building the onnx package # pythonVersion can be updated in Azure pipeline settings # https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=53 - task: UsePythonVersion@0 @@ -12,7 +18,7 @@ jobs: inputs: versionSpec: $(pythonVersion) - - script: brew install coreutils ninja + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) @@ -27,12 +33,133 @@ jobs: displayName: Build Host Protoc - script: | - python3 tools/python/run_android_emulator.py \ + export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk + export ANDROID_HOME=/usr/local/lib/android/sdk + export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle + export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle + env | grep ANDROID + displayName: Set Android ENVs + + # Start switching to jdk 11 after the Android Emulator is started + # since Android SDK manager requires java 8 + - task: JavaToolInstaller@0 + displayName: Use jdk 11 + inputs: + versionSpec: '11' + jdkArchitectureOption: 'x64' + jdkSourceOption: 'PreInstalled' + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=30 \ + --skip_submodule_sync \ + --parallel \ + --cmake_generator=Ninja \ + --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ + --build_java \ + --skip_tests + displayName: CPU EP, Build + + - task: CopyFiles@2 + displayName: Copy apks + inputs: + contents: 'build/**/*.apk' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: CopyFiles@2 + displayName: Copy test data + inputs: + contents: 'build/**/testdata/**' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: CopyFiles@2 + displayName: Copy test executables + inputs: + contents: 'build/Debug/*' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true + + - task: PublishBuildArtifacts@1 + inputs: + pathToPublish: $(Build.ArtifactStagingDirectory) + artifactName: CPUBuildOutput + +- job: Test_CPU_EP + pool: + vmImage: 'macOS-11' + dependsOn: Build_CPU_EP + condition: succeeded() + steps: + - task: DownloadPipelineArtifact@2 + inputs: + source: 'current' + artifact: 'CPUBuildOutput' + path: $(Build.SourcesDirectory) + + - script: | + python3 tools/python/run_android_emulator.py \ --android-sdk-root ${ANDROID_SDK_ROOT} \ --create-avd --system-image "system-images;android-30;google_apis;x86_64" \ --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid - displayName: Start Android emulator + displayName: Start Android emulator + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=30 \ + --test + displayName: CPU EP,
Test on Android Emulator + + - script: | + python3 tools/python/run_android_emulator.py \ + --android-sdk-root ${ANDROID_SDK_ROOT} \ + --stop \ + --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid + displayName: Stop Android emulator + condition: always() + +- job: Build_NNAPI_EP + pool: Linux-CPU-2019 + timeoutInMinutes: 30 + steps: + - task: UsePythonVersion@0 + displayName: Use Python $(pythonVersion) + inputs: + versionSpec: $(pythonVersion) + + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build + displayName: Install coreutils and ninja + + - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) + displayName: Setup gradle wrapper to use gradle 6.8.3 + + # We build the host protoc to /protobuf_install + - script: | + /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ + $(Build.SourcesDirectory) \ + $(Build.BinariesDirectory)/protobuf \ + $(Build.SourcesDirectory)/protobuf_install + displayName: Build Host Protoc + + - script: | + export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk + export ANDROID_HOME=/usr/local/lib/android/sdk + export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle + export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle + env | grep ANDROID + displayName: set Android ENVs # Start switching to jdk 11 after the Android Emulator is started since Android SDK manager requires java 8 - task: JavaToolInstaller@0 @@ -45,39 +172,134 @@ jobs: - script: | python3 tools/ci_build/build.py \ --android \ - --build_dir build \ + --build_dir build_nnapi \ --android_sdk_path $ANDROID_HOME \ --android_ndk_path $ANDROID_NDK_HOME \ --android_abi=x86_64 \ - --android_api=30 \ + --android_api=29 \ --skip_submodule_sync \ --parallel \ + --use_nnapi \ --cmake_generator=Ninja \ --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ - --build_java - displayName: CPU EP, Build and Test on Android Emulator + --build_java \ + --code_coverage \ + --skip_tests + displayName: NNAPI EP, Build - - script: /bin/bash tools/ci_build/github/android/run_nnapi_code_coverage.sh $(pwd) - displayName: NNAPI EP, Build, Test and Get Code Coverage on Android Emulator + - task: CopyFiles@2 + displayName: Copy apks + inputs: + contents: 'build_nnapi/**/*.apk' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - task: PublishPipelineArtifact@0 - displayName: 'Publish code coverage report' + - task: CopyFiles@2 + displayName: Copy test data inputs: - artifactName: "coverage_rpt.txt" - targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - publishLocation: 'pipeline' + contents: 'build_nnapi/**/testdata/**' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests + - task: CopyFiles@2 + displayName: Copy Test Executables + inputs: + contents: 'build_nnapi/Debug/*' + targetFolder: $(Build.ArtifactStagingDirectory) + overWrite: true - - script: | - python3 tools/python/run_android_emulator.py \ + - task: PublishBuildArtifacts@1 + inputs: + pathToPublish: $(Build.ArtifactStagingDirectory) + artifactName: NNAPIBuildOutput + +- job: Test_NNAPI_EP + pool: + vmImage: 'macOS-11' + dependsOn: Build_NNAPI_EP + condition: succeeded() + steps: + - task: DownloadPipelineArtifact@2 + 
inputs: + source: 'current' + artifact: 'NNAPIBuildOutput' + path: $(Build.SourcesDirectory) + + - task: UsePythonVersion@0 + displayName: Use Python $(pythonVersion) + inputs: + versionSpec: $(pythonVersion) + + - script: | + python3 tools/python/run_android_emulator.py \ --android-sdk-root ${ANDROID_SDK_ROOT} \ - --stop \ + --create-avd --system-image "system-images;android-30;google_apis;x86_64" \ + --start --emulator-extra-args="-partition-size 4096" \ --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid - displayName: Stop Android emulator - condition: always() + displayName: Start Android emulator + + - script: | + python3 tools/ci_build/build.py \ + --android \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=29 \ + --use_nnapi \ + --test \ + --code_coverage + displayName: NNAPI EP, Test, CodeCoverage on Android Emulator + + - script: | + python3 -m pip install gcovr && \ + python3 tools/ci_build/coverage.py \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME + displayName: Retrieve runtime code coverage files from the emulator and analyze + + - task: PublishPipelineArtifact@0 + displayName: 'Publish code coverage report' + inputs: + artifactName: "coverage_rpt.txt" + targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' + publishLocation: 'pipeline' + + # used by Build Minimal ORT + - script: brew install coreutils ninja + displayName: Install coreutils and ninja + + # We build the host protoc to /protobuf_install + - script: | + /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ + $(Build.SourcesDirectory) \ + $(Build.BinariesDirectory)/protobuf \ + $(Build.SourcesDirectory)/protobuf_install + displayName: Build Host Protoc + + - script: /bin/bash tools/ci_build/github/android/setup_gradle_wrapper.sh $(pwd) + displayName: Setup gradle wrapper to use gradle 6.8.3 + + # Start switching to jdk 11 after the Android Emulator is started + # since Android SDK manager requires java 8 + - task: JavaToolInstaller@0 + displayName: Use jdk 11 + inputs: + versionSpec: '11' + jdkArchitectureOption: 'x64' + jdkSourceOption: 'PreInstalled' + + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) + # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator + displayName: Build Minimal ORT with NNAPI and run tests + + - script: | + python3 tools/python/run_android_emulator.py \ + --android-sdk-root ${ANDROID_SDK_ROOT} \ + --stop \ + --emulator-pid-file $(Build.BinariesDirectory)/emulator.pid + displayName: Stop Android emulator + condition: always() - job: Update_Dashboard workspace: @@ -87,7 +309,9 @@ jobs: value: true pool: 'Linux-CPU-2019' condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - dependsOn: Android_CI + dependsOn: + - Test_CPU_EP + - Test_NNAPI_EP steps: - task: DownloadPipelineArtifact@0 displayName: 'Download code coverage report' diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 7447c45feadbc..b215c3ce3be66 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -13,6 +13,11 @@ jobs: inputs: versionSpec: '12.16.3' + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.8' + addToPath: true + - template: 
templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cpu @@ -51,7 +56,7 @@ jobs: inputs: script: | set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) @@ -70,56 +75,58 @@ jobs: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Run Release unit tests' inputs: - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py \ - --build_dir $(Build.BinariesDirectory) \ - --cmake_generator Ninja \ - --config Release \ - --test \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests \ - --enable_transformers_tool_test \ - --build_nodejs \ + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py + workingDirectory: $(Build.BinariesDirectory)/Release + arguments: >- + --build_dir $(Build.BinariesDirectory) + --cmake_generator Ninja + --config Release + --test + --skip_submodule_sync + --build_shared_lib + --parallel + --build_wheel + --enable_onnx_tests + --enable_transformers_tool_test + --build_nodejs --ctest_path "" - task: CmdLine@2 displayName: 'Install Debug python package' inputs: script: | + set -e -x rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Run Debug unit tests' inputs: - script: | - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py \ - --build_dir $(Build.BinariesDirectory) \ - --cmake_generator Ninja \ - --config Debug \ - --test \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests \ - --enable_transformers_tool_test \ - --build_nodejs \ + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py + workingDirectory: $(Build.BinariesDirectory)/Debug + arguments: >- + --build_dir $(Build.BinariesDirectory) + --cmake_generator Ninja + --config Debug + --test + --skip_submodule_sync + --build_shared_lib + --parallel + --build_wheel + --enable_onnx_tests + --enable_transformers_tool_test + --build_nodejs --ctest_path "" - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Symbolic shape infer' inputs: - script: | - cd $(Build.BinariesDirectory)/Release - python3 $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + workingDirectory: $(Build.BinariesDirectory)/Release - task: PublishTestResults@2 displayName: 'Publish unit test results' diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 8730a7ba2971a..c1c994d80b6ee 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -96,7 +96,7 @@ jobs: # We assume the machine doesn't have gcc and python development header files sudo rm -f /build /onnxruntime_src sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml index d3b8e10e2f2ff..317f36cde0922 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml @@ -13,6 +13,11 @@ jobs: inputs: versionSpec: '12.16.3' + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.8' + addToPath: true + - template: templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cpu @@ -50,7 +55,7 @@ jobs: inputs: script: | set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. 
export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) @@ -71,34 +76,35 @@ jobs: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - task: CmdLine@2 + + - task: PythonScript@0 displayName: 'Run Release unit tests' inputs: - script: | - cd /tmp - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Release --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path "" + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py + workingDirectory: $(Build.BinariesDirectory)/Release + arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Release --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path "" - task: CmdLine@2 displayName: 'Install Debug python package' inputs: script: | + set -e -x rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Run Debug unit tests' inputs: scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py arguments: --build_dir $(Build.BinariesDirectory) --cmake_generator Ninja --config Debug --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_training --enable_onnx_tests --build_nodejs --ctest_path "" - workingDirectory: /tmp + workingDirectory: $(Build.BinariesDirectory)/Debug - - task: CmdLine@2 + - task: PythonScript@0 displayName: 'Symbolic shape infer' inputs: - script: | - cd $(Build.BinariesDirectory)/Release - python3 $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py + workingDirectory: $(Build.BinariesDirectory)/Release - task: PublishTestResults@2 displayName: 'Publish unit test results' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml index 512cb806ed67b..e3eb972f69c16 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-external-custom-ops.yml @@ -3,7 +3,7 @@ jobs: timeoutInMinutes: 120 workspace: clean: all - pool: Linux-CPU-2019 + pool: onnxruntime-training-linux-ext-custom-ops steps: - checkout: self clean: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 609bf0f72dce7..c6fd4c8d469c4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -341,7 +341,7 @@ stages: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 sudo rm -f /build 
/onnxruntime_src sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index f0ea9431953ad..9fad69a341fec 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -116,7 +116,7 @@ stages: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 sudo rm -f /build /onnxruntime_src sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) @@ -245,7 +245,7 @@ stages: rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 sudo rm -f /build /onnxruntime_src sudo ln -s $(Build.SourcesDirectory) /onnxruntime_src - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt # Test ORT with the latest ONNX release. 
export ONNX_VERSION=$(cat $(Build.SourcesDirectory)/cmake/external/onnx/VERSION_NUMBER) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml index 90f5546e0cbd7..45f4997380bb6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml @@ -85,7 +85,7 @@ jobs: displayName: 'API Documentation Check and generate' - script: | - python -m pip install -q setuptools wheel numpy six + python -m pip install -q setuptools wheel numpy workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Install python modules' @@ -173,7 +173,7 @@ jobs: - ${{ if eq(parameters.EnablePython, true) }}: - powershell: | - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname} workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml index 1931114aa9c8c..e698956db2ccc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-gpu-ci.yml @@ -98,7 +98,7 @@ jobs: displayName: 'API Documentation Check and generate' - script: | - python -m pip install -q setuptools wheel numpy six + python -m pip install -q setuptools wheel numpy workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Install python modules' @@ -193,7 +193,7 @@ jobs: - ${{ if eq(parameters.EnablePython, true) }}: - powershell: | - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname} workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index adc9b03a0eb06..31671520e4b00 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -7,4 +7,3 @@ git+http://github.com/onnx/onnx.git@be76ca7148396176784ba8733133b9fb1186ea0d#egg protobuf sympy==1.1.1 flatbuffers -six diff --git a/tools/python/PythonTools.md b/tools/python/PythonTools.md index 6f752ad2257b9..49a4ac6a337d1 100644 --- a/tools/python/PythonTools.md +++ b/tools/python/PythonTools.md @@ -133,7 +133,7 @@ optional arguments: image_to_pb: image_to_pb specific options - --resize RESIZE Provide the shape as comma separated values to resize the image to. e.g. --shape 200,200 + --resize RESIZE Provide the height and width to resize to as comma separated values. e.g. --shape 200,300 will resize to height 200 and width 300. --channels_last Transpose image from channels first to channels last. --add_batch_dim Prepend a batch dimension with value of 1 to the shape. i.e. 
convert from CHW to NCHW diff --git a/tools/python/onnx_test_data_utils.py b/tools/python/onnx_test_data_utils.py index 2121d579aec3b..e4e4c54aa12d6 100644 --- a/tools/python/onnx_test_data_utils.py +++ b/tools/python/onnx_test_data_utils.py @@ -52,7 +52,26 @@ def image_to_numpy(filename, shape, channels_last, add_batch_dim): img = PIL.Image.open(filename) if shape: - img = img.resize(shape, PIL.Image.ANTIALIAS) + w, h = img.size + new_w = shape[1] + new_h = shape[0] + + # use the dimension that needs to shrink the least to resize to an image where that dimension matches the + # target size. + w_ratio = new_w / w + h_ratio = new_h / h + ratio = w_ratio if w_ratio > h_ratio else h_ratio + interim_w = int(w * ratio) + interim_h = int(h * ratio) + img = img.resize((interim_w, interim_h), PIL.Image.ANTIALIAS) + + # center crop to the final target size + left = (interim_w - new_w) / 2 + top = (interim_h - new_h) / 2 + right = (interim_w + new_w) / 2 + bottom = (interim_h + new_h) / 2 + img = img.crop((left, top, right, bottom)) + img_as_np = np.array(img).astype(np.float32) if not channels_last: # HWC to CHW @@ -110,8 +129,8 @@ def get_arg_parser(): image_to_pb_group = parser.add_argument_group('image_to_pb', 'image_to_pb specific options') image_to_pb_group.add_argument('--resize', default=None, type=lambda s: [int(item) for item in s.split(',')], - help='Provide the shape as comma separated values to resize the image to.' - ' e.g. --shape 200,200') + help='Provide the height and width to resize to as comma separated values.' + ' e.g. --resize 200,300 will resize to height 200 and width 300.') image_to_pb_group.add_argument('--channels_last', action='store_true', help='Transpose image from channels first to channels last.') image_to_pb_group.add_argument('--add_batch_dim', action='store_true', diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index c2aec18038829..4e872a46c42a4 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -476,10 +476,13 @@ static void WindowFunction(const wchar_t* window_operator_name, TensorKind kind) #endif #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS) -static void DiscreteFourierTransform(bool is_onesided = false) { - std::vector<int64_t> shape = {1, 5}; - std::vector<int64_t> output_shape = {1, 5, 2}; - output_shape[1] = is_onesided ? (1 + (shape[1] >> 1)) : shape[1]; +static void DiscreteFourierTransform(size_t axis, bool is_onesided = false) { + auto axis_dim = axis + 1; + printf("\nDiscrete Fourier Transform [axis=%d, is_onesided=%s]\n", static_cast<int>(axis_dim), is_onesided ? "true" : "false"); + + std::vector<int64_t> shape = {2, 5, 8, 1}; + std::vector<int64_t> output_shape = {2, 5, 8, 2}; + output_shape[axis_dim] = is_onesided ?
(1 + (shape[axis_dim] >> 1)) : shape[axis_dim]; auto model = LearningModelBuilder::Create(13) @@ -487,6 +490,7 @@ static void DiscreteFourierTransform(bool is_onesided = false) { .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Spectra", TensorKind::Float, output_shape)) .Operators().Add(Operator(L"DFT", MS_EXPERIMENTAL_DOMAIN) .SetInput(L"input", L"Input.Signal") + .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)})) .SetAttribute(L"onesided", TensorInt64Bit::CreateFromArray({}, {is_onesided})) .SetOutput(L"output", L"Output.Spectra")) .CreateModel(); @@ -495,19 +499,38 @@ static void DiscreteFourierTransform(bool is_onesided = false) { LearningModelBinding binding(session); // Populate binding - binding.Bind(L"Input.Signal", TensorFloat::CreateFromArray(shape, {1, 2, 3, 4, 5})); + binding.Bind( + L"Input.Signal", + TensorFloat::CreateFromArray( + shape, + {1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + })); // Evaluate auto result = session.Evaluate(binding, L""); - // Check results - printf("Output.Spectra\n"); - auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>(); - auto y_ivv = y_tensor.GetAsVectorView(); - for (int i = 0; i < output_shape[0] * output_shape[1] * 2; i += 2) { - printf("(%f + %fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1)); - } - printf("\n"); + // // Check results + // printf("Output.Spectra\n"); + // auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>(); + // auto y_ivv = y_tensor.GetAsVectorView(); + // for (uint32_t i = 0; i < y_ivv.Size(); i+=2) { + // auto format_size = 16 * (!is_onesided || axis == 0) + 10 * (is_onesided && axis == 1); + // if (i % format_size == 0 && i != 0) { + // printf("\n"); + // } + // printf("(%.2f + %.2fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1)); + // } + // printf("\n"); } #endif @@ -612,20 +635,20 @@ static void STFT(size_t batch_size, size_t signal_size, size_t dft_size, printf("%f, ", window_ivv.GetAt(i)); } printf("\n"); - printf("Output.STFT\n"); - // Check results - auto y_tensor = result.Outputs().Lookup(L"Output.STFT").as<TensorFloat>(); - auto y_ivv = y_tensor.GetAsVectorView(); - auto size = y_ivv.Size(); - WINML_EXPECT_EQUAL(size, n_dfts * output_shape[2] * 2); - for (size_t dft_idx = 0; dft_idx < n_dfts; dft_idx++) { - for (size_t i = 0; INT64(i) < output_shape[2]; i++) { - auto real_idx = static_cast<uint32_t>((i * 2) + (2 * dft_idx * output_shape[2])); - printf("(%d, %f , %fi), ", static_cast<int>(i), y_ivv.GetAt(real_idx), y_ivv.GetAt(real_idx + 1)); - } - } - - printf("\n"); + //printf("Output.STFT\n"); + //// Check results + //auto y_tensor = result.Outputs().Lookup(L"Output.STFT").as<TensorFloat>(); + //auto y_ivv = y_tensor.GetAsVectorView(); + //auto size = y_ivv.Size(); + //WINML_EXPECT_EQUAL(size, n_dfts * output_shape[2] * 2); + //for (size_t dft_idx = 0; dft_idx < n_dfts; dft_idx++) { + // for (size_t i = 0; INT64(i) < output_shape[2]; i++) { + // auto real_idx = static_cast<uint32_t>((i * 2) + (2 * dft_idx * output_shape[2])); + // printf("(%d, %f , %fi), ", static_cast<int>(i), y_ivv.GetAt(real_idx), y_ivv.GetAt(real_idx + 1)); + // } + //} + // + //printf("\n"); } #endif @@ -913,45 +936,88 @@ static void ModelBuilding_ConstantMatmul() { static void ModelBuilding_DiscreteFourierTransform() { #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS) -
DiscreteFourierTransform(false /*onesided*/); - DiscreteFourierTransform(true /*onesided*/); + DiscreteFourierTransform(0, false /*onesided*/); + DiscreteFourierTransform(0, true /*onesided*/); + DiscreteFourierTransform(1, false /*onesided*/); + DiscreteFourierTransform(1, true /*onesided*/); + #endif } -static void ModelBuilding_DiscreteFourierTransformInverseIdentity() { #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS) - std::vector<int64_t> shape = {1, 5}; - std::vector<int64_t> output_shape = {1, shape[1], 2}; +static void DiscreteFourierTransformInverse(size_t axis) { + std::vector<int64_t> shape = {2, 5, 8, 1}; + std::vector<int64_t> output_shape = {2, 5, 8, 2}; auto model = LearningModelBuilder::Create(13) .Inputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Input.TimeSignal", TensorKind::Float, shape)) .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Spectra", TensorKind::Float, output_shape)) + .Outputs().Add(LearningModelBuilder::CreateTensorFeatureDescriptor(L"Output.Inverse", TensorKind::Float, output_shape)) .Operators().Add(Operator(L"DFT", MS_EXPERIMENTAL_DOMAIN) .SetInput(L"input", L"Input.TimeSignal") - .SetOutput(L"output", L"DFTOutput")) - .Operators().Add(Operator(L"IDFT", MS_EXPERIMENTAL_DOMAIN) - .SetInput(L"input", L"DFTOutput") + .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)})) .SetOutput(L"output", L"Output.Spectra")) + .Operators().Add(Operator(L"IDFT", MS_EXPERIMENTAL_DOMAIN) + .SetInput(L"input", L"Output.Spectra") + .SetAttribute(L"axis", TensorInt64Bit::CreateFromArray({}, {INT64(axis)})) + .SetOutput(L"output", L"Output.Inverse")) .CreateModel(); LearningModelSession session(model); LearningModelBinding binding(session); + auto input_vector = + std::vector<float>{ + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, + + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + 2, 4, 6, 8, 10, 12, 14, 16, + }; // Populate binding - binding.Bind(L"Input.TimeSignal", TensorFloat::CreateFromArray(shape, {1, 2, 3, 4, 5})); + binding.Bind( + L"Input.TimeSignal", + TensorFloat::CreateFromArray( + shape, + input_vector)); // Evaluate auto result = session.Evaluate(binding, L""); - + // Check results - printf("Output.Spectra\n"); - auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>(); + auto y_tensor = result.Outputs().Lookup(L"Output.Inverse").as<TensorFloat>(); auto y_ivv = y_tensor.GetAsVectorView(); - for (int i = 0; i < output_shape[0] * output_shape[1] * 2; i += 2) { - printf("(%f + %fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1)); + for (uint32_t i = 0; i < y_ivv.Size(); i += 2) { + WINML_EXPECT_TRUE(abs(y_ivv.GetAt(i) - input_vector[i / 2]) < .001); + WINML_EXPECT_TRUE(abs(y_ivv.GetAt(i + 1) - 0) < .001); } - printf("\n"); + + //printf("Output.Spectra\n"); + //auto y_tensor = result.Outputs().Lookup(L"Output.Spectra").as<TensorFloat>(); + //auto y_ivv = y_tensor.GetAsVectorView(); + //for (uint32_t i = 0; i < y_ivv.Size(); i+=2) { + // auto format_size = 16; + // if (i % format_size == 0 && i != 0) { + // printf("\n"); + // } + // printf("(%.2f + %.2fi), ", y_ivv.GetAt(i), y_ivv.GetAt(i + 1)); + //} + //printf("\n"); + +} +#endif + +static void ModelBuilding_DiscreteFourierTransformInverseIdentity() { #if !defined(BUILD_INBOX) && defined(BUILD_MS_EXPERIMENTAL_OPS) + DiscreteFourierTransformInverse(0); + DiscreteFourierTransformInverse(1); #endif }
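For reference, the DFT test changes above rely on two properties: with onesided=1 the spectrum length along the transformed axis is 1 + (N >> 1), and IDFT(DFT(x)) reproduces a real input signal. The sketch below checks both with NumPy's FFT as a stand-in for the experimental DFT/IDFT operators; it only illustrates the expected math, not the WinML API under test, and the variable names are illustrative.

    # Minimal NumPy sanity check of the one-sided length and the DFT/IDFT round trip.
    import numpy as np

    # Same data layout as the test: shape {2, 5, 8, 1} -> batch of 2, 5 rows of 8 real samples.
    signal = np.array([[1, 2, 3, 4, 5, 6, 7, 8]] * 5 + [[2, 4, 6, 8, 10, 12, 14, 16]] * 5,
                      dtype=np.float32).reshape(2, 5, 8)

    for axis in (0, 1):                    # the test's `axis` attribute; axis_dim = axis + 1
        n = signal.shape[axis + 1]
        onesided_len = 1 + (n >> 1)        # matches output_shape[axis_dim] when onesided=1
        assert np.fft.rfft(signal, axis=axis + 1).shape[axis + 1] == onesided_len

        spectra = np.fft.fft(signal, axis=axis + 1)       # two-sided DFT
        roundtrip = np.fft.ifft(spectra, axis=axis + 1)   # IDFT(DFT(x)) should give x back
        assert np.allclose(roundtrip.real, signal, atol=1e-3)
        assert np.allclose(roundtrip.imag, 0.0, atol=1e-3)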
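Similarly, the image_to_pb resize change in tools/python/onnx_test_data_utils.py preserves aspect ratio by scaling with the larger of the width/height ratios and then center-cropping to the requested size. A minimal standalone sketch of that logic follows; the helper name is illustrative, and Pillow's LANCZOS filter is used here as the modern spelling of the ANTIALIAS alias used in the patch.

    # Aspect-preserving resize followed by a center crop, as a standalone helper.
    import numpy as np
    import PIL.Image


    def resize_and_center_crop(img: PIL.Image.Image, target_h: int, target_w: int) -> np.ndarray:
        w, h = img.size
        # Scale by the larger ratio so both dimensions end up >= the target while keeping aspect ratio.
        ratio = max(target_w / w, target_h / h)
        interim_w, interim_h = int(w * ratio), int(h * ratio)
        img = img.resize((interim_w, interim_h), PIL.Image.LANCZOS)

        # Center crop down to exactly target_h x target_w.
        left = (interim_w - target_w) // 2
        top = (interim_h - target_h) // 2
        img = img.crop((left, top, left + target_w, top + target_h))
        return np.array(img).astype(np.float32)  # HWC float32, matching image_to_numpy

For example, --resize 200,300 on a 640x480 image scales it to 300x225 first (ratio 300/640), then crops the height from 225 down to 200.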