Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache-upstream/main' into unity-staging
Browse files Browse the repository at this point in the history
  • Loading branch information
junrushao committed Oct 3, 2023
2 parents 063cd7f + 2890899 commit 11c73a2
Show file tree
Hide file tree
Showing 47 changed files with 1,038 additions and 517 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ jobs:
shell: bash -l {0}
run: >-
python -m pytest -v tests/python/all-platform-minimal-test
- name: Minimal Metal Compile-Only
shell: bash -l {0}
run: >-
python -m pytest -v -s 'tests/python/unittest/test_allreduce.py::test_allreduce_sum_compile'
- name: Minimal Metal Compile-and-Run
shell: bash -l {0}
run: >-
python -m pytest -v -s 'tests/python/unittest/test_allreduce.py::test_allreduce_sum[dims0-metal]'
- name: Test iOS RPC
shell: bash -l {0}
run: >-
Expand Down
22 changes: 22 additions & 0 deletions apps/cpp_rtvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ Command line usage
--input - Numpy file for the model input (optional; random input is used if not given)
--output - Numpy file name to dump the model output as numpy
--dump-meta - Dump model meta information
--pre-compiled - The file name of a file where pre-compiled programs should be stored
--profile - Profile over all execution
--dry-run - Profile after given dry runs, default 10
--run-count - Profile for given runs, default 50
--zero-copy - Profile with zero copy api

Example
./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
Expand Down Expand Up @@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to the `rtvm` then after method
`Load`, method `UsePreCompiledPrograms` is called. This method loads pre-compiled
programs if the file exists. Otherwise, the file will be created and
pre-compiled programs will be saved to this file.

# Performance Profiling Options
The tool adds a few options to measure the wall clock performance of the given model natively on the target.
--profile : Turns on profiling
--dry-run : The number of times to dry run the model before measuring the performance. Default value is 10.
--run-count : The number of times to run the model and take an average. Default value is 50.
--zero-copy : This option enables graph runtime zero copy to be used for input and output, instead of byte copy to DLTensor.

The performance profile options dump an information summary as given below.
Module Load :27 ms
Graph Runtime Create :11 ms
Params Read :15 ms
Params Set :41 ms
Pre Compiled Progs Load :24 ms
Total Load Time :118 ms
Average ExecTime :27 ms
Unload Time :35.9236 ms
199 changes: 174 additions & 25 deletions apps/cpp_rtvm/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#endif
#include <dmlc/logging.h>

#include <chrono>
#include <cstring>
#include <iostream>
#include <sstream>
Expand All @@ -54,7 +55,11 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored\n"
"--profile - Profile over all execution\n"
"--dry-run - Profile after given dry runs, default 10\n"
"--run-count - Profile for given runs, default 50\n"
"--zero-copy - Profile with zero copy api\n"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
Expand All @@ -68,14 +73,19 @@ static const string kUsage =
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
* \arg profile Do we profile overall execution
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
string pre_compiled;
bool dump_meta = false;
bool dump_meta{false};
bool profile{false};
int dry_run{10};
int run_count{50};
bool zero_copy{false};
};

/*!
Expand All @@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False"));
LOG(INFO) << "Dry Run = " << args.dry_run;
LOG(INFO) << "Run Count = " << args.run_count;
LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False"));
}

#if defined(__linux__) || defined(__ANDROID__)
Expand Down Expand Up @@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
}

args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");

const string pprofile = GetCmdOption(argc, argv, "--profile", true);
if (!pprofile.empty()) {
args.profile = true;
}

const string pdry_run = GetCmdOption(argc, argv, "--dry-run=");
if (!pdry_run.empty()) {
args.dry_run = stoi(pdry_run);
}

const string prun = GetCmdOption(argc, argv, "--run-count=");
if (!prun.empty()) {
args.run_count = stoi(prun);
}

const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true);
if (!pzcopy.empty()) {
args.zero_copy = true;
}
}

/*!
Expand All @@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
#endif

// Initialize TVM Runner
TVMRunner runner = TVMRunner(args.model, args.device);
auto runner = new TVMRunner(args.model, args.device);

// Load the model
runner.Load();
runner->Load();
if (!args.pre_compiled.empty()) {
runner.UsePreCompiledPrograms(args.pre_compiled);
runner->UsePreCompiledPrograms(args.pre_compiled);
}

// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
TVMMetaInfo mInfo = runner->GetMetaInfo();

// Print Meta Information
if (args.dump_meta) runner.PrintMetaInfo();
if (args.dump_meta) runner->PrintMetaInfo();

int total_exec_time = 0;

if (args.profile) {
if (args.dry_run) {
for (int ii = 0; ii < args.dry_run; ++ii) {
runner->Run();
}
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}
int total_time = 0;
std::map<std::string, NDArray> input_data_even, input_data_odd;
std::map<std::string, NDArray> output_data_even, output_data_odd;

std::map<std::string, char*> input_data;
std::map<std::string, char*> output_data;

// Alloc / populate and keep input data ready
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetInputMemSize(elem.first));
input_data.insert({elem.first, data});
}
}

// Alloc and keep output bufers ready
for (auto& elem : mInfo.output_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetOutputMemSize(elem.first));
output_data.insert({elem.first, data});
}
}

for (int ii = 0; ii < args.run_count; ++ii) {
// Timer start
auto tstart = std::chrono::high_resolution_clock::now();
// Set random input for all input
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
if (ii % 2) {
runner->SetInput(elem.first, input_data_even[elem.first]);
} else {
runner->SetInput(elem.first, input_data_odd[elem.first]);
}
} else {
runner->SetInput(elem.first, input_data[elem.first]);
}
}

if (args.zero_copy) {
// With zero copy set the result NDArray up front
for (auto& elem : mInfo.output_info) {
if (ii % 2) {
runner->SetOutput(elem.first, output_data_even[elem.first]);
} else {
runner->SetOutput(elem.first, output_data_odd[elem.first]);
}
}
}

if (args.input.empty() || args.output.empty()) {
// Run the model
runner->Run();

if (!args.zero_copy) {
// W/o zero copy we need to invoke explicite data copy
for (auto& elem : mInfo.output_info) {
runner->GetOutput(elem.first, output_data[elem.first]);
}
} else {
// Just wait for the run to complete.
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}

// Timer end
auto tend = std::chrono::high_resolution_clock::now();
LOG(INFO) << "Exec Time:" << static_cast<double>((tend - tstart).count()) / 1e6;
total_exec_time += static_cast<double>((tend - tstart).count()) / 1e6;
}

// Free input bufers
for (auto& elem : mInfo.input_info) {
free(input_data[elem.first]);
}

// Free output bufers
for (auto& elem : mInfo.output_info) {
free(output_data[elem.first]);
}
} else if (!args.input.empty() && !args.output.empty()) {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;
// Set Input from Numpy Input
runner->SetInput(args.input);
// Run the model
runner->Run();
// Get Output as Numpy dump
runner->GetOutput(args.output);
} else {
LOG(INFO) << "Executing dry run ... ";
// Set random input for all inputs
for (auto& elem : mInfo.input_info) {
LOG(INFO) << "Set Random Input for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetInputMemSize(elem.first);
size_t ssize = runner->GetInputMemSize(elem.first);
char* data = (char*)malloc(ssize);
LOG(INFO) << "Random Input Size:" << ssize << " bytes";
runner.SetInput(elem.first, data);
runner->SetInput(elem.first, data);
free(data);
}

// Run the model
runner.Run();

runner->Run();
// Get Output and dump few values
for (auto& elem : mInfo.output_info) {
LOG(INFO) << "Get Output for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetOutputMemSize(elem.first);
size_t ssize = runner->GetOutputMemSize(elem.first);
char* data = (char*)malloc(ssize);
runner.GetOutput(elem.first, data);
runner->GetOutput(elem.first, data);
LOG(INFO) << "Output Size:" << ssize << " bytes";
free(data);
}
} else {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;

// Set Input from Numpy Input
runner.SetInput(args.input);

// Run the model
runner.Run();
}

// Get Output as Numpy dump
runner.GetOutput(args.output);
if (args.profile) {
// Print Stats
runner->PrintStats();
}
auto tstart = std::chrono::high_resolution_clock::now();
delete runner;
auto tend = std::chrono::high_resolution_clock::now();

if (args.profile) {
LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << " ms";
LOG(INFO) << "Unload Time :" << static_cast<double>((tend - tstart).count()) / 1e6
<< " ms";
}
return 0;
}

Expand Down
Loading

0 comments on commit 11c73a2

Please sign in to comment.