Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache-upstream/main' into unity-staging
Browse files Browse the repository at this point in the history
  • Loading branch information
junrushao committed Oct 3, 2023
2 parents 063cd7f + 2890899 commit 11c73a2
Show file tree
Hide file tree
Showing 47 changed files with 1,038 additions and 517 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ jobs:
shell: bash -l {0}
run: >-
python -m pytest -v tests/python/all-platform-minimal-test
- name: Minimal Metal Compile-Only
shell: bash -l {0}
run: >-
python -m pytest -v -s 'tests/python/unittest/test_allreduce.py::test_allreduce_sum_compile'
- name: Minimal Metal Compile-and-Run
shell: bash -l {0}
run: >-
python -m pytest -v -s 'tests/python/unittest/test_allreduce.py::test_allreduce_sum[dims0-metal]'
- name: Test iOS RPC
shell: bash -l {0}
run: >-
Expand Down
22 changes: 22 additions & 0 deletions apps/cpp_rtvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ Command line usage
--input - Numpy file for the model input (optional; random input is used if not given)
--output - Numpy file name to dump the model output as numpy
--dump-meta - Dump model meta information
--pre-compiled - The file name of a file where pre-compiled programs should be stored
--profile - Profile over all execution
--dry-run - Profile after given dry runs, default 10
--run-count - Profile for given runs, default 50
--zero-copy - Profile with zero copy api

Example
./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
Expand Down Expand Up @@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to the `rtvm` then after method
`Load`, method `UsePreCompiledPrograms` is called. This method loads pre-compiled
programs if the file exists. Otherwise, the file will be created and
pre-compiled programs will be saved to this file.

# Performance Profiling Options
The tool adds a few options to measure the wall clock performance of the given model natively on the target.
--profile : Turns on profiling
--dry-run : The number of times to dry run the model before measuring the performance. Default value is 10.
--run-count : The number of times to run the model and take an average. Default value is 50.
--zero-copy : This option enables graph runtime zero copy to be used for input and output, instead of byte copy to DLTensor.

The performance profile options dump an information summary as given below.
Module Load :27 ms
Graph Runtime Create :11 ms
Params Read :15 ms
Params Set :41 ms
Pre Compiled Progs Load :24 ms
Total Load Time :118 ms
Average ExecTime :27 ms
Unload Time :35.9236 ms
199 changes: 174 additions & 25 deletions apps/cpp_rtvm/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#endif
#include <dmlc/logging.h>

#include <chrono>
#include <cstring>
#include <iostream>
#include <sstream>
Expand All @@ -54,7 +55,11 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored\n"
"--profile - Profile over all execution\n"
"--dry-run - Profile after given dry runs, default 10\n"
"--run-count - Profile for given runs, default 50\n"
"--zero-copy - Profile with zero copy api\n"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
Expand All @@ -68,14 +73,19 @@ static const string kUsage =
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
* \arg profile Do we profile overall execution
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
string pre_compiled;
bool dump_meta = false;
bool dump_meta{false};
bool profile{false};
int dry_run{10};
int run_count{50};
bool zero_copy{false};
};

/*!
Expand All @@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False"));
LOG(INFO) << "Dry Run = " << args.dry_run;
LOG(INFO) << "Run Count = " << args.run_count;
LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False"));
}

#if defined(__linux__) || defined(__ANDROID__)
Expand Down Expand Up @@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
}

args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");

const string pprofile = GetCmdOption(argc, argv, "--profile", true);
if (!pprofile.empty()) {
args.profile = true;
}

const string pdry_run = GetCmdOption(argc, argv, "--dry-run=");
if (!pdry_run.empty()) {
args.dry_run = stoi(pdry_run);
}

const string prun = GetCmdOption(argc, argv, "--run-count=");
if (!prun.empty()) {
args.run_count = stoi(prun);
}

const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true);
if (!pzcopy.empty()) {
args.zero_copy = true;
}
}

/*!
Expand All @@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
#endif

// Initialize TVM Runner
TVMRunner runner = TVMRunner(args.model, args.device);
auto runner = new TVMRunner(args.model, args.device);

// Load the model
runner.Load();
runner->Load();
if (!args.pre_compiled.empty()) {
runner.UsePreCompiledPrograms(args.pre_compiled);
runner->UsePreCompiledPrograms(args.pre_compiled);
}

// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
TVMMetaInfo mInfo = runner->GetMetaInfo();

// Print Meta Information
if (args.dump_meta) runner.PrintMetaInfo();
if (args.dump_meta) runner->PrintMetaInfo();

int total_exec_time = 0;

if (args.profile) {
if (args.dry_run) {
for (int ii = 0; ii < args.dry_run; ++ii) {
runner->Run();
}
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}
int total_time = 0;
std::map<std::string, NDArray> input_data_even, input_data_odd;
std::map<std::string, NDArray> output_data_even, output_data_odd;

std::map<std::string, char*> input_data;
std::map<std::string, char*> output_data;

// Alloc / populate and keep input data ready
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
input_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetInputMemSize(elem.first));
input_data.insert({elem.first, data});
}
}

// Alloc and keep output bufers ready
for (auto& elem : mInfo.output_info) {
if (args.zero_copy) {
auto ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_even.insert({elem.first, ndarr});

ndarr =
NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
DLDevice{GetTVMDevice(args.device), 0});
output_data_odd.insert({elem.first, ndarr});
} else {
char* data = (char*)malloc(runner->GetOutputMemSize(elem.first));
output_data.insert({elem.first, data});
}
}

for (int ii = 0; ii < args.run_count; ++ii) {
// Timer start
auto tstart = std::chrono::high_resolution_clock::now();
// Set random input for all input
for (auto& elem : mInfo.input_info) {
if (args.zero_copy) {
if (ii % 2) {
runner->SetInput(elem.first, input_data_even[elem.first]);
} else {
runner->SetInput(elem.first, input_data_odd[elem.first]);
}
} else {
runner->SetInput(elem.first, input_data[elem.first]);
}
}

if (args.zero_copy) {
// With zero copy set the result NDArray up front
for (auto& elem : mInfo.output_info) {
if (ii % 2) {
runner->SetOutput(elem.first, output_data_even[elem.first]);
} else {
runner->SetOutput(elem.first, output_data_odd[elem.first]);
}
}
}

if (args.input.empty() || args.output.empty()) {
// Run the model
runner->Run();

if (!args.zero_copy) {
// W/o zero copy we need to invoke explicite data copy
for (auto& elem : mInfo.output_info) {
runner->GetOutput(elem.first, output_data[elem.first]);
}
} else {
// Just wait for the run to complete.
TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
}

// Timer end
auto tend = std::chrono::high_resolution_clock::now();
LOG(INFO) << "Exec Time:" << static_cast<double>((tend - tstart).count()) / 1e6;
total_exec_time += static_cast<double>((tend - tstart).count()) / 1e6;
}

// Free input bufers
for (auto& elem : mInfo.input_info) {
free(input_data[elem.first]);
}

// Free output bufers
for (auto& elem : mInfo.output_info) {
free(output_data[elem.first]);
}
} else if (!args.input.empty() && !args.output.empty()) {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;
// Set Input from Numpy Input
runner->SetInput(args.input);
// Run the model
runner->Run();
// Get Output as Numpy dump
runner->GetOutput(args.output);
} else {
LOG(INFO) << "Executing dry run ... ";
// Set random input for all inputs
for (auto& elem : mInfo.input_info) {
LOG(INFO) << "Set Random Input for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetInputMemSize(elem.first);
size_t ssize = runner->GetInputMemSize(elem.first);
char* data = (char*)malloc(ssize);
LOG(INFO) << "Random Input Size:" << ssize << " bytes";
runner.SetInput(elem.first, data);
runner->SetInput(elem.first, data);
free(data);
}

// Run the model
runner.Run();

runner->Run();
// Get Output and dump few values
for (auto& elem : mInfo.output_info) {
LOG(INFO) << "Get Output for :" << elem.first;
auto shape = elem.second.first;
size_t ssize = runner.GetOutputMemSize(elem.first);
size_t ssize = runner->GetOutputMemSize(elem.first);
char* data = (char*)malloc(ssize);
runner.GetOutput(elem.first, data);
runner->GetOutput(elem.first, data);
LOG(INFO) << "Output Size:" << ssize << " bytes";
free(data);
}
} else {
LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;

// Set Input from Numpy Input
runner.SetInput(args.input);

// Run the model
runner.Run();
}

// Get Output as Numpy dump
runner.GetOutput(args.output);
if (args.profile) {
// Print Stats
runner->PrintStats();
}
auto tstart = std::chrono::high_resolution_clock::now();
delete runner;
auto tend = std::chrono::high_resolution_clock::now();

if (args.profile) {
LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << " ms";
LOG(INFO) << "Unload Time :" << static_cast<double>((tend - tstart).count()) / 1e6
<< " ms";
}
return 0;
}

Expand Down
Loading

0 comments on commit 11c73a2

Please sign in to comment.