Fix ze_peak explicit scaling benchmark (#88)
The explicit scaling code for ze_peak violates the L0 spec and has no
overlap between sub-devices. This PR corrects these issues.

Signed-off-by: Wenbin Lu <wenbin.lu@intel.com>

lyu authored Nov 6, 2024
1 parent 703dc58 commit ae0ea49
Showing 3 changed files with 68 additions and 49 deletions.
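In outline, the corrected explicit-scaling path builds one long command list per sub-device, submits all of the lists before waiting on any of them, and synchronizes every queue only once at the end; that single deferred wait is what lets the sub-devices overlap. The sketch below illustrates the pattern. It assumes per-sub-device queues and command lists have already been created; the SubDevice struct and run_overlapped are illustrative names rather than ze_peak code, and error checking is omitted.

#include <level_zero/ze_api.h>
#include <cstdint>
#include <vector>

// Illustrative per-sub-device state; not ze_peak's actual types.
struct SubDevice {
  ze_command_queue_handle_t queue;
  ze_command_list_handle_t cmd_list;
};

void run_overlapped(std::vector<SubDevice> &subs, ze_kernel_handle_t kernel,
                    const ze_group_count_t &dims, uint32_t iters) {
  // Record all iterations into one long command list per sub-device; the
  // barrier after each launch keeps iterations ordered back-to-back.
  for (auto &sd : subs) {
    zeCommandListReset(sd.cmd_list);
    for (uint32_t i = 0; i < iters; i++) {
      zeCommandListAppendLaunchKernel(sd.cmd_list, kernel, &dims, nullptr, 0,
                                      nullptr);
      zeCommandListAppendBarrier(sd.cmd_list, nullptr, 0, nullptr);
    }
    zeCommandListClose(sd.cmd_list);
  }
  // Submit everything before waiting on anything, so the sub-devices run
  // concurrently even though submission is single-threaded.
  for (auto &sd : subs)
    zeCommandQueueExecuteCommandLists(sd.queue, 1, &sd.cmd_list, nullptr);
  // One synchronization per queue ends the overlapped region.
  for (auto &sd : subs)
    zeCommandQueueSynchronize(sd.queue, UINT64_MAX);
}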
27 changes: 13 additions & 14 deletions perf_tests/ze_peak/src/global_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
numItems = numItems - (numItems % context.sub_device_count);
if (verbose)
std::cout << "splitting the total work items ::" << numItems
<< "across subdevices ::" << context.sub_device_count
<< " across subdevices ::" << context.sub_device_count
<< std::endl;
numItems = set_workgroups(context, numItems / context.sub_device_count,
&workgroup_info);
@@ -121,7 +121,6 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
throw std::runtime_error("zeCommandListAppendMemoryCopy failed: " +
std::to_string(result));
}
i++;
}
} else {
result = zeCommandListAppendMemoryCopy(
@@ -264,7 +263,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
setup_function(context, global_offset_v16,
"global_bandwidth_v16_global_offset", inputBuf, outputBuf);
}
std::cout << "Global memory bandwidth (GBPS)\n";
std::cout << "Global memory bandwidth (GB/s)\n";

timed = 0;
timed_lo = 0;
@@ -295,15 +294,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v1, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v1, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -332,15 +331,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v2, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v2, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -370,15 +369,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v4, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v4, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -407,15 +406,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v8, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v8, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -443,15 +442,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v16, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v16, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

if (context.sub_device_count) {
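The label changes above (GFLOPS and GBPS becoming GB/s) reflect that calculate_gbps reports memory bandwidth, not a floating-point rate. As a hedged reconstruction only, assuming the timed value is in microseconds, the computation behind the label looks roughly like this; the actual ze_peak helper may differ in units or rounding:

// Hypothetical sketch of the bandwidth computation the labels refer to;
// not ze_peak's verbatim helper.
long double calculate_gbps_sketch(long double timed_us, long double num_bytes) {
  long double seconds = timed_us * 1e-6; // assumed microsecond input
  return num_bytes / seconds / 1e9;      // bytes/second scaled to GB/s
}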
16 changes: 8 additions & 8 deletions perf_tests/ze_peak/src/transfer_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -198,7 +198,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Host to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -215,7 +215,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Shared Memory to Host : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -232,7 +232,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, true);
}
std::cout << "System Memory Copy to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -249,7 +249,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, false);
}
std::cout << "System Memory Copy from Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

@@ -328,7 +328,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
if (verbose)
std::cout << "device buffer allocated\n";

std::cout << "Transfer Bandwidth (GBPS)\n";
std::cout << "Transfer Bandwidth (GB/s)\n";

gflops = 0;
if (context.sub_device_count) {
@@ -345,7 +345,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueWriteBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -362,7 +362,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueReadBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

74 changes: 47 additions & 27 deletions perf_tests/ze_peak/src/ze_peak.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -40,8 +40,7 @@ std::vector<uint8_t> L0Context::load_binary_file(const std::string &file_path) {
binary_file.resize(length);
stream.read(reinterpret_cast<char *>(binary_file.data()), length);
if (verbose)
std::cout << "Binary file loaded"
<< "\n";
std::cout << "Binary file loaded\n";
stream.close();

return binary_file;
@@ -144,6 +143,7 @@ void L0Context::print_ze_device_properties(
<< "\n"
<< " * UUID : " << id << "\n"
<< " * coreClockRate : " << std::dec << props.coreClockRate << "\n"
<< " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n"
<< std::endl;
}

@@ -846,13 +846,21 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

if (type == TimingMeasurement::BANDWIDTH) {
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// This branch is taken when we're running the FLAT device hierarchy and
// there are multiple sub-devices per device
if (current_sub_device_id == 0) {
// This is the beginning of the entire explicit scaling benchmark, reset
// all cmdlists for all subdevices just once
for (uint32_t i = 0; i < context.sub_device_count; i++) {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.cmd_list[i]));
}
}
} else {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.command_list));
}

if (context.sub_device_count) {
// Explicit scaling: warmup on the current subdevice
if (verbose) {
std::cout << "current_sub_device_id value is ::"
<< current_sub_device_id << std::endl;
@@ -864,6 +872,8 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
throw std::runtime_error("zeCommandListAppendLaunchKernel failed: " +
std::to_string(result));
}
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
} else {
result = zeCommandListAppendLaunchKernel(
context.command_list, function,
@@ -894,35 +904,45 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

for (uint32_t i = 0; i < warmup_iterations; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
if (verbose)
std::cout << "Warmup finished\n";

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
}
} else {
synchronize_command_queue(context);
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// Append kernel launches & barriers to the current cmdlist and execute once.
// This is required for explicit scaling: since we don't do multi-threaded
// submission, long cmdlists are needed to achieve overlap
for (uint32_t i = 0; i < iters; i++) {
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(
context.cmd_list[current_sub_device_id], function,
&workgroup_info.thread_group_dimensions, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
}
SUCCESS_OR_TERMINATE(
zeCommandListClose(context.cmd_list[current_sub_device_id]));
}

timer.start();
for (uint32_t i = 0; i < iters; i++) {
if (context.sub_device_count) {
run_command_queue(context);

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
if (context.sub_device_count == current_sub_device_id + 1) {
// This is the last subdevice: sync with all subdevices and measure the
// time. Otherwise we skip synchronization and the caller of this
// function proceeds to the remaining subdevices
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
} else {
current_sub_device_id = context.sub_device_count - 1;
}
} else {
for (uint32_t i = 0; i < iters; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
}
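At the API level, the deferred synchronization above reduces to a sync-on-last-sub-device pattern: calls for sub-devices 0 through N-2 return right after submission, and only the call for the last sub-device drains every queue. A hedged expansion of that step, assuming a cmd_queue array mirrors ze_peak's per-sub-device queues (synchronize_command_queue presumably wraps something equivalent), with error checking omitted:

// Hypothetical expansion of the sync-on-last pattern; cmd_queue is an
// assumed member name, not necessarily ze_peak's actual field.
if (current_sub_device_id + 1 == context.sub_device_count) {
  // Last sub-device: wait for every queue, closing the overlapped region
  // that the timer around this sequence measures.
  for (uint32_t i = 0; i < context.sub_device_count; i++) {
    zeCommandQueueSynchronize(context.cmd_queue[i], UINT64_MAX);
  }
}
// Earlier sub-devices fall through without waiting, so their submitted work
// keeps executing while the host moves on to the next sub-device.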
