
Fix ze_peak explicit scaling benchmark #88

Merged: 2 commits, Nov 6, 2024
Changes from all commits
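Background on the fix: "explicit scaling" means ze_peak drives each sub-device (tile) of a multi-tile GPU through its own command list and queue instead of relying on the driver to split work implicitly. As a minimal sketch (standard Level Zero API, error handling omitted; this is not code from the PR), sub-devices are enumerated like this:

#include <level_zero/ze_api.h>
#include <vector>

// Each returned handle can get its own command queue and command list,
// which is the basis of the explicit scaling path benchmarked below.
std::vector<ze_device_handle_t> get_sub_devices(ze_device_handle_t device) {
  uint32_t count = 0;
  zeDeviceGetSubDevices(device, &count, nullptr); // first call queries the count
  std::vector<ze_device_handle_t> sub_devices(count);
  zeDeviceGetSubDevices(device, &count, sub_devices.data());
  return sub_devices;
}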
27 changes: 13 additions & 14 deletions perf_tests/ze_peak/src/global_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
numItems = numItems - (numItems % context.sub_device_count);
if (verbose)
std::cout << "splitting the total work items ::" << numItems
<< "across subdevices ::" << context.sub_device_count
<< " across subdevices ::" << context.sub_device_count
<< std::endl;
numItems = set_workgroups(context, numItems / context.sub_device_count,
&workgroup_info);
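For illustration of the truncation above (hypothetical numbers, not from the PR):

// numItems = 1000003, sub_device_count = 4
// 1000003 - (1000003 % 4) = 1000000 total items
// 1000000 / 4 = 250000 items handed to set_workgroups per sub-device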
@@ -121,7 +121,6 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
throw std::runtime_error("zeCommandListAppendMemoryCopy failed: " +
std::to_string(result));
}
i++;
}
} else {
result = zeCommandListAppendMemoryCopy(
@@ -264,7 +263,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
setup_function(context, global_offset_v16,
"global_bandwidth_v16_global_offset", inputBuf, outputBuf);
}
std::cout << "Global memory bandwidth (GBPS)\n";
std::cout << "Global memory bandwidth (GB/s)\n";

timed = 0;
timed_lo = 0;
@@ -295,15 +294,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v1, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v1, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -332,15 +331,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v2, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v2, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -370,15 +369,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v4, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v4, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -407,15 +406,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v8, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v8, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -443,15 +442,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v16, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v16, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

if (context.sub_device_count) {
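The relabeling in this file is cosmetic but worth pinning down: the printed value is bandwidth, not FLOPS, and in the sub-device branch the byte count passed in is numItems * context.sub_device_count * sizeof(float), i.e. the total moved across all tiles. A plausible sketch of the conversion, assuming the timer reports nanoseconds (hypothetical helper, not necessarily the repository's exact calculate_gbps):

// bytes / nanoseconds equals GB/s exactly, since giga and nano are both 1e9.
long double calculate_gbps(long double nanoseconds, long double total_bytes) {
  return total_bytes / nanoseconds;
}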
16 changes: 8 additions & 8 deletions perf_tests/ze_peak/src/transfer_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -198,7 +198,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Host to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -215,7 +215,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Shared Memory to Host : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -232,7 +232,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, true);
}
std::cout << "System Memory Copy to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -249,7 +249,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, false);
}
std::cout << "System Memory Copy from Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

@@ -328,7 +328,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
if (verbose)
std::cout << "device buffer allocated\n";

std::cout << "Transfer Bandwidth (GBPS)\n";
std::cout << "Transfer Bandwidth (GB/s)\n";

gflops = 0;
if (context.sub_device_count) {
@@ -345,7 +345,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueWriteBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -362,7 +362,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueReadBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

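For context on the shared-memory paths measured above, a hedged sketch of how a shared allocation is created with the standard Level Zero API (the context, device, and buffer_size names are placeholders; error handling omitted):

void *shared_ptr = nullptr;
ze_device_mem_alloc_desc_t device_desc = {};
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
ze_host_mem_alloc_desc_t host_desc = {};
host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
// Shared memory is accessible from both host and device, which is what the
// "GPU Copy Host to Shared Memory" style measurements exercise.
zeMemAllocShared(context, &device_desc, &host_desc, buffer_size,
                 /*alignment=*/64, device, &shared_ptr);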
74 changes: 47 additions & 27 deletions perf_tests/ze_peak/src/ze_peak.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -40,8 +40,7 @@ std::vector<uint8_t> L0Context::load_binary_file(const std::string &file_path) {
binary_file.resize(length);
stream.read(reinterpret_cast<char *>(binary_file.data()), length);
if (verbose)
std::cout << "Binary file loaded"
<< "\n";
std::cout << "Binary file loaded\n";
stream.close();

return binary_file;
@@ -144,6 +143,7 @@ void L0Context::print_ze_device_properties(
<< "\n"
<< " * UUID : " << id << "\n"
<< " * coreClockRate : " << std::dec << props.coreClockRate << "\n"
<< " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n"
<< std::endl;
}
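The added maxMemAllocSize line prints one more field of ze_device_properties_t; for reference, a minimal sketch of the query it relies on (standard API, error handling omitted):

#include <level_zero/ze_api.h>
#include <iostream>

void print_max_alloc(ze_device_handle_t device) {
  ze_device_properties_t props = {};
  props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  zeDeviceGetProperties(device, &props); // fills coreClockRate, maxMemAllocSize, ...
  std::cout << " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n";
}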

@@ -846,13 +846,21 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

if (type == TimingMeasurement::BANDWIDTH) {
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// This branch is taken when we're running the FLAT device hierarchy and
// there are multiple sub-devices per device
if (current_sub_device_id == 0) {
// This is the beginning of the entire explicit scaling benchmark; reset
// all cmdlists for all subdevices just once
for (uint32_t i = 0; i < context.sub_device_count; i++) {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.cmd_list[i]));
}
}
} else {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.command_list));
}

if (context.sub_device_count) {
// Explicit scaling: warmup on the current subdevice
if (verbose) {
std::cout << "current_sub_device_id value is ::"
<< current_sub_device_id << std::endl;
@@ -864,6 +872,8 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
throw std::runtime_error("zeCommandListAppendLaunchKernel failed: " +
std::to_string(result));
}
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
} else {
result = zeCommandListAppendLaunchKernel(
context.command_list, function,
@@ -894,35 +904,45 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

for (uint32_t i = 0; i < warmup_iterations; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
if (verbose)
std::cout << "Warmup finished\n";

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
}
} else {
synchronize_command_queue(context);
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// Append kernel launches & barriers to the current cmdlist and execute once
// This is required for explicit scaling since we don't do multi-threaded
// submission, so long cmdlists are needed to achieve overlap
for (uint32_t i = 0; i < iters; i++) {
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(
context.cmd_list[current_sub_device_id], function,
&workgroup_info.thread_group_dimensions, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
}
SUCCESS_OR_TERMINATE(
zeCommandListClose(context.cmd_list[current_sub_device_id]));
}

timer.start();
for (uint32_t i = 0; i < iters; i++) {
if (context.sub_device_count) {
run_command_queue(context);

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
if (context.sub_device_count == current_sub_device_id + 1) {
// This is the last subdevice; sync with all subdevices and measure the
// time. Otherwise we skip synchronization and the caller of this
// function proceeds to the remaining subdevices
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
} else {
current_sub_device_id = context.sub_device_count - 1;
}
} else {
for (uint32_t i = 0; i < iters; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
}
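Summarizing the new timing path: instead of enqueueing one kernel per iteration and synchronizing every time, the fix records all iterations into each sub-device's command list up front, executes the list once, and synchronizes only after reaching the last sub-device. A condensed sketch of that pattern (hypothetical helper; assumes valid handles and omits error handling):

#include <level_zero/ze_api.h>

void submit_batched(ze_command_list_handle_t cmd_list,
                    ze_command_queue_handle_t cmd_queue,
                    ze_kernel_handle_t kernel, ze_group_count_t dims,
                    uint32_t iters) {
  for (uint32_t i = 0; i < iters; i++) {
    zeCommandListAppendLaunchKernel(cmd_list, kernel, &dims, nullptr, 0,
                                    nullptr);
    // The barrier keeps successive iterations ordered within the list.
    zeCommandListAppendBarrier(cmd_list, nullptr, 0, nullptr);
  }
  zeCommandListClose(cmd_list);
  // One execute per sub-device; overlap comes from the queues running
  // concurrently, not from multi-threaded submission.
  zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr);
}

This mirrors why the diff appends a barrier after each launch and closes each command list before timer.start().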