
Fix ze_peak explicit scaling benchmark #88

Merged: 2 commits, Nov 6, 2024
Changes from all commits
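Background on the fix: "explicit scaling" means ze_peak drives each sub-device (tile) of a multi-tile GPU through its own command list and queue instead of relying on the driver to split work implicitly. As a minimal sketch (standard Level Zero API, error handling omitted; this is not code from the PR), sub-devices are enumerated like this:

#include <level_zero/ze_api.h>
#include <vector>

// Each returned handle can get its own command queue and command list,
// which is the basis of the explicit scaling path benchmarked below.
std::vector<ze_device_handle_t> get_sub_devices(ze_device_handle_t device) {
  uint32_t count = 0;
  zeDeviceGetSubDevices(device, &count, nullptr); // first call queries the count
  std::vector<ze_device_handle_t> sub_devices(count);
  zeDeviceGetSubDevices(device, &count, sub_devices.data());
  return sub_devices;
}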
27 changes: 13 additions & 14 deletions perf_tests/ze_peak/src/global_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
numItems = numItems - (numItems % context.sub_device_count);
if (verbose)
std::cout << "splitting the total work items ::" << numItems
<< "across subdevices ::" << context.sub_device_count
<< " across subdevices ::" << context.sub_device_count
<< std::endl;
numItems = set_workgroups(context, numItems / context.sub_device_count,
&workgroup_info);
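For illustration of the truncation above (hypothetical numbers, not from the PR):

// numItems = 1000003, sub_device_count = 4
// 1000003 - (1000003 % 4) = 1000000 total items
// 1000000 / 4 = 250000 items handed to set_workgroups per sub-device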
@@ -121,7 +121,6 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
throw std::runtime_error("zeCommandListAppendMemoryCopy failed: " +
std::to_string(result));
}
i++;
}
} else {
result = zeCommandListAppendMemoryCopy(
@@ -264,7 +263,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
setup_function(context, global_offset_v16,
"global_bandwidth_v16_global_offset", inputBuf, outputBuf);
}
std::cout << "Global memory bandwidth (GBPS)\n";
std::cout << "Global memory bandwidth (GB/s)\n";

timed = 0;
timed_lo = 0;
@@ -295,15 +294,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v1, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v1, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -332,15 +331,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v2, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v2, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -370,15 +369,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v4, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v4, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -407,15 +406,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v8, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v8, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

timed = 0;
@@ -443,15 +442,15 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
timed = (timed_lo < timed_go) ? timed_lo : timed_go;
gbps = calculate_gbps(timed,
numItems * context.sub_device_count * sizeof(float));
std::cout << gbps << " GFLOPS\n";
std::cout << gbps << " GB/s\n";
} else {
timed_lo = run_kernel(context, local_offset_v16, workgroup_info, type);
timed_go = run_kernel(context, global_offset_v16, workgroup_info, type);
timed = (timed_lo < timed_go) ? timed_lo : timed_go;

gbps = calculate_gbps(timed, numItems * sizeof(float));

std::cout << gbps << " GBPS\n";
std::cout << gbps << " GB/s\n";
}

if (context.sub_device_count) {
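The relabeling in this file is cosmetic but worth pinning down: the printed value is bandwidth, not FLOPS, and in the sub-device branch the byte count passed in is numItems * context.sub_device_count * sizeof(float), i.e. the total moved across all tiles. A plausible sketch of the conversion, assuming the timer reports nanoseconds (hypothetical helper, not necessarily the repository's exact calculate_gbps):

// bytes / nanoseconds equals GB/s exactly, since giga and nano are both 1e9.
long double calculate_gbps(long double nanoseconds, long double total_bytes) {
  return total_bytes / nanoseconds;
}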
16 changes: 8 additions & 8 deletions perf_tests/ze_peak/src/transfer_bw.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -198,7 +198,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Host to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -215,7 +215,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size);
}
std::cout << "GPU Copy Shared Memory to Host : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -232,7 +232,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, true);
}
std::cout << "System Memory Copy to Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -249,7 +249,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
local_memory_size, false);
}
std::cout << "System Memory Copy from Shared Memory : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

@@ -328,7 +328,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
if (verbose)
std::cout << "device buffer allocated\n";

std::cout << "Transfer Bandwidth (GBPS)\n";
std::cout << "Transfer Bandwidth (GB/s)\n";

gflops = 0;
if (context.sub_device_count) {
@@ -345,7 +345,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueWriteBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

gflops = 0;
if (context.sub_device_count) {
@@ -362,7 +362,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
local_memory_size);
}
std::cout << "enqueueReadBuffer : ";
std::cout << gflops << " GBPS\n";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;

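For context on the shared-memory paths measured above, a hedged sketch of how a shared allocation is created with the standard Level Zero API (the context, device, and buffer_size names are placeholders; error handling omitted):

void *shared_ptr = nullptr;
ze_device_mem_alloc_desc_t device_desc = {};
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
ze_host_mem_alloc_desc_t host_desc = {};
host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
// Shared memory is accessible from both host and device, which is what the
// "GPU Copy Host to Shared Memory" style measurements exercise.
zeMemAllocShared(context, &device_desc, &host_desc, buffer_size,
                 /*alignment=*/64, device, &shared_ptr);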
74 changes: 47 additions & 27 deletions perf_tests/ze_peak/src/ze_peak.cpp
@@ -1,6 +1,6 @@
/*
*
* Copyright (C) 2019 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -40,8 +40,7 @@ std::vector<uint8_t> L0Context::load_binary_file(const std::string &file_path) {
binary_file.resize(length);
stream.read(reinterpret_cast<char *>(binary_file.data()), length);
if (verbose)
std::cout << "Binary file loaded"
<< "\n";
std::cout << "Binary file loaded\n";
stream.close();

return binary_file;
@@ -144,6 +143,7 @@ void L0Context::print_ze_device_properties(
<< "\n"
<< " * UUID : " << id << "\n"
<< " * coreClockRate : " << std::dec << props.coreClockRate << "\n"
<< " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n"
<< std::endl;
}
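The added maxMemAllocSize line prints one more field of ze_device_properties_t; for reference, a minimal sketch of the query it relies on (standard API, error handling omitted):

#include <level_zero/ze_api.h>
#include <iostream>

void print_max_alloc(ze_device_handle_t device) {
  ze_device_properties_t props = {};
  props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  zeDeviceGetProperties(device, &props); // fills coreClockRate, maxMemAllocSize, ...
  std::cout << " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n";
}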

@@ -846,13 +846,21 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

if (type == TimingMeasurement::BANDWIDTH) {
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// This branch is taken when we're running the FLAT device hierarchy and
// there are multiple sub-devices per device
if (current_sub_device_id == 0) {
// This is the beginning of the entire explicit scaling benchmark; reset
// all cmdlists for all subdevices just once
for (uint32_t i = 0; i < context.sub_device_count; i++) {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.cmd_list[i]));
}
}
} else {
SUCCESS_OR_TERMINATE(zeCommandListReset(context.command_list));
}

if (context.sub_device_count) {
// Explicit scaling: warmup on the current subdevice
if (verbose) {
std::cout << "current_sub_device_id value is ::"
<< current_sub_device_id << std::endl;
@@ -864,6 +872,8 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
throw std::runtime_error("zeCommandListAppendLaunchKernel failed: " +
std::to_string(result));
}
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
} else {
result = zeCommandListAppendLaunchKernel(
context.command_list, function,
@@ -894,35 +904,45 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,

for (uint32_t i = 0; i < warmup_iterations; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
if (verbose)
std::cout << "Warmup finished\n";

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
}
} else {
synchronize_command_queue(context);
if (context.sub_device_count) {
SUCCESS_OR_TERMINATE(
zeCommandListReset(context.cmd_list[current_sub_device_id]));
// Append kernel launches & barriers to the current cmdlist and execute once
// This is required for explicit scaling since we don't do multi-threaded
// submission, so long cmdlists are needed to achieve overlap
for (uint32_t i = 0; i < iters; i++) {
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(
context.cmd_list[current_sub_device_id], function,
&workgroup_info.thread_group_dimensions, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
}
SUCCESS_OR_TERMINATE(
zeCommandListClose(context.cmd_list[current_sub_device_id]));
}

timer.start();
for (uint32_t i = 0; i < iters; i++) {
if (context.sub_device_count) {
run_command_queue(context);

if (context.sub_device_count) {
if (context.sub_device_count == current_sub_device_id + 1) {
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
current_sub_device_id = context.sub_device_count - 1;
if (context.sub_device_count == current_sub_device_id + 1) {
// This is the last subdevice; sync with all subdevices and measure the
// time. Otherwise we skip synchronization and the caller of this
// function proceeds to the remaining subdevices
current_sub_device_id = 0;
while (current_sub_device_id < context.sub_device_count) {
synchronize_command_queue(context);
current_sub_device_id++;
}
} else {
current_sub_device_id = context.sub_device_count - 1;
}
} else {
for (uint32_t i = 0; i < iters; i++) {
run_command_queue(context);
synchronize_command_queue(context);
}
}
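Summarizing the new timing path: instead of enqueueing one kernel per iteration and synchronizing every time, the fix records all iterations into each sub-device's command list up front, executes the list once, and synchronizes only after reaching the last sub-device. A condensed sketch of that pattern (hypothetical helper; assumes valid handles and omits error handling):

#include <level_zero/ze_api.h>

void submit_batched(ze_command_list_handle_t cmd_list,
                    ze_command_queue_handle_t cmd_queue,
                    ze_kernel_handle_t kernel, ze_group_count_t dims,
                    uint32_t iters) {
  for (uint32_t i = 0; i < iters; i++) {
    zeCommandListAppendLaunchKernel(cmd_list, kernel, &dims, nullptr, 0,
                                    nullptr);
    // The barrier keeps successive iterations ordered within the list.
    zeCommandListAppendBarrier(cmd_list, nullptr, 0, nullptr);
  }
  zeCommandListClose(cmd_list);
  // One execute per sub-device; overlap comes from the queues running
  // concurrently, not from multi-threaded submission.
  zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr);
}

This mirrors why the diff appends a barrier after each launch and closes each command list before timer.start().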