intel · aelovikov-intel · Feb 12, 2024 · Jan 19, 2024 · Jan 19, 2024 · Jan 23, 2024
@@ -24,7 +24,7 @@
 // the tests, please check if they pass without the discard_events property, if
 // they don't pass then it's most likely a general issue unrelated to
 // discard_events.
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <iostream>

@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is `nullptr` for all PI calls that
 // should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.

@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is not `nullptr` for all PI calls
 // that should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.

@@ -2,7 +2,6 @@
 // https://github.com/intel/llvm/issues/7330.
 // UNSUPPORTED: opencl && gpu
 // RUN: %{build} -o %t.out
-
 // RUN: %{run} %t.out
 
 // The test checks that each PI call to the queue returns a discarded event
@@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) {
   sycl::range<1> range(BUFFER_SIZE);
 
   auto Dev = Q.get_device();
-  int *x = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *x = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(x != nullptr);
-  int *y = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *y = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(y != nullptr);
 
   sycl::event DiscardedEvent;

@@ -9,11 +9,14 @@
 #include <iostream>
 #include <sycl/sycl.hpp>
 
-template <typename T>
-sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
-                                     T *out_data, int num_wg, int group_size) {
+template <typename T, typename AccessorT>
+sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data,
+                                     AccessorT &out_data, int num_wg,
+                                     int group_size) {
   sycl::nd_range<1> thread_range(num_wg * group_size, group_size);
   sycl::event event = queue->submit([&](sycl::handler &cgh) {
+    cgh.require(in_data);
+    cgh.require(out_data);
     cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) {
       auto id = item.get_global_linear_id();
       auto group = item.get_group();
@@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
   return event;
 }
 
-template <typename T>
-void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data,
-                              int num_wg, int group_size) {
-  compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size);
+template <typename T, typename AccessorT>
+void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data,
+                              AccessorT &out_data, int num_wg, int group_size) {
+  compiler_group_scan_impl<T>(queue, in_data, out_data, num_wg, group_size);
 }
 
 int main(int argc, const char **argv) {
-  int num_wg = 1;
-  int group_size = 16;
+  constexpr int num_wg = 1;
+  constexpr int group_size = 16;
 
   sycl::queue queue;
-
-  typedef int T;
-  size_t nelems = num_wg * group_size;
-  T *data = sycl::malloc_shared<T>(nelems, queue);
-  T *result = sycl::malloc_shared<T>(nelems, queue);
-  queue.fill<T>(data, T(2), nelems).wait();
-  queue.memset(result, 0, nelems * sizeof(T)).wait();
-
-  test_compiler_group_scan(&queue, data, result, num_wg, group_size);
-  queue.wait();
-  T expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
-                  256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
-  for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) {
-    assert(result[i] == expected[i]);
+  constexpr size_t nelems = num_wg * group_size;
+  int data[nelems];
+  int result[nelems];
+  for (size_t i = 0; i < nelems; ++i) {
+    data[i] = 2;
+    result[i] = 0;
+  }
+  sycl::buffer<int> data_buf{&data[0], sycl::range{nelems}};
+  sycl::buffer<int> result_buf{&result[0], sycl::range{nelems}};
+  sycl::accessor data_acc{data_buf};
+  sycl::accessor result_acc{result_buf};
+  test_compiler_group_scan<int>(&queue, data_acc, result_acc, num_wg,
+                                group_size);
+  sycl::host_accessor result_host{result_buf};
+  int expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
+                    256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
+  for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) {
+    assert(result_host[i] == expected[i]);
   }
-  sycl::free(data, queue);
-  sycl::free(result, queue);
   return 0;
 }
@@ -41,27 +41,29 @@ void testRootGroup() {
           max_num_work_group_sync>(q);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
-
-  int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
+  sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
-    auto root = it.ext_oneapi_get_root_group();
-    data[root.get_local_id()] = root.get_local_id();
-    sycl::group_barrier(root);
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor data{dataBuf, h};
+    h.parallel_for<class RootGroupKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          auto root = it.ext_oneapi_get_root_group();
+          data[root.get_local_id()] = root.get_local_id();
+          sycl::group_barrier(root);
 
-    root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
-    int sum = data[root.get_local_id()] +
-              data[root.get_local_range() - root.get_local_id() - 1];
-    sycl::group_barrier(root);
-    data[root.get_local_id()] = sum;
+          root =
+              sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
+          int sum = data[root.get_local_id()] +
+                    data[root.get_local_range() - root.get_local_id() - 1];
+          sycl::group_barrier(root);
+          data[root.get_local_id()] = sum;
+        });
   });
-  q.wait();
-
+  sycl::host_accessor data{dataBuf};
   const int workItemCount = static_cast<int>(range.get_global_range().size());
   for (int i = 0; i < workItemCount; i++) {
     assert(data[i] == (workItemCount - 1));
   }
-  sycl::free(data, q);
 }
 
 void testRootGroupFunctions() {
@@ -76,44 +78,45 @@ void testRootGroupFunctions() {
       sycl::ext::oneapi::experimental::use_root_sync};
 
   constexpr int testCount = 10;
-  bool *testResults = sycl::malloc_shared<bool>(testCount, q);
+  sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupFunctionsKernel>(
-      range, props, [=](sycl::nd_item<1> it) {
-        const auto root = it.ext_oneapi_get_root_group();
-        if (root.leader() || root.get_local_id() == 3) {
-          testResults[0] = root.get_group_id() == sycl::id<1>(0);
-          testResults[1] = root.leader()
-                               ? root.get_local_id() == sycl::id<1>(0)
-                               : root.get_local_id() == sycl::id<1>(3);
-          testResults[2] = root.get_group_range() == sycl::range<1>(1);
-          testResults[3] =
-              root.get_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[4] =
-              root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[5] = root.get_group_linear_id() == 0;
-          testResults[6] =
-              root.get_local_linear_id() == root.get_local_id().get(0);
-          testResults[7] = root.get_group_linear_range() == 1;
-          testResults[8] = root.get_local_linear_range() == WorkGroupSize;
-
-          const auto child =
-              sycl::ext::oneapi::experimental::get_child_group(root);
-          const auto grandchild =
-              sycl::ext::oneapi::experimental::get_child_group(child);
-          testResults[9] = child == it.get_group();
-          static_assert(
-              std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
-                             sycl::sub_group>,
-              "get_child_group(sycl::group) must return a sycl::sub_group");
-        }
-      });
-  q.wait();
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor testResults{testResultsBuf, h};
+    h.parallel_for<class RootGroupFunctionsKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          const auto root = it.ext_oneapi_get_root_group();
+          if (root.leader() || root.get_local_id() == 3) {
+            testResults[0] = root.get_group_id() == sycl::id<1>(0);
+            testResults[1] = root.leader()
+                                 ? root.get_local_id() == sycl::id<1>(0)
+                                 : root.get_local_id() == sycl::id<1>(3);
+            testResults[2] = root.get_group_range() == sycl::range<1>(1);
+            testResults[3] =
+                root.get_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[4] =
+                root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[5] = root.get_group_linear_id() == 0;
+            testResults[6] =
+                root.get_local_linear_id() == root.get_local_id().get(0);
+            testResults[7] = root.get_group_linear_range() == 1;
+            testResults[8] = root.get_local_linear_range() == WorkGroupSize;
 
+            const auto child =
+                sycl::ext::oneapi::experimental::get_child_group(root);
+            const auto grandchild =
+                sycl::ext::oneapi::experimental::get_child_group(child);
+            testResults[9] = child == it.get_group();
+            static_assert(
+                std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
+                               sycl::sub_group>,
+                "get_child_group(sycl::group) must return a sycl::sub_group");
+          }
+        });
+  });
+  sycl::host_accessor testResults{testResultsBuf};
   for (int i = 0; i < testCount; i++) {
     assert(testResults[i]);
   }
-  sycl::free(testResults, q);
 }
 
 int main() {

@@ -34,7 +34,8 @@ int main() {
   Failed += Check(Q, "host_task", [&]() {
     return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); });
   });
-
+  if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
+    return Failed;
   constexpr size_t N = 64;
   int *Data1 = sycl::malloc_shared<int>(N, Q);
   int *Data2 = sycl::malloc_shared<int>(N, Q);

@@ -15,20 +15,26 @@ int main() {
   sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
   sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
 
-  int *DevData = sycl::malloc_shared<int>(N, Dev, Ctx);
+  sycl::buffer<int> DevDataBuf{sycl::range{N}};
+  sycl::accessor DevData{DevDataBuf};
   int *HostData = (int *)malloc(N * sizeof(int) * 10);
 
   for (size_t I = 0; I < 10; ++I) {
-    Q1.fill<int>(DevData, 0, N);
-    sycl::event E1 = Q1.parallel_for(
-        N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    Q1.fill(DevData, 0);
+    sycl::event E1 = Q1.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(
+          N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    });
 
     Q2.ext_oneapi_set_external_event(E1);
-    sycl::event E2 =
-        Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    sycl::event E2 = Q2.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    });
 
     Q1.ext_oneapi_set_external_event(E2);
-    Q1.copy(DevData, HostData + N * I, N);
+    Q1.copy(DevData, HostData + N * I);
   }
 
   Q1.wait_and_throw();
@@ -46,9 +52,6 @@ int main() {
       }
     }
   }
-
-  sycl::free(DevData, Ctx);
   free(HostData);
-
   return Failures;
 }
@@ -74,7 +74,7 @@ int main() {
   // CHECK-CACHE: piKernelRelease
   // CHECK-CACHE: piProgramRelease
   // CHECK-CACHE: piEventsWait
-  auto *p = malloc_shared<int>(1, q);
+  auto *p = malloc_device<int>(1, q);
   for (int i = 0; i < 2; ++i)
     q.submit([&](handler &cgh) {
        cgh.set_specialization_constant<spec_id>(i);

@@ -14,19 +14,21 @@ int main() {
   queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
   queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-  int *in1 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in2 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in3 = sycl::malloc_shared<int>(dataSize, q1);
-  int *tmp = sycl::malloc_shared<int>(dataSize, q1);
-  int *out = sycl::malloc_shared<int>(dataSize, q1);
-
-  for (size_t i = 0; i < dataSize; ++i) {
-    in1[i] = i * 2;
-    in2[i] = i * 3;
-    in3[i] = i * 4;
-    tmp[i] = -1;
-    out[i] = -1;
-  }
+  int *in1 = sycl::malloc_device<int>(dataSize, q1);
+  int *in2 = sycl::malloc_device<int>(dataSize, q1);
+  int *in3 = sycl::malloc_device<int>(dataSize, q1);
+  int *tmp = sycl::malloc_device<int>(dataSize, q1);
+  int *out = sycl::malloc_device<int>(dataSize, q1);
+
+  q1.single_task<class InitKernel>([=]() {
+      for (size_t i = 0; i < dataSize; ++i) {
+        in1[i] = i * 2;
+        in2[i] = i * 3;
+        in3[i] = i * 4;
+        tmp[i] = -1;
+        out[i] = -1;
+      }
+    }).wait();
 
   ext::codeplay::experimental::fusion_wrapper fw1{q1};
   fw1.start_fusion();
@@ -71,10 +73,12 @@ int main() {
 
   q1.wait();
   q2.wait();
-
+  int host_out[dataSize];
+  q1.memcpy(host_out, out, dataSize * sizeof(int));
+  q1.wait();
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (40 * i * i) && "Computation error");
+    assert(host_out[i] == (40 * i * i) && "Computation error");
   }
   sycl::free(in1, q1);
   sycl::free(in2, q1);