From bee45c380d0c603809a4813ec1c4cac6967a8fca Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 18 Jan 2024 16:15:54 -0800
Subject: [PATCH 01/19] Throw when USM not supported by Device

---
 sycl/source/detail/usm/usm_impl.cpp | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp
index 396750acf3044..4260bb8569b07 100644
--- a/sycl/source/detail/usm/usm_impl.cpp
+++ b/sycl/source/detail/usm/usm_impl.cpp
@@ -252,6 +252,11 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt,
   PrepareNotify.scopedNotify(
       (uint16_t)xpti::trace_point_type_t::mem_alloc_begin);
 #endif
+  if (Kind == alloc::device &&
+    !Dev.has(sycl::aspect::usm_device_allocations)) {
+        throw sycl::exception(sycl::errc::feature_not_supported,
+            "Device does not support Unified Shared Memory!");
+  }
   void *RetVal =
       alignedAllocInternal(Alignment, Size, getSyclObjImpl(Ctxt).get(),
                            getSyclObjImpl(Dev).get(), Kind, PropList);

From f3b90a4b8fac46549fd6d1577be75017dc70684d Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 13:52:48 -0800
Subject: [PATCH 02/19] Rewrite tests

---
 .../discard_events_mixed_calls.cpp            |  2 +-
 .../DiscardEvents/discard_events_usm.cpp      |  2 +-
 .../discard_events_usm_ooo_queue.cpp          |  2 +-
 sycl/test-e2e/DiscardEvents/invalid_event.cpp |  2 +-
 .../exclusive_scan_over_group.cpp             | 55 ++++++-----
 sycl/test-e2e/GroupAlgorithm/root_group.cpp   | 97 ++++++++++---------
 .../InOrderEventsExt/get_last_event.cpp       |  3 +-
 .../InOrderEventsExt/set_external_event.cpp   | 23 +++--
 .../KernelAndProgram/disable-caching.cpp      |  7 +-
 .../sync_two_queues_event_dep.cpp             | 49 ++++++----
 .../test-e2e/KernelFusion/sync_usm_mem_op.cpp |  2 +-
 11 files changed, 132 insertions(+), 112 deletions(-)

diff --git a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
index 8206ea5b449c3..8399951d16298 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
@@ -24,7 +24,7 @@
 // the tests, please check if they pass without the discard_events property, if
 // they don't pass then it's most likely a general issue unrelated to
 // discard_events.
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <iostream>
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
index 11288d6620bfd..48ab65c68896c 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is `nullptr` for all PI calls that
 // should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
index cfe72db0c1232..96d53a632beb6 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is not `nullptr` for all PI calls
 // that should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.
diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
index 273e74afb6c25..4fc0ee0f1495b 100644
--- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp
+++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
@@ -2,7 +2,7 @@
 // https://github.com/intel/llvm/issues/7330.
 // UNSUPPORTED: opencl && gpu
 // RUN: %{build} -o %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 // RUN: %{run} %t.out
 
 // The test checks that each PI call to the queue returns a discarded event
diff --git a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
index 6411131ef33ff..8f7ee3a55352c 100644
--- a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
@@ -9,11 +9,14 @@
 #include <iostream>
 #include <sycl/sycl.hpp>
 
-template <typename T>
-sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
-                                     T *out_data, int num_wg, int group_size) {
+template <typename T, typename AccessorT>
+sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data,
+                                     AccessorT &out_data, int num_wg,
+                                     int group_size) {
   sycl::nd_range<1> thread_range(num_wg * group_size, group_size);
   sycl::event event = queue->submit([&](sycl::handler &cgh) {
+    cgh.require(in_data);
+    cgh.require(out_data);
     cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) {
       auto id = item.get_global_linear_id();
       auto group = item.get_group();
@@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
   return event;
 }
 
-template <typename T>
-void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data,
-                              int num_wg, int group_size) {
-  compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size);
+template <typename T, typename AccessorT>
+void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data,
+                              AccessorT &out_data, int num_wg, int group_size) {
+  compiler_group_scan_impl<T>(queue, in_data, out_data, num_wg, group_size);
 }
 
 int main(int argc, const char **argv) {
-  int num_wg = 1;
-  int group_size = 16;
+  constexpr int num_wg = 1;
+  constexpr int group_size = 16;
 
   sycl::queue queue;
-
-  typedef int T;
-  size_t nelems = num_wg * group_size;
-  T *data = sycl::malloc_shared<T>(nelems, queue);
-  T *result = sycl::malloc_shared<T>(nelems, queue);
-  queue.fill<T>(data, T(2), nelems).wait();
-  queue.memset(result, 0, nelems * sizeof(T)).wait();
-
-  test_compiler_group_scan(&queue, data, result, num_wg, group_size);
-  queue.wait();
-  T expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
-                  256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
-  for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) {
-    assert(result[i] == expected[i]);
+  constexpr size_t nelems = num_wg * group_size;
+  int data[nelems];
+  int result[nelems];
+  for (size_t i = 0; i < nelems; ++i) {
+    data[i] = 2;
+    result[i] = 0;
+  }
+  sycl::buffer<int> data_buf{&data[0], sycl::range{nelems}};
+  sycl::buffer<int> result_buf{&result[0], sycl::range{nelems}};
+  sycl::accessor data_acc{data_buf};
+  sycl::accessor result_acc{result_buf};
+  test_compiler_group_scan<int>(&queue, data_acc, result_acc, num_wg,
+                                group_size);
+  sycl::host_accessor result_host{result_buf};
+  int expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
+                    256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
+  for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) {
+    assert(result_host[i] == expected[i]);
   }
-  sycl::free(data, queue);
-  sycl::free(result, queue);
   return 0;
 }
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index ba0c49fa68bf7..584ecfd22fe0b 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -41,27 +41,29 @@ void testRootGroup() {
           max_num_work_group_sync>(q);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
-
-  int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
+  sycl::buffer<int> data_buf{sycl::range{maxWGs * WorkGroupSize}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
-    auto root = it.ext_oneapi_get_root_group();
-    data[root.get_local_id()] = root.get_local_id();
-    sycl::group_barrier(root);
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor data{data_buf, h};
+    h.parallel_for<class RootGroupKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          auto root = it.ext_oneapi_get_root_group();
+          data[root.get_local_id()] = root.get_local_id();
+          sycl::group_barrier(root);
 
-    root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
-    int sum = data[root.get_local_id()] +
-              data[root.get_local_range() - root.get_local_id() - 1];
-    sycl::group_barrier(root);
-    data[root.get_local_id()] = sum;
+          root =
+              sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
+          int sum = data[root.get_local_id()] +
+                    data[root.get_local_range() - root.get_local_id() - 1];
+          sycl::group_barrier(root);
+          data[root.get_local_id()] = sum;
+        });
   });
-  q.wait();
-
+  sycl::host_accessor data{data_buf};
   const int workItemCount = static_cast<int>(range.get_global_range().size());
   for (int i = 0; i < workItemCount; i++) {
     assert(data[i] == (workItemCount - 1));
   }
-  sycl::free(data, q);
 }
 
 void testRootGroupFunctions() {
@@ -76,44 +78,45 @@ void testRootGroupFunctions() {
       sycl::ext::oneapi::experimental::use_root_sync};
 
   constexpr int testCount = 10;
-  bool *testResults = sycl::malloc_shared<bool>(testCount, q);
+  sycl::buffer<bool> testResults_buf{sycl::range{testCount}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupFunctionsKernel>(
-      range, props, [=](sycl::nd_item<1> it) {
-        const auto root = it.ext_oneapi_get_root_group();
-        if (root.leader() || root.get_local_id() == 3) {
-          testResults[0] = root.get_group_id() == sycl::id<1>(0);
-          testResults[1] = root.leader()
-                               ? root.get_local_id() == sycl::id<1>(0)
-                               : root.get_local_id() == sycl::id<1>(3);
-          testResults[2] = root.get_group_range() == sycl::range<1>(1);
-          testResults[3] =
-              root.get_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[4] =
-              root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[5] = root.get_group_linear_id() == 0;
-          testResults[6] =
-              root.get_local_linear_id() == root.get_local_id().get(0);
-          testResults[7] = root.get_group_linear_range() == 1;
-          testResults[8] = root.get_local_linear_range() == WorkGroupSize;
-
-          const auto child =
-              sycl::ext::oneapi::experimental::get_child_group(root);
-          const auto grandchild =
-              sycl::ext::oneapi::experimental::get_child_group(child);
-          testResults[9] = child == it.get_group();
-          static_assert(
-              std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
-                             sycl::sub_group>,
-              "get_child_group(sycl::group) must return a sycl::sub_group");
-        }
-      });
-  q.wait();
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor testResults{testResults_buf, h};
+    h.parallel_for<class RootGroupFunctionsKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          const auto root = it.ext_oneapi_get_root_group();
+          if (root.leader() || root.get_local_id() == 3) {
+            testResults[0] = root.get_group_id() == sycl::id<1>(0);
+            testResults[1] = root.leader()
+                                 ? root.get_local_id() == sycl::id<1>(0)
+                                 : root.get_local_id() == sycl::id<1>(3);
+            testResults[2] = root.get_group_range() == sycl::range<1>(1);
+            testResults[3] =
+                root.get_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[4] =
+                root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[5] = root.get_group_linear_id() == 0;
+            testResults[6] =
+                root.get_local_linear_id() == root.get_local_id().get(0);
+            testResults[7] = root.get_group_linear_range() == 1;
+            testResults[8] = root.get_local_linear_range() == WorkGroupSize;
 
+            const auto child =
+                sycl::ext::oneapi::experimental::get_child_group(root);
+            const auto grandchild =
+                sycl::ext::oneapi::experimental::get_child_group(child);
+            testResults[9] = child == it.get_group();
+            static_assert(
+                std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
+                               sycl::sub_group>,
+                "get_child_group(sycl::group) must return a sycl::sub_group");
+          }
+        });
+  });
+  sycl::host_accessor testResults{testResults_buf};
   for (int i = 0; i < testCount; i++) {
     assert(testResults[i]);
   }
-  sycl::free(testResults, q);
 }
 
 int main() {
diff --git a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
index 3393202b5a370..aeb4ab44acb49 100644
--- a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
@@ -34,7 +34,8 @@ int main() {
   Failed += Check(Q, "host_task", [&]() {
     return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); });
   });
-
+  if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
+    return Failed;
   constexpr size_t N = 64;
   int *Data1 = sycl::malloc_shared<int>(N, Q);
   int *Data2 = sycl::malloc_shared<int>(N, Q);
diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
index 45e5815606dbe..4f2a409fa3d46 100644
--- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
@@ -15,20 +15,26 @@ int main() {
   sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
   sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
 
-  int *DevData = sycl::malloc_shared<int>(N, Dev, Ctx);
+  sycl::buffer<int> DevDatabuf{sycl::range{N}};
+  sycl::accessor DevData{DevDatabuf};
   int *HostData = (int *)malloc(N * sizeof(int) * 10);
 
   for (size_t I = 0; I < 10; ++I) {
-    Q1.fill<int>(DevData, 0, N);
-    sycl::event E1 = Q1.parallel_for(
-        N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    Q1.fill(DevData, 0);
+    sycl::event E1 = Q1.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(
+          N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    });
 
     Q2.ext_oneapi_set_external_event(E1);
-    sycl::event E2 =
-        Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    sycl::event E2 = Q2.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    });
 
     Q1.ext_oneapi_set_external_event(E2);
-    Q1.copy(DevData, HostData + N * I, N);
+    Q1.copy(DevData, HostData + N * I);
   }
 
   Q1.wait_and_throw();
@@ -47,8 +53,5 @@ int main() {
     }
   }
 
-  sycl::free(DevData, Ctx);
-  free(HostData);
-
   return Failures;
 }
diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
index 4276efe36366f..b48467af03a63 100644
--- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
+++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
@@ -74,16 +74,15 @@ int main() {
   // CHECK-CACHE: piKernelRelease
   // CHECK-CACHE: piProgramRelease
   // CHECK-CACHE: piEventsWait
-  auto *p = malloc_shared<int>(1, q);
+  sycl::buffer<int> p_buf{sycl::range{1}};
   for (int i = 0; i < 2; ++i)
     q.submit([&](handler &cgh) {
+       sycl::accessor p{p_buf, cgh};
        cgh.set_specialization_constant<spec_id>(i);
        cgh.parallel_for(1, [=](auto, kernel_handler kh) {
-         *p = kh.get_specialization_constant<spec_id>();
+         p[0] = kh.get_specialization_constant<spec_id>();
        });
      }).wait();
-
-  free(p, q);
 }
 
 // (Program cache releases)
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index 38286c48b8a0f..e6096f231ef9b 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -13,27 +13,37 @@ int main() {
 
   queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
   queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
-
-  int *in1 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in2 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in3 = sycl::malloc_shared<int>(dataSize, q1);
-  int *tmp = sycl::malloc_shared<int>(dataSize, q1);
-  int *out = sycl::malloc_shared<int>(dataSize, q1);
-
+  int in1_arr[dataSize];
+  int in2_arr[dataSize];
+  int in3_arr[dataSize];
+  int tmp_arr[dataSize];
+  int out_arr[dataSize];
   for (size_t i = 0; i < dataSize; ++i) {
-    in1[i] = i * 2;
-    in2[i] = i * 3;
-    in3[i] = i * 4;
-    tmp[i] = -1;
-    out[i] = -1;
+    in1_arr[i] = i * 2;
+    in2_arr[i] = i * 3;
+    in3_arr[i] = i * 4;
+    tmp_arr[i] = -1;
+    out_arr[i] = -1;
   }
-
+  sycl::buffer<int> in1_buf{&in1_arr[0], sycl::range{dataSize}};
+  sycl::buffer<int> in2_buf{&in2_arr[0], sycl::range{dataSize}};
+  sycl::buffer<int> in3_buf{&in3_arr[0], sycl::range{dataSize}};
+  sycl::buffer<int> tmp_buf{&tmp_arr[0], sycl::range{dataSize}};
+  sycl::buffer<int> out_buf{&out_arr[0], sycl::range{dataSize}};
+  sycl::accessor in1{in1_buf};
+  sycl::accessor in2{in2_buf};
+  sycl::accessor in3{in3_buf};
+  sycl::accessor tmp{tmp_buf};
+  sycl::accessor out{out_buf};
   ext::codeplay::experimental::fusion_wrapper fw1{q1};
   fw1.start_fusion();
 
   assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode");
 
   auto kernel1 = q1.submit([&](handler &cgh) {
+    cgh.require(in1);
+    cgh.require(in2);
+    cgh.require(tmp);
     cgh.parallel_for<class KernelOne>(
         dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
   });
@@ -42,6 +52,7 @@ int main() {
   fw2.start_fusion();
 
   auto kernel3 = q2.submit([&](handler &cgh) {
+    cgh.require(tmp);
     cgh.depends_on(kernel1);
     cgh.parallel_for<class KernelThree>(dataSize,
                                         [=](id<1> i) { tmp[i] *= 2; });
@@ -55,6 +66,9 @@ int main() {
   assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode");
 
   auto kernel2 = q1.submit([&](handler &cgh) {
+    cgh.require(tmp);
+    cgh.require(in3);
+    cgh.require(out);
     cgh.depends_on(kernel3);
     cgh.parallel_for<class KernelTwo>(
         dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
@@ -71,16 +85,11 @@ int main() {
 
   q1.wait();
   q2.wait();
-
+  sycl::host_accessor out_host{out_buf};
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (40 * i * i) && "Computation error");
+    assert(out_host[i] == (40 * i * i) && "Computation error");
   }
-  sycl::free(in1, q1);
-  sycl::free(in2, q1);
-  sycl::free(in3, q1);
-  sycl::free(tmp, q1);
-  sycl::free(out, q1);
   return 0;
 }
 
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 0e6bc288812fd..1728ca8dd6002 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
-
+// REQUIRES: aspect-usm_shared_allocations
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 

From 86e91445557f893ea6bf1f803751da280bf74985 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 13:56:24 -0800
Subject: [PATCH 03/19] Remove rogue changes

---
 sycl/source/detail/usm/usm_impl.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp
index 965a4f47f57c7..3db9df8b52337 100755
--- a/sycl/source/detail/usm/usm_impl.cpp
+++ b/sycl/source/detail/usm/usm_impl.cpp
@@ -273,11 +273,6 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt,
   PrepareNotify.scopedNotify(
       (uint16_t)xpti::trace_point_type_t::mem_alloc_begin);
 #endif
-  if (Kind == alloc::device &&
-    !Dev.has(sycl::aspect::usm_device_allocations)) {
-        throw sycl::exception(sycl::errc::feature_not_supported,
-            "Device does not support Unified Shared Memory!");
-  }
   void *RetVal =
       alignedAllocInternal(Alignment, Size, getSyclObjImpl(Ctxt).get(),
                            getSyclObjImpl(Dev).get(), Kind, PropList);

From 273fbf23422c26bdb9e0c21fdc3ea3cf72982232 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 16:52:51 -0800
Subject: [PATCH 04/19] Fix reduction tests

---
 .../test-e2e/Reduction/reduction_internal.cpp | 18 ++---
 .../Reduction/reduction_range_item.cpp        | 81 ++++++++++---------
 2 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp
index 57947a25176e1..e16f8d750a154 100644
--- a/sycl/test-e2e/Reduction/reduction_internal.cpp
+++ b/sycl/test-e2e/Reduction/reduction_internal.cpp
@@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) {
          cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl,
          [=](auto Item, auto &Red) { Red.combine(T{1}); });
    }).wait();
-
-  auto *Result = malloc_shared<T>(1, q);
+  sycl::buffer<T> Result_buf{sycl::range{1}};
   q.submit([&](handler &cgh) {
-     auto RedAcc = GetRedAcc(cgh);
-     cgh.single_task([=]() { *Result = RedAcc[0]; });
-   }).wait();
-
+    sycl::accessor Result{Result_buf, cgh};
+    auto RedAcc = GetRedAcc(cgh);
+    cgh.single_task([=]() { Result[0] = RedAcc[0]; });
+  });
+  sycl::host_accessor Result{Result_buf};
   auto N = get_global_range(Range).size();
   int Expected = InitToIdentity ? N : Init + N;
 #if defined(__PRETTY_FUNCTION__)
@@ -94,10 +94,8 @@ static void test(RedStorage &Storage, RangeTy Range) {
 #elif defined(__FUNCSIG__)
   std::cout << __FUNCSIG__;
 #endif
-  std::cout << ": " << *Result << ", expected " << Expected << std::endl;
-  assert(*Result == Expected);
-
-  free(Result, q);
+  std::cout << ": " << Result[0] << ", expected " << Expected << std::endl;
+  assert(Result[0] == Expected);
 }
 
 template <int... Inds, class F>
diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp
index ed9806e4753b5..3c886063215fa 100644
--- a/sycl/test-e2e/Reduction/reduction_range_item.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp
@@ -10,44 +10,49 @@ using namespace sycl;
 
 int main() {
   queue q;
-  auto *RedMem = malloc_shared<int>(1, q);
-  auto *Success = malloc_shared<bool>(1, q);
-  *Success = true;
+  bool Success_val = true;
+  int RedMem_val = 0;
+  sycl::buffer RedMem_buf{&RedMem_val, sycl::range{1}};
+  sycl::buffer Success_buf{&Success_val, sycl::range{1}};
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor RedMem{RedMem_buf, h};
+    sycl::accessor Success{Success_buf, h};
+    h.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
+                   [=](item<1> Item, auto &Red) {
+                     Red += 1;
+                     if (Item.get_range(0) != 7)
+                       Success[0] = false;
+                     if (Item.get_id(0) == 7)
+                       Success[0] = false;
+                   });
+  });
+  sycl::host_accessor RedMem{RedMem_buf};
+  sycl::host_accessor Success{Success_buf};
+  assert(RedMem[0] == 7);
+  assert(Success[0]);
+  RedMem[0] = 0;
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor RedMem{RedMem_buf, h};
+    sycl::accessor Success{Success_buf, h};
+    h.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
+                   [=](item<2> Item, auto &Red) {
+                     Red += 1;
+                     if (Item.get_range(0) != 1030)
+                       *Success = false;
+                     if (Item.get_range(1) != 7)
+                       *Success = false;
+
+                     if (Item.get_id(0) == 1030)
+                       *Success = false;
+                     if (Item.get_id(1) == 7)
+                       *Success = false;
+                   });
+  });
+  sycl::host_accessor RedMem{RedMem_buf};
+  sycl::host_accessor Success{Success_buf};
+
+  assert(RedMem[0] == 1030 * 7);
+  assert(Success[0]);
 
-  *RedMem = 0;
-  q.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
-                 [=](item<1> Item, auto &Red) {
-                   Red += 1;
-                   if (Item.get_range(0) != 7)
-                     *Success = false;
-                   if (Item.get_id(0) == 7)
-                     *Success = false;
-                 })
-      .wait();
-
-  assert(*RedMem == 7);
-  assert(*Success);
-
-  *RedMem = 0;
-  q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
-                 [=](item<2> Item, auto &Red) {
-                   Red += 1;
-                   if (Item.get_range(0) != 1030)
-                     *Success = false;
-                   if (Item.get_range(1) != 7)
-                     *Success = false;
-
-                   if (Item.get_id(0) == 1030)
-                     *Success = false;
-                   if (Item.get_id(1) == 7)
-                     *Success = false;
-                 })
-      .wait();
-
-  assert(*RedMem == 1030 * 7);
-  assert(*Success);
-
-  free(RedMem, q);
-  free(Success, q);
   return 0;
 }

From ef2a6cc2f83b116a524e5b84677ed20795ff0fb7 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 16:53:52 -0800
Subject: [PATCH 05/19] Fix reduction tests

---
 sycl/test-e2e/Reduction/reduction_span.cpp      | 2 +-
 sycl/test-e2e/Reduction/reduction_span_pack.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp
index 6c06d377eabe2..63f5ec1ece9be 100644
--- a/sycl/test-e2e/Reduction/reduction_span.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span.cpp
@@ -3,7 +3,7 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
+// REQUIRES: aspect-usm_shared_allocations
 // This test performs basic checks of reductions initialized with a sycl::span
 
 #include <sycl/sycl.hpp>
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index 46862ffe45cf9..706dca1363dd3 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -6,7 +6,7 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
+// REQUIRES: aspect-usm_shared_allocations
 // This test performs basic checks of reductions initialized with a pack
 // containing at least one sycl::span
 

From 604893fc9c94c48c22cc341db19a35708bde3c24 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 16:59:36 -0800
Subject: [PATCH 06/19] Fix reduction tests

---
 .../Reduction/reduction_range_item.cpp        | 83 +++++++++----------
 1 file changed, 39 insertions(+), 44 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp
index 3c886063215fa..485959860456d 100644
--- a/sycl/test-e2e/Reduction/reduction_range_item.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp
@@ -3,56 +3,51 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
 
 int main() {
   queue q;
-  bool Success_val = true;
-  int RedMem_val = 0;
-  sycl::buffer RedMem_buf{&RedMem_val, sycl::range{1}};
-  sycl::buffer Success_buf{&Success_val, sycl::range{1}};
-  q.submit([&](sycl::handler &h) {
-    sycl::accessor RedMem{RedMem_buf, h};
-    sycl::accessor Success{Success_buf, h};
-    h.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
-                   [=](item<1> Item, auto &Red) {
-                     Red += 1;
-                     if (Item.get_range(0) != 7)
-                       Success[0] = false;
-                     if (Item.get_id(0) == 7)
-                       Success[0] = false;
-                   });
-  });
-  sycl::host_accessor RedMem{RedMem_buf};
-  sycl::host_accessor Success{Success_buf};
-  assert(RedMem[0] == 7);
-  assert(Success[0]);
-  RedMem[0] = 0;
-  q.submit([&](sycl::handler &h) {
-    sycl::accessor RedMem{RedMem_buf, h};
-    sycl::accessor Success{Success_buf, h};
-    h.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
-                   [=](item<2> Item, auto &Red) {
-                     Red += 1;
-                     if (Item.get_range(0) != 1030)
-                       *Success = false;
-                     if (Item.get_range(1) != 7)
-                       *Success = false;
-
-                     if (Item.get_id(0) == 1030)
-                       *Success = false;
-                     if (Item.get_id(1) == 7)
-                       *Success = false;
-                   });
-  });
-  sycl::host_accessor RedMem{RedMem_buf};
-  sycl::host_accessor Success{Success_buf};
-
-  assert(RedMem[0] == 1030 * 7);
-  assert(Success[0]);
+  auto *RedMem = malloc_shared<int>(1, q);
+  auto *Success = malloc_shared<bool>(1, q);
+  *Success = true;
+
+  *RedMem = 0;
+  q.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
+                 [=](item<1> Item, auto &Red) {
+                   Red += 1;
+                   if (Item.get_range(0) != 7)
+                     *Success = false;
+                   if (Item.get_id(0) == 7)
+                     *Success = false;
+                 })
+      .wait();
+
+  assert(*RedMem == 7);
+  assert(*Success);
+
+  *RedMem = 0;
+  q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
+                 [=](item<2> Item, auto &Red) {
+                   Red += 1;
+                   if (Item.get_range(0) != 1030)
+                     *Success = false;
+                   if (Item.get_range(1) != 7)
+                     *Success = false;
+
+                   if (Item.get_id(0) == 1030)
+                     *Success = false;
+                   if (Item.get_id(1) == 7)
+                     *Success = false;
+                 })
+      .wait();
+
+  assert(*RedMem == 1030 * 7);
+  assert(*Success);
 
+  free(RedMem, q);
+  free(Success, q);
   return 0;
 }

From 406ceb0e616b7ae8a583cd57d0e20a558f455fc5 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 19:13:02 -0800
Subject: [PATCH 07/19] Revert disable-caching to use usm

---
 sycl/test-e2e/KernelAndProgram/disable-caching.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
index b48467af03a63..cd937a5adf207 100644
--- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
+++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
@@ -6,6 +6,7 @@
 // RUN: | FileCheck %s
 // RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 %{run} %t.out \
 // RUN: | FileCheck %s --check-prefixes=CHECK-CACHE
+// REQUIRES: aspect-usm_shared_allocations
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
@@ -74,15 +75,16 @@ int main() {
   // CHECK-CACHE: piKernelRelease
   // CHECK-CACHE: piProgramRelease
   // CHECK-CACHE: piEventsWait
-  sycl::buffer<int> p_buf{sycl::range{1}};
+  auto *p = malloc_shared<int>(1, q);
   for (int i = 0; i < 2; ++i)
     q.submit([&](handler &cgh) {
-       sycl::accessor p{p_buf, cgh};
        cgh.set_specialization_constant<spec_id>(i);
        cgh.parallel_for(1, [=](auto, kernel_handler kh) {
-         p[0] = kh.get_specialization_constant<spec_id>();
+         *p = kh.get_specialization_constant<spec_id>();
        });
      }).wait();
+
+  free(p, q);
 }
 
 // (Program cache releases)

From c2ba1ad50f8d6fcd2b46f42e4710d053af8542bf Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 19:52:06 -0800
Subject: [PATCH 08/19] Minor fixes

---
 .../Regression/exclusive-scan-char-short.cpp  | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
index a7d3601210fa8..743d7de3b6d85 100644
--- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
+++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
@@ -16,24 +16,27 @@ int n_fail = 0;
 template <typename T, typename OpT> void test() {
   auto op = OpT();
   auto init = sycl::known_identity_v<decltype(op), T>;
-  auto *p = malloc_shared<T>(1, q);
-  *p = 0;
+  T p_val = 0;
+  sycl::buffer<T> p_buf{&p_val, sycl::range{1}};
+  sycl::accessor p{p_buf};
   T ref;
-  emu::exclusive_scan(p, p + 1, &ref, init, op);
+  emu::exclusive_scan(p.begin(), p.end(), &ref, init, op);
   range r(1);
-  q.parallel_for(nd_range(r, r), [=](nd_item<1> it) {
-     auto g = it.get_group();
-     *p = exclusive_scan_over_group(g, *p, op);
-   }).wait();
-
-  if (*p != ref) {
+  q.submit([&](sycl::handler &h) {
+    h.require(p);
+    h.parallel_for(nd_range(r, r), [=](nd_item<1> it) {
+      auto g = it.get_group();
+      p[0] = exclusive_scan_over_group(g, p[0], op);
+    });
+  });
+  sycl::host_accessor p_host{p_buf};
+  if (p_host[0] != ref) {
     std::cout << "test " << cur_test << " fail\n";
-    std::cout << "got:      " << int(*p) << "\n";
+    std::cout << "got:      " << int(p_host[0]) << "\n";
     std::cout << "expected: " << int(ref) << "\n\n";
     ++n_fail;
   }
   ++cur_test;
-  free(p, q);
 }
 
 int main() {

From f3b2142f098c00524d8c03c99f112c585b020b1c Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 21:15:34 -0800
Subject: [PATCH 09/19] Add more tests

---
 .../Regression/exclusive-scan-char-short.cpp  | 25 +++++-----
 .../Regression/group_local_linear_id.cpp      | 20 ++++----
 sycl/test-e2e/Regression/half_operators.cpp   | 46 +++++++++----------
 3 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
index 743d7de3b6d85..a7d3601210fa8 100644
--- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
+++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
@@ -16,27 +16,24 @@ int n_fail = 0;
 template <typename T, typename OpT> void test() {
   auto op = OpT();
   auto init = sycl::known_identity_v<decltype(op), T>;
-  T p_val = 0;
-  sycl::buffer<T> p_buf{&p_val, sycl::range{1}};
-  sycl::accessor p{p_buf};
+  auto *p = malloc_shared<T>(1, q);
+  *p = 0;
   T ref;
-  emu::exclusive_scan(p.begin(), p.end(), &ref, init, op);
+  emu::exclusive_scan(p, p + 1, &ref, init, op);
   range r(1);
-  q.submit([&](sycl::handler &h) {
-    h.require(p);
-    h.parallel_for(nd_range(r, r), [=](nd_item<1> it) {
-      auto g = it.get_group();
-      p[0] = exclusive_scan_over_group(g, p[0], op);
-    });
-  });
-  sycl::host_accessor p_host{p_buf};
-  if (p_host[0] != ref) {
+  q.parallel_for(nd_range(r, r), [=](nd_item<1> it) {
+     auto g = it.get_group();
+     *p = exclusive_scan_over_group(g, *p, op);
+   }).wait();
+
+  if (*p != ref) {
     std::cout << "test " << cur_test << " fail\n";
-    std::cout << "got:      " << int(p_host[0]) << "\n";
+    std::cout << "got:      " << int(*p) << "\n";
     std::cout << "expected: " << int(ref) << "\n\n";
     ++n_fail;
   }
   ++cur_test;
+  free(p, q);
 }
 
 int main() {
diff --git a/sycl/test-e2e/Regression/group_local_linear_id.cpp b/sycl/test-e2e/Regression/group_local_linear_id.cpp
index d3562c2b25cbf..8fc216f76df31 100644
--- a/sycl/test-e2e/Regression/group_local_linear_id.cpp
+++ b/sycl/test-e2e/Regression/group_local_linear_id.cpp
@@ -11,14 +11,17 @@ int main() {
   const sycl::range<3> GlobalRange(2, 8, 16);
   const sycl::range<3> LocalRange(2, 4, 4);
   sycl::queue Q;
-  bool *ReadSame = sycl::malloc_shared<bool>(GlobalRange.size(), Q);
-  Q.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange},
-                 [=](sycl::nd_item<3> Item) {
-                   ReadSame[Item.get_global_linear_id()] =
-                       Item.get_local_linear_id() ==
-                       Item.get_group().get_local_linear_id();
-                 })
-      .wait();
+  sycl::buffer<bool> ReadSame_buf{GlobalRange.size()};
+  Q.submit([&](sycl::handler &h) {
+    sycl::accessor ReadSame{ReadSame_buf, h};
+    h.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange},
+                   [=](sycl::nd_item<3> Item) {
+                     ReadSame[Item.get_global_linear_id()] =
+                         Item.get_local_linear_id() ==
+                         Item.get_group().get_local_linear_id();
+                   });
+  });
+  sycl::host_accessor ReadSame{ReadSame_buf};
   int Failures = 0;
   for (size_t I = 0; I < GlobalRange.size(); ++I) {
     if (ReadSame[I])
@@ -26,6 +29,5 @@ int main() {
     ++Failures;
     std::cout << "Read mismatch at index " << I << std::endl;
   }
-  sycl::free(ReadSame, Q);
   return Failures;
 }
diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp
index b227806a02bc0..7b663efd13e34 100644
--- a/sycl/test-e2e/Regression/half_operators.cpp
+++ b/sycl/test-e2e/Regression/half_operators.cpp
@@ -7,11 +7,6 @@
 
 using namespace sycl;
 
-template <typename T>
-using shared_allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
-
-template <typename T> using shared_vector = std::vector<T, shared_allocator<T>>;
-
 template <typename T> bool are_bitwise_equal(T lhs, T rhs) {
   constexpr size_t size{sizeof(T)};
 
@@ -33,33 +28,34 @@ template <typename T> bool test(sycl::queue &queue) {
 
   static const T inexact = static_cast<T>(0.1);
 
-  shared_vector<T> result_source{NumElems, shared_allocator<T>{queue}};
-  shared_vector<T> input{NumElems, shared_allocator<T>{queue}};
+  std::vector<T> result_source_vec{NumElems};
+  std::vector<T> input_vec{NumElems};
 
   for (size_t i = 0; i < NumElems; ++i) {
-    input[i] = inexact * i;
+    input_vec[i] = inexact * i;
   }
-
-  queue.submit([&](sycl::handler &cgh) {
-    auto out_source = result_source.data();
-    auto in = input.data();
-
-    cgh.single_task<>([=]() {
-      for (size_t i = 0; i < NumElems; ++i) {
-        auto source = in[i];
-        ++source;
-        out_source[i] = source;
-      }
+  {
+    sycl::buffer<T> result_source_buf{result_source_vec};
+    sycl::buffer<T> input_buf{input_vec};
+    queue.submit([&](sycl::handler &cgh) {
+      sycl::accessor out_source{result_source_buf, cgh};
+      sycl::accessor in{input_buf, cgh};
+      cgh.single_task<>([=]() {
+        for (size_t i = 0; i < NumElems; ++i) {
+          auto source = in[i];
+          ++source;
+          out_source[i] = source;
+        }
+      });
     });
-  });
-  queue.wait_and_throw();
-
+    queue.wait_and_throw();
+  } // buffers go out of scope here and write back to the vectors
   for (size_t i = 0; i < NumElems; ++i) {
-    T expected_value = input[i] + 1;
+    T expected_value = input_vec[i] + 1;
 
-    if (!are_bitwise_equal(expected_value, result_source[i])) {
+    if (!are_bitwise_equal(expected_value, result_source_vec[i])) {
       pass = false;
-      std::cout << "Sample failed retrieved value: " << result_source[i]
+      std::cout << "Sample failed retrieved value: " << result_source_vec[i]
                 << ", but expected: " << expected_value << ", at index: " << i
                 << std::endl;
     }

From 01a6431eebb629316fd73cc3791e03eeaafbf9cf Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 21:16:51 -0800
Subject: [PATCH 10/19] Add more tests

---
 sycl/test-e2e/Regression/exclusive-scan-char-short.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
index a7d3601210fa8..81e5ddc8b1a27 100644
--- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
+++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 // This test ensures the result computed by exclusive_scan_over_group
 // for the first work item when given a short or char argument with
 // the maximum or minimum operator is computed correctly.

From c73e919136ba236c80c5b8e0838bc6e92cc2d644 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 22:33:59 -0800
Subject: [PATCH 11/19] Fix vector initialization bug

---
 sycl/test-e2e/Regression/half_operators.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp
index 7b663efd13e34..7309a032ea35a 100644
--- a/sycl/test-e2e/Regression/half_operators.cpp
+++ b/sycl/test-e2e/Regression/half_operators.cpp
@@ -28,8 +28,8 @@ template <typename T> bool test(sycl::queue &queue) {
 
   static const T inexact = static_cast<T>(0.1);
 
-  std::vector<T> result_source_vec{NumElems};
-  std::vector<T> input_vec{NumElems};
+  std::vector<T> result_source_vec(NumElems);
+  std::vector<T> input_vec(NumElems);;
 
   for (size_t i = 0; i < NumElems; ++i) {
     input_vec[i] = inexact * i;

From f2bde72fd392c2d37f29b1fe8b0da9902ef8104c Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Wed, 7 Feb 2024 22:55:09 -0800
Subject: [PATCH 12/19] Fix formatting

---
 sycl/test-e2e/Regression/half_operators.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp
index 7309a032ea35a..14f6924260ec6 100644
--- a/sycl/test-e2e/Regression/half_operators.cpp
+++ b/sycl/test-e2e/Regression/half_operators.cpp
@@ -29,7 +29,7 @@ template <typename T> bool test(sycl::queue &queue) {
   static const T inexact = static_cast<T>(0.1);
 
   std::vector<T> result_source_vec(NumElems);
-  std::vector<T> input_vec(NumElems);;
+  std::vector<T> input_vec(NumElems);
 
   for (size_t i = 0; i < NumElems; ++i) {
     input_vec[i] = inexact * i;

From ae3e35effaf65bcdc885dd6a3f31687f7f0d13e7 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 8 Feb 2024 07:02:57 -0800
Subject: [PATCH 13/19] Change malloc_shared to malloc_device

---
 .../sync_two_queues_event_dep.cpp             | 54 ++++++++-----------
 .../test-e2e/KernelFusion/sync_usm_mem_op.cpp | 26 ++++-----
 2 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index e6096f231ef9b..13b199c2bc0ba 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -13,37 +13,29 @@ int main() {
 
   queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
   queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
-  int in1_arr[dataSize];
-  int in2_arr[dataSize];
-  int in3_arr[dataSize];
-  int tmp_arr[dataSize];
-  int out_arr[dataSize];
-  for (size_t i = 0; i < dataSize; ++i) {
-    in1_arr[i] = i * 2;
-    in2_arr[i] = i * 3;
-    in3_arr[i] = i * 4;
-    tmp_arr[i] = -1;
-    out_arr[i] = -1;
-  }
-  sycl::buffer<int> in1_buf{&in1_arr[0], sycl::range{dataSize}};
-  sycl::buffer<int> in2_buf{&in2_arr[0], sycl::range{dataSize}};
-  sycl::buffer<int> in3_buf{&in3_arr[0], sycl::range{dataSize}};
-  sycl::buffer<int> tmp_buf{&tmp_arr[0], sycl::range{dataSize}};
-  sycl::buffer<int> out_buf{&out_arr[0], sycl::range{dataSize}};
-  sycl::accessor in1{in1_buf};
-  sycl::accessor in2{in2_buf};
-  sycl::accessor in3{in3_buf};
-  sycl::accessor tmp{tmp_buf};
-  sycl::accessor out{out_buf};
+
+  int *in1 = sycl::malloc_device<int>(dataSize, q1);
+  int *in2 = sycl::malloc_device<int>(dataSize, q1);
+  int *in3 = sycl::malloc_device<int>(dataSize, q1);
+  int *tmp = sycl::malloc_device<int>(dataSize, q1);
+  int *out = sycl::malloc_device<int>(dataSize, q1);
+
+  q1.single_task<class InitKernel>([=]() {
+      for (size_t i = 0; i < dataSize; ++i) {
+        in1[i] = i * 2;
+        in2[i] = i * 3;
+        in3[i] = i * 4;
+        tmp[i] = -1;
+        out[i] = -1;
+      }
+    }).wait();
+
   ext::codeplay::experimental::fusion_wrapper fw1{q1};
   fw1.start_fusion();
 
   assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode");
 
   auto kernel1 = q1.submit([&](handler &cgh) {
-    cgh.require(in1);
-    cgh.require(in2);
-    cgh.require(tmp);
     cgh.parallel_for<class KernelOne>(
         dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
   });
@@ -52,7 +44,6 @@ int main() {
   fw2.start_fusion();
 
   auto kernel3 = q2.submit([&](handler &cgh) {
-    cgh.require(tmp);
     cgh.depends_on(kernel1);
     cgh.parallel_for<class KernelThree>(dataSize,
                                         [=](id<1> i) { tmp[i] *= 2; });
@@ -66,9 +57,6 @@ int main() {
   assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode");
 
   auto kernel2 = q1.submit([&](handler &cgh) {
-    cgh.require(tmp);
-    cgh.require(in3);
-    cgh.require(out);
     cgh.depends_on(kernel3);
     cgh.parallel_for<class KernelTwo>(
         dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
@@ -85,11 +73,15 @@ int main() {
 
   q1.wait();
   q2.wait();
-  sycl::host_accessor out_host{out_buf};
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out_host[i] == (40 * i * i) && "Computation error");
+    assert(out[i] == (40 * i * i) && "Computation error");
   }
+  sycl::free(in1, q1);
+  sycl::free(in2, q1);
+  sycl::free(in3, q1);
+  sycl::free(tmp, q1);
+  sycl::free(out, q1);
   return 0;
 }
 
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 1728ca8dd6002..a2899ff41d375 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,6 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
-// REQUIRES: aspect-usm_shared_allocations
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
@@ -16,21 +15,24 @@ int main() {
 
   queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-  int *in1 = sycl::malloc_shared<int>(dataSize, q);
-  int *in2 = sycl::malloc_shared<int>(dataSize, q);
-  int *in3 = sycl::malloc_shared<int>(dataSize, q);
-  int *tmp = sycl::malloc_shared<int>(dataSize, q);
-  int *out = sycl::malloc_shared<int>(dataSize, q);
+  int *in1 = sycl::malloc_device<int>(dataSize, q);
+  int *in2 = sycl::malloc_device<int>(dataSize, q);
+  int *in3 = sycl::malloc_device<int>(dataSize, q);
+  int *tmp = sycl::malloc_device<int>(dataSize, q);
+  int *out = sycl::malloc_device<int>(dataSize, q);
   int dst[dataSize];
-
   for (size_t i = 0; i < dataSize; ++i) {
-    in1[i] = i * 2;
-    in2[i] = i * 3;
-    in3[i] = i * 4;
-    tmp[i] = -1;
-    out[i] = -1;
     dst[i] = -1;
   }
+  q.single_task<class InitKernel>([=]() {
+     for (size_t i = 0; i < dataSize; ++i) {
+       in1[i] = i * 2;
+       in2[i] = i * 3;
+       in3[i] = i * 4;
+       tmp[i] = -1;
+       out[i] = -1;
+     }
+   }).wait();
 
   ext::codeplay::experimental::fusion_wrapper fw{q};
   fw.start_fusion();

From 6d25416fb67ab2e2e93896a4ecafb49a03628d8f Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 8 Feb 2024 08:24:10 -0800
Subject: [PATCH 14/19] Fix access of device USM ptr from host

---
 sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp | 5 ++++-
 sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp           | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index 13b199c2bc0ba..79347114ec2a4 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -73,9 +73,12 @@ int main() {
 
   q1.wait();
   q2.wait();
+  int host_out[dataSize];
+  q1.memcpy(host_out, out, dataSize * sizeof(int));
+  q1.wait();
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (40 * i * i) && "Computation error");
+    assert(host_out[i] == (40 * i * i) && "Computation error");
   }
   sycl::free(in1, q1);
   sycl::free(in2, q1);
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index a2899ff41d375..f3d2cc97b9ae5 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -61,14 +61,16 @@ int main() {
          "Queue should not be in fusion mode anymore");
 
   fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
-
+  int host_out[dataSize];
+  q.memcpy(host_out, out, dataSize * sizeof(int));
+  q.wait();
   for (size_t i = 0; i < dataSize; ++i) {
-    std::cout << out[i] << ", ";
+    std::cout << host_out[i] << ", ";
   }
   std::cout << "\n";
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(host_out[i] == (20 * i * i) && "Computation error");
     assert(dst[i] == (5 * i) && "Computation error");
   }
 

From f0d325f56892730803eeeb817421c4d3c8e6ff68 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 8 Feb 2024 11:52:28 -0800
Subject: [PATCH 15/19] Finish off rewriting regression tests

---
 sycl/test-e2e/Regression/pf-wg-atomic64.cpp    |  5 +++--
 .../Regression/range-rounding-this-id.cpp      | 18 +++++++++++-------
 .../Regression/reduction_64bit_atomic64.cpp    |  2 +-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
index f2985b5a33b1d..d7adc4e6c153b 100644
--- a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
+++ b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
@@ -10,11 +10,12 @@ using AtomicRefT =
 
 int main() {
   queue q;
-  auto *p = malloc_shared<unsigned long long>(1, q);
+  sycl::buffer<unsigned long long> p_buf{sycl::range{1}};
   try {
     q.submit([&](sycl::handler &cgh) {
+       sycl::accessor p{p_buf, cgh};
        cgh.parallel_for_work_group(range{1}, range{1}, [=](group<1>) {
-         AtomicRefT feature(*p);
+         AtomicRefT feature(p[0]);
          feature += 42;
        });
      }).wait();
diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
index 33fa41c60cc68..5e3baf2355084 100644
--- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp
+++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
@@ -30,13 +30,17 @@ template <int D> void test(queue &q) {
     id<D> this_id;
     id<D> ref_id;
   };
-  std::vector<T, usm_allocator<T, usm::alloc::shared>> vec(range.size(), q);
-  auto *p = vec.data();
-  q.parallel_for(range, [=](auto it) {
-     p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id<D>(),
-                              it.get_id()};
-   }).wait_and_throw();
-
+  std::vector<T> vec(range.size());
+  {
+    sycl::buffer<T> p_buf{vec};
+    q.submit([&](sycl::handler &h) {
+       sycl::accessor p{p_buf, h};
+       q.parallel_for(range, [=](auto it) {
+         p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id<D>(),
+                                  it.get_id()};
+       });
+     }).wait_and_throw();
+  } // p_buf goes out of scope here and writed back to vec
   for (const auto &[this_item, ref_item] : vec) {
     if (this_item != ref_item) {
       std::cout << "fail: " << this_item << " != " << ref_item << "\n";
diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
index dc138d9b79da8..11affac9df7b9 100644
--- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
+++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
@@ -2,7 +2,7 @@
 // RUN: %{build} -o %t.out
 //
 // RUN: %{run} %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 // Tests that a previously known case for reduction doesn't cause a requirement
 // for atomic64.
 // TODO: When aspect requirements are added to testing, this test could be set

From ceff46aaf3a120d0a8e0f27ce7c7098b517cfffe Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Thu, 8 Feb 2024 13:18:47 -0800
Subject: [PATCH 16/19] Fix runtime error

---
 sycl/test-e2e/Regression/range-rounding-this-id.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
index 5e3baf2355084..a5970adf46e41 100644
--- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp
+++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
@@ -35,7 +35,7 @@ template <int D> void test(queue &q) {
     sycl::buffer<T> p_buf{vec};
     q.submit([&](sycl::handler &h) {
        sycl::accessor p{p_buf, h};
-       q.parallel_for(range, [=](auto it) {
+       h.parallel_for(range, [=](auto it) {
          p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id<D>(),
                                   it.get_id()};
        });

From 971e87545adad9f3378787aa2de8e26e98e0ff0f Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 9 Feb 2024 15:49:12 -0800
Subject: [PATCH 17/19] Rewrite tests using malloc_device and fix casing

---
 sycl/test-e2e/DiscardEvents/invalid_event.cpp |  5 ++--
 sycl/test-e2e/GroupAlgorithm/root_group.cpp   | 12 ++++----
 .../InOrderEventsExt/set_external_event.cpp   |  6 ++--
 .../test-e2e/Reduction/reduction_internal.cpp |  6 ++--
 .../Reduction/reduction_range_item.cpp        | 25 +++++++++-------
 sycl/test-e2e/Reduction/reduction_span.cpp    |  9 +++---
 .../Reduction/reduction_span_pack.cpp         | 29 ++++++++++---------
 .../Regression/reduction_64bit_atomic64.cpp   |  3 +-
 8 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
index 4fc0ee0f1495b..17c6a492a10c2 100644
--- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp
+++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
@@ -2,7 +2,6 @@
 // https://github.com/intel/llvm/issues/7330.
 // UNSUPPORTED: opencl && gpu
 // RUN: %{build} -o %t.out
-// REQUIRES: aspect-usm_shared_allocations
 // RUN: %{run} %t.out
 
 // The test checks that each PI call to the queue returns a discarded event
@@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) {
   sycl::range<1> range(BUFFER_SIZE);
 
   auto Dev = Q.get_device();
-  int *x = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *x = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(x != nullptr);
-  int *y = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *y = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(y != nullptr);
 
   sycl::event DiscardedEvent;
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 584ecfd22fe0b..e346ea142b759 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -41,10 +41,10 @@ void testRootGroup() {
           max_num_work_group_sync>(q);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
-  sycl::buffer<int> data_buf{sycl::range{maxWGs * WorkGroupSize}};
+  sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
   q.submit([&](sycl::handler &h) {
-    sycl::accessor data{data_buf, h};
+    sycl::accessor data{dataBuf, h};
     h.parallel_for<class RootGroupKernel>(
         range, props, [=](sycl::nd_item<1> it) {
           auto root = it.ext_oneapi_get_root_group();
@@ -59,7 +59,7 @@ void testRootGroup() {
           data[root.get_local_id()] = sum;
         });
   });
-  sycl::host_accessor data{data_buf};
+  sycl::host_accessor data{dataBuf};
   const int workItemCount = static_cast<int>(range.get_global_range().size());
   for (int i = 0; i < workItemCount; i++) {
     assert(data[i] == (workItemCount - 1));
@@ -78,10 +78,10 @@ void testRootGroupFunctions() {
       sycl::ext::oneapi::experimental::use_root_sync};
 
   constexpr int testCount = 10;
-  sycl::buffer<bool> testResults_buf{sycl::range{testCount}};
+  sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
   q.submit([&](sycl::handler &h) {
-    sycl::accessor testResults{testResults_buf, h};
+    sycl::accessor testResults{testResultsBuf, h};
     h.parallel_for<class RootGroupFunctionsKernel>(
         range, props, [=](sycl::nd_item<1> it) {
           const auto root = it.ext_oneapi_get_root_group();
@@ -113,7 +113,7 @@ void testRootGroupFunctions() {
           }
         });
   });
-  sycl::host_accessor testResults{testResults_buf};
+  sycl::host_accessor testResults{testResultsBuf};
   for (int i = 0; i < testCount; i++) {
     assert(testResults[i]);
   }
diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
index 4f2a409fa3d46..61987d5b5b3cb 100644
--- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
@@ -15,8 +15,8 @@ int main() {
   sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
   sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
 
-  sycl::buffer<int> DevDatabuf{sycl::range{N}};
-  sycl::accessor DevData{DevDatabuf};
+  sycl::buffer<int> DevDataBuf{sycl::range{N}};
+  sycl::accessor DevData{DevDataBuf};
   int *HostData = (int *)malloc(N * sizeof(int) * 10);
 
   for (size_t I = 0; I < 10; ++I) {
@@ -52,6 +52,6 @@ int main() {
       }
     }
   }
-
+  free(HostData);
   return Failures;
 }
diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp
index e16f8d750a154..39c64932802da 100644
--- a/sycl/test-e2e/Reduction/reduction_internal.cpp
+++ b/sycl/test-e2e/Reduction/reduction_internal.cpp
@@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) {
          cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl,
          [=](auto Item, auto &Red) { Red.combine(T{1}); });
    }).wait();
-  sycl::buffer<T> Result_buf{sycl::range{1}};
+  sycl::buffer<T> ResultBuf{sycl::range{1}};
   q.submit([&](handler &cgh) {
-    sycl::accessor Result{Result_buf, cgh};
+    sycl::accessor Result{ResultBuf, cgh};
     auto RedAcc = GetRedAcc(cgh);
     cgh.single_task([=]() { Result[0] = RedAcc[0]; });
   });
-  sycl::host_accessor Result{Result_buf};
+  sycl::host_accessor Result{ResultBuf};
   auto N = get_global_range(Range).size();
   int Expected = InitToIdentity ? N : Init + N;
 #if defined(__PRETTY_FUNCTION__)
diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp
index 485959860456d..06d4f33280f06 100644
--- a/sycl/test-e2e/Reduction/reduction_range_item.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp
@@ -3,15 +3,14 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-// REQUIRES: aspect-usm_shared_allocations
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
 
 int main() {
   queue q;
-  auto *RedMem = malloc_shared<int>(1, q);
-  auto *Success = malloc_shared<bool>(1, q);
+  auto *RedMem = malloc_device<int>(1, q);
+  auto *Success = malloc_device<bool>(1, q);
   *Success = true;
 
   *RedMem = 0;
@@ -24,11 +23,15 @@ int main() {
                      *Success = false;
                  })
       .wait();
-
-  assert(*RedMem == 7);
-  assert(*Success);
-
-  *RedMem = 0;
+  int RedMemHost;
+  bool SuccessHost;
+  q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait();
+  q.memcpy(&SuccessHost, Success, sizeof(bool)).wait();
+  assert(RedMemHost == 7);
+  assert(SuccessHost);
+
+  RedMemHost = 0;
+  q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait();
   q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
                  [=](item<2> Item, auto &Red) {
                    Red += 1;
@@ -44,8 +47,10 @@ int main() {
                  })
       .wait();
 
-  assert(*RedMem == 1030 * 7);
-  assert(*Success);
+  q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait();
+  q.memcpy(&SuccessHost, Success, sizeof(bool)).wait();
+  assert(RedMemHost == 1030 * 7);
+  assert(SuccessHost);
 
   free(RedMem, q);
   free(Success, q);
diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp
index 63f5ec1ece9be..23ee61e805f9b 100644
--- a/sycl/test-e2e/Reduction/reduction_span.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span.cpp
@@ -3,7 +3,6 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-// REQUIRES: aspect-usm_shared_allocations
 // This test performs basic checks of reductions initialized with a sycl::span
 
 #include <sycl/sycl.hpp>
@@ -44,7 +43,7 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  T *Output = malloc_shared<T>(N, Q);
+  T *Output = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -70,11 +69,13 @@ void test(queue Q, Range Rng, T Identity, T Value) {
   }
 
   bool Passed = true;
+  T OutputHost[N];
+  Q.memcpy(OutputHost, Output, N * sizeof(T)).wait();
   for (size_t I = 0; I < N; ++I) {
     if (I < Size % N) {
-      Passed &= (Output[I] == Expected);
+      Passed &= (OutputHost[I] == Expected);
     } else {
-      Passed &= (Output[I] == ExpectedRemainder);
+      Passed &= (OutputHost[I] == ExpectedRemainder);
     }
   }
 
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index 706dca1363dd3..b6e118c081cb0 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -6,7 +6,6 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-// REQUIRES: aspect-usm_shared_allocations
 // This test performs basic checks of reductions initialized with a pack
 // containing at least one sycl::span
 
@@ -49,9 +48,9 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test1(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  int *Sum = malloc_shared<int>(1, Q);
+  int *Sum = malloc_device<int>(1, Q);
   Q.single_task([=]() { *Sum = 0; }).wait();
-  T *Output = malloc_shared<T>(N, Q);
+  T *Output = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -81,11 +80,13 @@ void test1(queue Q, Range Rng, T Identity, T Value) {
   }
 
   bool Passed = true;
+  T OutputHost[N];
+  Q.memcpy(OutputHost, Output, N * sizeof(T)).wait();
   for (size_t I = 0; I < N; ++I) {
     if (I < Size % N) {
-      Passed &= (Output[I] == Expected);
+      Passed &= (OutputHost[I] == Expected);
     } else {
-      Passed &= (Output[I] == ExpectedRemainder);
+      Passed &= (OutputHost[I] == ExpectedRemainder);
     }
   }
   Passed &= (*Sum == Size);
@@ -101,9 +102,9 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test2(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  int *Output1 = malloc_shared<int>(N, Q);
+  int *Output1 = malloc_device<int>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait();
-  T *Output2 = malloc_shared<T>(N, Q);
+  T *Output2 = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -121,7 +122,10 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
   } else /*if (SubmissionMode == submission_mode::queue) */ {
     Q.parallel_for(Rng, Redu1, Redu2, Kern).wait();
   }
-
+  int Output1Host[N];
+  T Output2Host[N];
+  Q.memcpy(Output1Host, Output1, N * sizeof(int)).wait();
+  Q.memcpy(Output2Host, Output2, N * sizeof(T)).wait();
   size_t Size = getLinearSize(Rng);
   bool Passed = true;
   // Span1
@@ -132,12 +136,11 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
       ExpectedRemainder = Expected;
       Expected += 1;
     }
-
     for (size_t I = 0; I < N; ++I) {
       if (I < Size % N) {
-        Passed &= (Output1[I] == Expected);
+        Passed &= (Output1Host[I] == Expected);
       } else {
-        Passed &= (Output1[I] == ExpectedRemainder);
+        Passed &= (Output1Host[I] == ExpectedRemainder);
       }
     }
   }
@@ -153,9 +156,9 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
 
     for (size_t I = 0; I < N; ++I) {
       if (I < Size % N) {
-        Passed &= (Output2[I] == Expected);
+        Passed &= (Output2Host[I] == Expected);
       } else {
-        Passed &= (Output2[I] == ExpectedRemainder);
+        Passed &= (Output2Host[I] == ExpectedRemainder);
       }
     }
   }
diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
index 11affac9df7b9..c2d0d3e84ca7c 100644
--- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
+++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
@@ -2,7 +2,6 @@
 // RUN: %{build} -o %t.out
 //
 // RUN: %{run} %t.out
-// REQUIRES: aspect-usm_shared_allocations
 // Tests that a previously known case for reduction doesn't cause a requirement
 // for atomic64.
 // TODO: When aspect requirements are added to testing, this test could be set
@@ -19,7 +18,7 @@ using namespace sycl;
 int main() {
   queue Q;
 
-  long long *Out = malloc_shared<long long>(1, Q);
+  long long *Out = malloc_device<long long>(1, Q);
 
   // Case 1: nd_range reduction with 64-bit integer and either sycl::plus,
   // sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy

From d7803db15dc5cfd231ee0ee11d7d339e0dff7cf7 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 9 Feb 2024 15:50:57 -0800
Subject: [PATCH 18/19] Rewrite tests using malloc_device and fix casing

---
 sycl/test-e2e/KernelAndProgram/disable-caching.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
index cd937a5adf207..c50e23b1c17c4 100644
--- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
+++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
@@ -6,7 +6,6 @@
 // RUN: | FileCheck %s
 // RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 %{run} %t.out \
 // RUN: | FileCheck %s --check-prefixes=CHECK-CACHE
-// REQUIRES: aspect-usm_shared_allocations
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
@@ -75,7 +74,7 @@ int main() {
   // CHECK-CACHE: piKernelRelease
   // CHECK-CACHE: piProgramRelease
   // CHECK-CACHE: piEventsWait
-  auto *p = malloc_shared<int>(1, q);
+  auto *p = malloc_device<int>(1, q);
   for (int i = 0; i < 2; ++i)
     q.submit([&](handler &cgh) {
        cgh.set_specialization_constant<spec_id>(i);

From fd26740d033551f6b6b7b91984452ceebbf1fae2 Mon Sep 17 00:00:00 2001
From: Lorenc Bushi <lorenc.bushi@intel.com>
Date: Fri, 9 Feb 2024 16:24:04 -0800
Subject: [PATCH 19/19] Fix failures

---
 sycl/test-e2e/Reduction/reduction_range_item.cpp | 11 ++++++-----
 sycl/test-e2e/Reduction/reduction_span_pack.cpp  |  4 +++-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp
index 06d4f33280f06..93fe915ac0254 100644
--- a/sycl/test-e2e/Reduction/reduction_range_item.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp
@@ -11,9 +11,12 @@ int main() {
   queue q;
   auto *RedMem = malloc_device<int>(1, q);
   auto *Success = malloc_device<bool>(1, q);
-  *Success = true;
-
-  *RedMem = 0;
+  int RedMemHost;
+  bool SuccessHost;
+  RedMemHost = 0;
+  SuccessHost = true;
+  q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait();
+  q.memcpy(Success, &SuccessHost, sizeof(bool)).wait();
   q.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
                  [=](item<1> Item, auto &Red) {
                    Red += 1;
@@ -23,8 +26,6 @@ int main() {
                      *Success = false;
                  })
       .wait();
-  int RedMemHost;
-  bool SuccessHost;
   q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait();
   q.memcpy(&SuccessHost, Success, sizeof(bool)).wait();
   assert(RedMemHost == 7);
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index b6e118c081cb0..4bc4cfb5f9f14 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -89,7 +89,9 @@ void test1(queue Q, Range Rng, T Identity, T Value) {
       Passed &= (OutputHost[I] == ExpectedRemainder);
     }
   }
-  Passed &= (*Sum == Size);
+  int SumHost;
+  Q.memcpy(&SumHost, Sum, sizeof(int)).wait();
+  Passed &= (SumHost == Size);
 
   free(Output, Q);
   free(Sum, Q);