From bee45c380d0c603809a4813ec1c4cac6967a8fca Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Thu, 18 Jan 2024 16:15:54 -0800 Subject: [PATCH 01/19] Throw when USM not supported by Device --- sycl/source/detail/usm/usm_impl.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 396750acf3044..4260bb8569b07 100644 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -252,6 +252,11 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, PrepareNotify.scopedNotify( (uint16_t)xpti::trace_point_type_t::mem_alloc_begin); #endif + if (Kind == alloc::device && + !Dev.has(sycl::aspect::usm_device_allocations)) { + throw sycl::exception(sycl::errc::feature_not_supported, + "Device does not support Unified Shared Memory!"); + } void *RetVal = alignedAllocInternal(Alignment, Size, getSyclObjImpl(Ctxt).get(), getSyclObjImpl(Dev).get(), Kind, PropList); From f3b90a4b8fac46549fd6d1577be75017dc70684d Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 13:52:48 -0800 Subject: [PATCH 02/19] Rewrite tests --- .../discard_events_mixed_calls.cpp | 2 +- .../DiscardEvents/discard_events_usm.cpp | 2 +- .../discard_events_usm_ooo_queue.cpp | 2 +- sycl/test-e2e/DiscardEvents/invalid_event.cpp | 2 +- .../exclusive_scan_over_group.cpp | 55 ++++++----- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 97 ++++++++++--------- .../InOrderEventsExt/get_last_event.cpp | 3 +- .../InOrderEventsExt/set_external_event.cpp | 23 +++-- .../KernelAndProgram/disable-caching.cpp | 7 +- .../sync_two_queues_event_dep.cpp | 49 ++++++---- .../test-e2e/KernelFusion/sync_usm_mem_op.cpp | 2 +- 11 files changed, 132 insertions(+), 112 deletions(-) diff --git a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp index 8206ea5b449c3..8399951d16298 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp @@ -24,7 +24,7 @@ // the tests, please check if they pass without the discard_events property, if // they don't pass then it's most likely a general issue unrelated to // discard_events. - +// REQUIRES: aspect-usm_shared_allocations #include #include #include diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp index 11288d6620bfd..48ab65c68896c 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt - +// REQUIRES: aspect-usm_shared_allocations // The test checks that the last parameter is `nullptr` for all PI calls that // should discard events. // {{0|0000000000000000}} is required for various output on Linux and Windows. diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp index cfe72db0c1232..96d53a632beb6 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt - +// REQUIRES: aspect-usm_shared_allocations // The test checks that the last parameter is not `nullptr` for all PI calls // that should discard events. // {{0|0000000000000000}} is required for various output on Linux and Windows. diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp index 273e74afb6c25..4fc0ee0f1495b 100644 --- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp +++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp @@ -2,7 +2,7 @@ // https://github.com/intel/llvm/issues/7330. // UNSUPPORTED: opencl && gpu // RUN: %{build} -o %t.out - +// REQUIRES: aspect-usm_shared_allocations // RUN: %{run} %t.out // The test checks that each PI call to the queue returns a discarded event diff --git a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp index 6411131ef33ff..8f7ee3a55352c 100644 --- a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp @@ -9,11 +9,14 @@ #include #include -template -sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data, - T *out_data, int num_wg, int group_size) { +template +sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data, + AccessorT &out_data, int num_wg, + int group_size) { sycl::nd_range<1> thread_range(num_wg * group_size, group_size); sycl::event event = queue->submit([&](sycl::handler &cgh) { + cgh.require(in_data); + cgh.require(out_data); cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) { auto id = item.get_global_linear_id(); auto group = item.get_group(); @@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data, return event; } -template -void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data, - int num_wg, int group_size) { - compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size); +template +void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data, + AccessorT &out_data, int num_wg, int group_size) { + compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size); } int main(int argc, const char **argv) { - int num_wg = 1; - int group_size = 16; + constexpr int num_wg = 1; + constexpr int group_size = 16; sycl::queue queue; - - typedef int T; - size_t nelems = num_wg * group_size; - T *data = sycl::malloc_shared(nelems, queue); - T *result = sycl::malloc_shared(nelems, queue); - queue.fill(data, T(2), nelems).wait(); - queue.memset(result, 0, nelems * sizeof(T)).wait(); - - test_compiler_group_scan(&queue, data, result, num_wg, group_size); - queue.wait(); - T expected[] = {1, 2, 4, 8, 16, 32, 64, 128, - 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}; - for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) { - assert(result[i] == expected[i]); + constexpr size_t nelems = num_wg * group_size; + int data[nelems]; + int result[nelems]; + for (size_t i = 0; i < nelems; ++i) { + data[i] = 2; + result[i] = 0; + } + sycl::buffer data_buf{&data[0], sycl::range{nelems}}; + sycl::buffer result_buf{&result[0], sycl::range{nelems}}; + sycl::accessor data_acc{data_buf}; + sycl::accessor result_acc{result_buf}; + test_compiler_group_scan(&queue, data_acc, result_acc, num_wg, + group_size); + sycl::host_accessor result_host{result_buf}; + int expected[] = {1, 2, 4, 8, 16, 32, 64, 128, + 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}; + for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) { + assert(result_host[i] == expected[i]); } - sycl::free(data, queue); - sycl::free(result, queue); return 0; } diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index ba0c49fa68bf7..584ecfd22fe0b 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -41,27 +41,29 @@ void testRootGroup() { max_num_work_group_sync>(q); const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; - - int *data = sycl::malloc_shared(maxWGs * WorkGroupSize, q); + sycl::buffer data_buf{sycl::range{maxWGs * WorkGroupSize}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; - q.parallel_for(range, props, [=](sycl::nd_item<1> it) { - auto root = it.ext_oneapi_get_root_group(); - data[root.get_local_id()] = root.get_local_id(); - sycl::group_barrier(root); + q.submit([&](sycl::handler &h) { + sycl::accessor data{data_buf, h}; + h.parallel_for( + range, props, [=](sycl::nd_item<1> it) { + auto root = it.ext_oneapi_get_root_group(); + data[root.get_local_id()] = root.get_local_id(); + sycl::group_barrier(root); - root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>(); - int sum = data[root.get_local_id()] + - data[root.get_local_range() - root.get_local_id() - 1]; - sycl::group_barrier(root); - data[root.get_local_id()] = sum; + root = + sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>(); + int sum = data[root.get_local_id()] + + data[root.get_local_range() - root.get_local_id() - 1]; + sycl::group_barrier(root); + data[root.get_local_id()] = sum; + }); }); - q.wait(); - + sycl::host_accessor data{data_buf}; const int workItemCount = static_cast(range.get_global_range().size()); for (int i = 0; i < workItemCount; i++) { assert(data[i] == (workItemCount - 1)); } - sycl::free(data, q); } void testRootGroupFunctions() { @@ -76,44 +78,45 @@ void testRootGroupFunctions() { sycl::ext::oneapi::experimental::use_root_sync}; constexpr int testCount = 10; - bool *testResults = sycl::malloc_shared(testCount, q); + sycl::buffer testResults_buf{sycl::range{testCount}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; - q.parallel_for( - range, props, [=](sycl::nd_item<1> it) { - const auto root = it.ext_oneapi_get_root_group(); - if (root.leader() || root.get_local_id() == 3) { - testResults[0] = root.get_group_id() == sycl::id<1>(0); - testResults[1] = root.leader() - ? root.get_local_id() == sycl::id<1>(0) - : root.get_local_id() == sycl::id<1>(3); - testResults[2] = root.get_group_range() == sycl::range<1>(1); - testResults[3] = - root.get_local_range() == sycl::range<1>(WorkGroupSize); - testResults[4] = - root.get_max_local_range() == sycl::range<1>(WorkGroupSize); - testResults[5] = root.get_group_linear_id() == 0; - testResults[6] = - root.get_local_linear_id() == root.get_local_id().get(0); - testResults[7] = root.get_group_linear_range() == 1; - testResults[8] = root.get_local_linear_range() == WorkGroupSize; - - const auto child = - sycl::ext::oneapi::experimental::get_child_group(root); - const auto grandchild = - sycl::ext::oneapi::experimental::get_child_group(child); - testResults[9] = child == it.get_group(); - static_assert( - std::is_same_v::type, - sycl::sub_group>, - "get_child_group(sycl::group) must return a sycl::sub_group"); - } - }); - q.wait(); + q.submit([&](sycl::handler &h) { + sycl::accessor testResults{testResults_buf, h}; + h.parallel_for( + range, props, [=](sycl::nd_item<1> it) { + const auto root = it.ext_oneapi_get_root_group(); + if (root.leader() || root.get_local_id() == 3) { + testResults[0] = root.get_group_id() == sycl::id<1>(0); + testResults[1] = root.leader() + ? root.get_local_id() == sycl::id<1>(0) + : root.get_local_id() == sycl::id<1>(3); + testResults[2] = root.get_group_range() == sycl::range<1>(1); + testResults[3] = + root.get_local_range() == sycl::range<1>(WorkGroupSize); + testResults[4] = + root.get_max_local_range() == sycl::range<1>(WorkGroupSize); + testResults[5] = root.get_group_linear_id() == 0; + testResults[6] = + root.get_local_linear_id() == root.get_local_id().get(0); + testResults[7] = root.get_group_linear_range() == 1; + testResults[8] = root.get_local_linear_range() == WorkGroupSize; + const auto child = + sycl::ext::oneapi::experimental::get_child_group(root); + const auto grandchild = + sycl::ext::oneapi::experimental::get_child_group(child); + testResults[9] = child == it.get_group(); + static_assert( + std::is_same_v::type, + sycl::sub_group>, + "get_child_group(sycl::group) must return a sycl::sub_group"); + } + }); + }); + sycl::host_accessor testResults{testResults_buf}; for (int i = 0; i < testCount; i++) { assert(testResults[i]); } - sycl::free(testResults, q); } int main() { diff --git a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp index 3393202b5a370..aeb4ab44acb49 100644 --- a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp +++ b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp @@ -34,7 +34,8 @@ int main() { Failed += Check(Q, "host_task", [&]() { return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); }); }); - + if (!Q.get_device().has(sycl::aspect::usm_shared_allocations)) + return Failed; constexpr size_t N = 64; int *Data1 = sycl::malloc_shared(N, Q); int *Data2 = sycl::malloc_shared(N, Q); diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp index 45e5815606dbe..4f2a409fa3d46 100644 --- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp +++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp @@ -15,20 +15,26 @@ int main() { sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}}; sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}}; - int *DevData = sycl::malloc_shared(N, Dev, Ctx); + sycl::buffer DevDatabuf{sycl::range{N}}; + sycl::accessor DevData{DevDatabuf}; int *HostData = (int *)malloc(N * sizeof(int) * 10); for (size_t I = 0; I < 10; ++I) { - Q1.fill(DevData, 0, N); - sycl::event E1 = Q1.parallel_for( - N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; }); + Q1.fill(DevData, 0); + sycl::event E1 = Q1.submit([&](sycl::handler &h) { + h.require(DevData); + h.parallel_for( + N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; }); + }); Q2.ext_oneapi_set_external_event(E1); - sycl::event E2 = - Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; }); + sycl::event E2 = Q2.submit([&](sycl::handler &h) { + h.require(DevData); + h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; }); + }); Q1.ext_oneapi_set_external_event(E2); - Q1.copy(DevData, HostData + N * I, N); + Q1.copy(DevData, HostData + N * I); } Q1.wait_and_throw(); @@ -47,8 +53,5 @@ int main() { } } - sycl::free(DevData, Ctx); - free(HostData); - return Failures; } diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index 4276efe36366f..b48467af03a63 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -74,16 +74,15 @@ int main() { // CHECK-CACHE: piKernelRelease // CHECK-CACHE: piProgramRelease // CHECK-CACHE: piEventsWait - auto *p = malloc_shared(1, q); + sycl::buffer p_buf{sycl::range{1}}; for (int i = 0; i < 2; ++i) q.submit([&](handler &cgh) { + sycl::accessor p{p_buf, cgh}; cgh.set_specialization_constant(i); cgh.parallel_for(1, [=](auto, kernel_handler kh) { - *p = kh.get_specialization_constant(); + p[0] = kh.get_specialization_constant(); }); }).wait(); - - free(p, q); } // (Program cache releases) diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 38286c48b8a0f..e6096f231ef9b 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -13,27 +13,37 @@ int main() { queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}}; queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}}; - - int *in1 = sycl::malloc_shared(dataSize, q1); - int *in2 = sycl::malloc_shared(dataSize, q1); - int *in3 = sycl::malloc_shared(dataSize, q1); - int *tmp = sycl::malloc_shared(dataSize, q1); - int *out = sycl::malloc_shared(dataSize, q1); - + int in1_arr[dataSize]; + int in2_arr[dataSize]; + int in3_arr[dataSize]; + int tmp_arr[dataSize]; + int out_arr[dataSize]; for (size_t i = 0; i < dataSize; ++i) { - in1[i] = i * 2; - in2[i] = i * 3; - in3[i] = i * 4; - tmp[i] = -1; - out[i] = -1; + in1_arr[i] = i * 2; + in2_arr[i] = i * 3; + in3_arr[i] = i * 4; + tmp_arr[i] = -1; + out_arr[i] = -1; } - + sycl::buffer in1_buf{&in1_arr[0], sycl::range{dataSize}}; + sycl::buffer in2_buf{&in2_arr[0], sycl::range{dataSize}}; + sycl::buffer in3_buf{&in3_arr[0], sycl::range{dataSize}}; + sycl::buffer tmp_buf{&tmp_arr[0], sycl::range{dataSize}}; + sycl::buffer out_buf{&out_arr[0], sycl::range{dataSize}}; + sycl::accessor in1{in1_buf}; + sycl::accessor in2{in2_buf}; + sycl::accessor in3{in3_buf}; + sycl::accessor tmp{tmp_buf}; + sycl::accessor out{out_buf}; ext::codeplay::experimental::fusion_wrapper fw1{q1}; fw1.start_fusion(); assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode"); auto kernel1 = q1.submit([&](handler &cgh) { + cgh.require(in1); + cgh.require(in2); + cgh.require(tmp); cgh.parallel_for( dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); }); @@ -42,6 +52,7 @@ int main() { fw2.start_fusion(); auto kernel3 = q2.submit([&](handler &cgh) { + cgh.require(tmp); cgh.depends_on(kernel1); cgh.parallel_for(dataSize, [=](id<1> i) { tmp[i] *= 2; }); @@ -55,6 +66,9 @@ int main() { assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode"); auto kernel2 = q1.submit([&](handler &cgh) { + cgh.require(tmp); + cgh.require(in3); + cgh.require(out); cgh.depends_on(kernel3); cgh.parallel_for( dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); @@ -71,16 +85,11 @@ int main() { q1.wait(); q2.wait(); - + sycl::host_accessor out_host{out_buf}; // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (40 * i * i) && "Computation error"); + assert(out_host[i] == (40 * i * i) && "Computation error"); } - sycl::free(in1, q1); - sycl::free(in2, q1); - sycl::free(in3, q1); - sycl::free(tmp, q1); - sycl::free(out, q1); return 0; } diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp index 0e6bc288812fd..1728ca8dd6002 100644 --- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s - +// REQUIRES: aspect-usm_shared_allocations // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows From 86e91445557f893ea6bf1f803751da280bf74985 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 13:56:24 -0800 Subject: [PATCH 03/19] Remove rogue changes --- sycl/source/detail/usm/usm_impl.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 965a4f47f57c7..3db9df8b52337 100755 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -273,11 +273,6 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, PrepareNotify.scopedNotify( (uint16_t)xpti::trace_point_type_t::mem_alloc_begin); #endif - if (Kind == alloc::device && - !Dev.has(sycl::aspect::usm_device_allocations)) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Device does not support Unified Shared Memory!"); - } void *RetVal = alignedAllocInternal(Alignment, Size, getSyclObjImpl(Ctxt).get(), getSyclObjImpl(Dev).get(), Kind, PropList); From 273fbf23422c26bdb9e0c21fdc3ea3cf72982232 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 16:52:51 -0800 Subject: [PATCH 04/19] Fix reduction tests --- .../test-e2e/Reduction/reduction_internal.cpp | 18 ++--- .../Reduction/reduction_range_item.cpp | 81 ++++++++++--------- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp index 57947a25176e1..e16f8d750a154 100644 --- a/sycl/test-e2e/Reduction/reduction_internal.cpp +++ b/sycl/test-e2e/Reduction/reduction_internal.cpp @@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) { cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl, [=](auto Item, auto &Red) { Red.combine(T{1}); }); }).wait(); - - auto *Result = malloc_shared(1, q); + sycl::buffer Result_buf{sycl::range{1}}; q.submit([&](handler &cgh) { - auto RedAcc = GetRedAcc(cgh); - cgh.single_task([=]() { *Result = RedAcc[0]; }); - }).wait(); - + sycl::accessor Result{Result_buf, cgh}; + auto RedAcc = GetRedAcc(cgh); + cgh.single_task([=]() { Result[0] = RedAcc[0]; }); + }); + sycl::host_accessor Result{Result_buf}; auto N = get_global_range(Range).size(); int Expected = InitToIdentity ? N : Init + N; #if defined(__PRETTY_FUNCTION__) @@ -94,10 +94,8 @@ static void test(RedStorage &Storage, RangeTy Range) { #elif defined(__FUNCSIG__) std::cout << __FUNCSIG__; #endif - std::cout << ": " << *Result << ", expected " << Expected << std::endl; - assert(*Result == Expected); - - free(Result, q); + std::cout << ": " << Result[0] << ", expected " << Expected << std::endl; + assert(Result[0] == Expected); } template diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp index ed9806e4753b5..3c886063215fa 100644 --- a/sycl/test-e2e/Reduction/reduction_range_item.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp @@ -10,44 +10,49 @@ using namespace sycl; int main() { queue q; - auto *RedMem = malloc_shared(1, q); - auto *Success = malloc_shared(1, q); - *Success = true; + bool Success_val = true; + int RedMem_val = 0; + sycl::buffer RedMem_buf{&RedMem_val, sycl::range{1}}; + sycl::buffer Success_buf{&Success_val, sycl::range{1}}; + q.submit([&](sycl::handler &h) { + sycl::accessor RedMem{RedMem_buf, h}; + sycl::accessor Success{Success_buf, h}; + h.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), + [=](item<1> Item, auto &Red) { + Red += 1; + if (Item.get_range(0) != 7) + Success[0] = false; + if (Item.get_id(0) == 7) + Success[0] = false; + }); + }); + sycl::host_accessor RedMem{RedMem_buf}; + sycl::host_accessor Success{Success_buf}; + assert(RedMem[0] == 7); + assert(Success[0]); + RedMem[0] = 0; + q.submit([&](sycl::handler &h) { + sycl::accessor RedMem{RedMem_buf, h}; + sycl::accessor Success{Success_buf, h}; + h.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), + [=](item<2> Item, auto &Red) { + Red += 1; + if (Item.get_range(0) != 1030) + *Success = false; + if (Item.get_range(1) != 7) + *Success = false; + + if (Item.get_id(0) == 1030) + *Success = false; + if (Item.get_id(1) == 7) + *Success = false; + }); + }); + sycl::host_accessor RedMem{RedMem_buf}; + sycl::host_accessor Success{Success_buf}; + + assert(RedMem[0] == 1030 * 7); + assert(Success[0]); - *RedMem = 0; - q.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), - [=](item<1> Item, auto &Red) { - Red += 1; - if (Item.get_range(0) != 7) - *Success = false; - if (Item.get_id(0) == 7) - *Success = false; - }) - .wait(); - - assert(*RedMem == 7); - assert(*Success); - - *RedMem = 0; - q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), - [=](item<2> Item, auto &Red) { - Red += 1; - if (Item.get_range(0) != 1030) - *Success = false; - if (Item.get_range(1) != 7) - *Success = false; - - if (Item.get_id(0) == 1030) - *Success = false; - if (Item.get_id(1) == 7) - *Success = false; - }) - .wait(); - - assert(*RedMem == 1030 * 7); - assert(*Success); - - free(RedMem, q); - free(Success, q); return 0; } From ef2a6cc2f83b116a524e5b84677ed20795ff0fb7 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 16:53:52 -0800 Subject: [PATCH 05/19] Fix reduction tests --- sycl/test-e2e/Reduction/reduction_span.cpp | 2 +- sycl/test-e2e/Reduction/reduction_span_pack.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp index 6c06d377eabe2..63f5ec1ece9be 100644 --- a/sycl/test-e2e/Reduction/reduction_span.cpp +++ b/sycl/test-e2e/Reduction/reduction_span.cpp @@ -3,7 +3,7 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - +// REQUIRES: aspect-usm_shared_allocations // This test performs basic checks of reductions initialized with a sycl::span #include diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp index 46862ffe45cf9..706dca1363dd3 100644 --- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp +++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp @@ -6,7 +6,7 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - +// REQUIRES: aspect-usm_shared_allocations // This test performs basic checks of reductions initialized with a pack // containing at least one sycl::span From 604893fc9c94c48c22cc341db19a35708bde3c24 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 16:59:36 -0800 Subject: [PATCH 06/19] Fix reduction tests --- .../Reduction/reduction_range_item.cpp | 83 +++++++++---------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp index 3c886063215fa..485959860456d 100644 --- a/sycl/test-e2e/Reduction/reduction_range_item.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp @@ -3,56 +3,51 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - +// REQUIRES: aspect-usm_shared_allocations #include using namespace sycl; int main() { queue q; - bool Success_val = true; - int RedMem_val = 0; - sycl::buffer RedMem_buf{&RedMem_val, sycl::range{1}}; - sycl::buffer Success_buf{&Success_val, sycl::range{1}}; - q.submit([&](sycl::handler &h) { - sycl::accessor RedMem{RedMem_buf, h}; - sycl::accessor Success{Success_buf, h}; - h.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), - [=](item<1> Item, auto &Red) { - Red += 1; - if (Item.get_range(0) != 7) - Success[0] = false; - if (Item.get_id(0) == 7) - Success[0] = false; - }); - }); - sycl::host_accessor RedMem{RedMem_buf}; - sycl::host_accessor Success{Success_buf}; - assert(RedMem[0] == 7); - assert(Success[0]); - RedMem[0] = 0; - q.submit([&](sycl::handler &h) { - sycl::accessor RedMem{RedMem_buf, h}; - sycl::accessor Success{Success_buf, h}; - h.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), - [=](item<2> Item, auto &Red) { - Red += 1; - if (Item.get_range(0) != 1030) - *Success = false; - if (Item.get_range(1) != 7) - *Success = false; - - if (Item.get_id(0) == 1030) - *Success = false; - if (Item.get_id(1) == 7) - *Success = false; - }); - }); - sycl::host_accessor RedMem{RedMem_buf}; - sycl::host_accessor Success{Success_buf}; - - assert(RedMem[0] == 1030 * 7); - assert(Success[0]); + auto *RedMem = malloc_shared(1, q); + auto *Success = malloc_shared(1, q); + *Success = true; + + *RedMem = 0; + q.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), + [=](item<1> Item, auto &Red) { + Red += 1; + if (Item.get_range(0) != 7) + *Success = false; + if (Item.get_id(0) == 7) + *Success = false; + }) + .wait(); + + assert(*RedMem == 7); + assert(*Success); + + *RedMem = 0; + q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), + [=](item<2> Item, auto &Red) { + Red += 1; + if (Item.get_range(0) != 1030) + *Success = false; + if (Item.get_range(1) != 7) + *Success = false; + + if (Item.get_id(0) == 1030) + *Success = false; + if (Item.get_id(1) == 7) + *Success = false; + }) + .wait(); + + assert(*RedMem == 1030 * 7); + assert(*Success); + free(RedMem, q); + free(Success, q); return 0; } From 406ceb0e616b7ae8a583cd57d0e20a558f455fc5 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 19:13:02 -0800 Subject: [PATCH 07/19] Revert disable-caching to use usm --- sycl/test-e2e/KernelAndProgram/disable-caching.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index b48467af03a63..cd937a5adf207 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -6,6 +6,7 @@ // RUN: | FileCheck %s // RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 %{run} %t.out \ // RUN: | FileCheck %s --check-prefixes=CHECK-CACHE +// REQUIRES: aspect-usm_shared_allocations #include using namespace sycl; @@ -74,15 +75,16 @@ int main() { // CHECK-CACHE: piKernelRelease // CHECK-CACHE: piProgramRelease // CHECK-CACHE: piEventsWait - sycl::buffer p_buf{sycl::range{1}}; + auto *p = malloc_shared(1, q); for (int i = 0; i < 2; ++i) q.submit([&](handler &cgh) { - sycl::accessor p{p_buf, cgh}; cgh.set_specialization_constant(i); cgh.parallel_for(1, [=](auto, kernel_handler kh) { - p[0] = kh.get_specialization_constant(); + *p = kh.get_specialization_constant(); }); }).wait(); + + free(p, q); } // (Program cache releases) From c2ba1ad50f8d6fcd2b46f42e4710d053af8542bf Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 19:52:06 -0800 Subject: [PATCH 08/19] Minor fixes --- .../Regression/exclusive-scan-char-short.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp index a7d3601210fa8..743d7de3b6d85 100644 --- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp +++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp @@ -16,24 +16,27 @@ int n_fail = 0; template void test() { auto op = OpT(); auto init = sycl::known_identity_v; - auto *p = malloc_shared(1, q); - *p = 0; + T p_val = 0; + sycl::buffer p_buf{&p_val, sycl::range{1}}; + sycl::accessor p{p_buf}; T ref; - emu::exclusive_scan(p, p + 1, &ref, init, op); + emu::exclusive_scan(p.begin(), p.end(), &ref, init, op); range r(1); - q.parallel_for(nd_range(r, r), [=](nd_item<1> it) { - auto g = it.get_group(); - *p = exclusive_scan_over_group(g, *p, op); - }).wait(); - - if (*p != ref) { + q.submit([&](sycl::handler &h) { + h.require(p); + h.parallel_for(nd_range(r, r), [=](nd_item<1> it) { + auto g = it.get_group(); + p[0] = exclusive_scan_over_group(g, p[0], op); + }); + }); + sycl::host_accessor p_host{p_buf}; + if (p_host[0] != ref) { std::cout << "test " << cur_test << " fail\n"; - std::cout << "got: " << int(*p) << "\n"; + std::cout << "got: " << int(p_host[0]) << "\n"; std::cout << "expected: " << int(ref) << "\n\n"; ++n_fail; } ++cur_test; - free(p, q); } int main() { From f3b2142f098c00524d8c03c99f112c585b020b1c Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 21:15:34 -0800 Subject: [PATCH 09/19] Add more tests --- .../Regression/exclusive-scan-char-short.cpp | 25 +++++----- .../Regression/group_local_linear_id.cpp | 20 ++++---- sycl/test-e2e/Regression/half_operators.cpp | 46 +++++++++---------- 3 files changed, 43 insertions(+), 48 deletions(-) diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp index 743d7de3b6d85..a7d3601210fa8 100644 --- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp +++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp @@ -16,27 +16,24 @@ int n_fail = 0; template void test() { auto op = OpT(); auto init = sycl::known_identity_v; - T p_val = 0; - sycl::buffer p_buf{&p_val, sycl::range{1}}; - sycl::accessor p{p_buf}; + auto *p = malloc_shared(1, q); + *p = 0; T ref; - emu::exclusive_scan(p.begin(), p.end(), &ref, init, op); + emu::exclusive_scan(p, p + 1, &ref, init, op); range r(1); - q.submit([&](sycl::handler &h) { - h.require(p); - h.parallel_for(nd_range(r, r), [=](nd_item<1> it) { - auto g = it.get_group(); - p[0] = exclusive_scan_over_group(g, p[0], op); - }); - }); - sycl::host_accessor p_host{p_buf}; - if (p_host[0] != ref) { + q.parallel_for(nd_range(r, r), [=](nd_item<1> it) { + auto g = it.get_group(); + *p = exclusive_scan_over_group(g, *p, op); + }).wait(); + + if (*p != ref) { std::cout << "test " << cur_test << " fail\n"; - std::cout << "got: " << int(p_host[0]) << "\n"; + std::cout << "got: " << int(*p) << "\n"; std::cout << "expected: " << int(ref) << "\n\n"; ++n_fail; } ++cur_test; + free(p, q); } int main() { diff --git a/sycl/test-e2e/Regression/group_local_linear_id.cpp b/sycl/test-e2e/Regression/group_local_linear_id.cpp index d3562c2b25cbf..8fc216f76df31 100644 --- a/sycl/test-e2e/Regression/group_local_linear_id.cpp +++ b/sycl/test-e2e/Regression/group_local_linear_id.cpp @@ -11,14 +11,17 @@ int main() { const sycl::range<3> GlobalRange(2, 8, 16); const sycl::range<3> LocalRange(2, 4, 4); sycl::queue Q; - bool *ReadSame = sycl::malloc_shared(GlobalRange.size(), Q); - Q.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange}, - [=](sycl::nd_item<3> Item) { - ReadSame[Item.get_global_linear_id()] = - Item.get_local_linear_id() == - Item.get_group().get_local_linear_id(); - }) - .wait(); + sycl::buffer ReadSame_buf{GlobalRange.size()}; + Q.submit([&](sycl::handler &h) { + sycl::accessor ReadSame{ReadSame_buf, h}; + h.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange}, + [=](sycl::nd_item<3> Item) { + ReadSame[Item.get_global_linear_id()] = + Item.get_local_linear_id() == + Item.get_group().get_local_linear_id(); + }); + }); + sycl::host_accessor ReadSame{ReadSame_buf}; int Failures = 0; for (size_t I = 0; I < GlobalRange.size(); ++I) { if (ReadSame[I]) @@ -26,6 +29,5 @@ int main() { ++Failures; std::cout << "Read mismatch at index " << I << std::endl; } - sycl::free(ReadSame, Q); return Failures; } diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp index b227806a02bc0..7b663efd13e34 100644 --- a/sycl/test-e2e/Regression/half_operators.cpp +++ b/sycl/test-e2e/Regression/half_operators.cpp @@ -7,11 +7,6 @@ using namespace sycl; -template -using shared_allocator = sycl::usm_allocator; - -template using shared_vector = std::vector>; - template bool are_bitwise_equal(T lhs, T rhs) { constexpr size_t size{sizeof(T)}; @@ -33,33 +28,34 @@ template bool test(sycl::queue &queue) { static const T inexact = static_cast(0.1); - shared_vector result_source{NumElems, shared_allocator{queue}}; - shared_vector input{NumElems, shared_allocator{queue}}; + std::vector result_source_vec{NumElems}; + std::vector input_vec{NumElems}; for (size_t i = 0; i < NumElems; ++i) { - input[i] = inexact * i; + input_vec[i] = inexact * i; } - - queue.submit([&](sycl::handler &cgh) { - auto out_source = result_source.data(); - auto in = input.data(); - - cgh.single_task<>([=]() { - for (size_t i = 0; i < NumElems; ++i) { - auto source = in[i]; - ++source; - out_source[i] = source; - } + { + sycl::buffer result_source_buf{result_source_vec}; + sycl::buffer input_buf{input_vec}; + queue.submit([&](sycl::handler &cgh) { + sycl::accessor out_source{result_source_buf, cgh}; + sycl::accessor in{input_buf, cgh}; + cgh.single_task<>([=]() { + for (size_t i = 0; i < NumElems; ++i) { + auto source = in[i]; + ++source; + out_source[i] = source; + } + }); }); - }); - queue.wait_and_throw(); - + queue.wait_and_throw(); + } // buffers go out of scope here and write back to the vectors for (size_t i = 0; i < NumElems; ++i) { - T expected_value = input[i] + 1; + T expected_value = input_vec[i] + 1; - if (!are_bitwise_equal(expected_value, result_source[i])) { + if (!are_bitwise_equal(expected_value, result_source_vec[i])) { pass = false; - std::cout << "Sample failed retrieved value: " << result_source[i] + std::cout << "Sample failed retrieved value: " << result_source_vec[i] << ", but expected: " << expected_value << ", at index: " << i << std::endl; } From 01a6431eebb629316fd73cc3791e03eeaafbf9cf Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 21:16:51 -0800 Subject: [PATCH 10/19] Add more tests --- sycl/test-e2e/Regression/exclusive-scan-char-short.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp index a7d3601210fa8..81e5ddc8b1a27 100644 --- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp +++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out - +// REQUIRES: aspect-usm_shared_allocations // This test ensures the result computed by exclusive_scan_over_group // for the first work item when given a short or char argument with // the maximum or minimum operator is computed correctly. From c73e919136ba236c80c5b8e0838bc6e92cc2d644 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 22:33:59 -0800 Subject: [PATCH 11/19] Fix vector initialization bug --- sycl/test-e2e/Regression/half_operators.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp index 7b663efd13e34..7309a032ea35a 100644 --- a/sycl/test-e2e/Regression/half_operators.cpp +++ b/sycl/test-e2e/Regression/half_operators.cpp @@ -28,8 +28,8 @@ template bool test(sycl::queue &queue) { static const T inexact = static_cast(0.1); - std::vector result_source_vec{NumElems}; - std::vector input_vec{NumElems}; + std::vector result_source_vec(NumElems); + std::vector input_vec(NumElems);; for (size_t i = 0; i < NumElems; ++i) { input_vec[i] = inexact * i; From f2bde72fd392c2d37f29b1fe8b0da9902ef8104c Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Wed, 7 Feb 2024 22:55:09 -0800 Subject: [PATCH 12/19] Fix formatting --- sycl/test-e2e/Regression/half_operators.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp index 7309a032ea35a..14f6924260ec6 100644 --- a/sycl/test-e2e/Regression/half_operators.cpp +++ b/sycl/test-e2e/Regression/half_operators.cpp @@ -29,7 +29,7 @@ template bool test(sycl::queue &queue) { static const T inexact = static_cast(0.1); std::vector result_source_vec(NumElems); - std::vector input_vec(NumElems);; + std::vector input_vec(NumElems); for (size_t i = 0; i < NumElems; ++i) { input_vec[i] = inexact * i; From ae3e35effaf65bcdc885dd6a3f31687f7f0d13e7 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Thu, 8 Feb 2024 07:02:57 -0800 Subject: [PATCH 13/19] Change malloc_shared to malloc_device --- .../sync_two_queues_event_dep.cpp | 54 ++++++++----------- .../test-e2e/KernelFusion/sync_usm_mem_op.cpp | 26 ++++----- 2 files changed, 37 insertions(+), 43 deletions(-) diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index e6096f231ef9b..13b199c2bc0ba 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -13,37 +13,29 @@ int main() { queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}}; queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}}; - int in1_arr[dataSize]; - int in2_arr[dataSize]; - int in3_arr[dataSize]; - int tmp_arr[dataSize]; - int out_arr[dataSize]; - for (size_t i = 0; i < dataSize; ++i) { - in1_arr[i] = i * 2; - in2_arr[i] = i * 3; - in3_arr[i] = i * 4; - tmp_arr[i] = -1; - out_arr[i] = -1; - } - sycl::buffer in1_buf{&in1_arr[0], sycl::range{dataSize}}; - sycl::buffer in2_buf{&in2_arr[0], sycl::range{dataSize}}; - sycl::buffer in3_buf{&in3_arr[0], sycl::range{dataSize}}; - sycl::buffer tmp_buf{&tmp_arr[0], sycl::range{dataSize}}; - sycl::buffer out_buf{&out_arr[0], sycl::range{dataSize}}; - sycl::accessor in1{in1_buf}; - sycl::accessor in2{in2_buf}; - sycl::accessor in3{in3_buf}; - sycl::accessor tmp{tmp_buf}; - sycl::accessor out{out_buf}; + + int *in1 = sycl::malloc_device(dataSize, q1); + int *in2 = sycl::malloc_device(dataSize, q1); + int *in3 = sycl::malloc_device(dataSize, q1); + int *tmp = sycl::malloc_device(dataSize, q1); + int *out = sycl::malloc_device(dataSize, q1); + + q1.single_task([=]() { + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + }).wait(); + ext::codeplay::experimental::fusion_wrapper fw1{q1}; fw1.start_fusion(); assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode"); auto kernel1 = q1.submit([&](handler &cgh) { - cgh.require(in1); - cgh.require(in2); - cgh.require(tmp); cgh.parallel_for( dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); }); @@ -52,7 +44,6 @@ int main() { fw2.start_fusion(); auto kernel3 = q2.submit([&](handler &cgh) { - cgh.require(tmp); cgh.depends_on(kernel1); cgh.parallel_for(dataSize, [=](id<1> i) { tmp[i] *= 2; }); @@ -66,9 +57,6 @@ int main() { assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode"); auto kernel2 = q1.submit([&](handler &cgh) { - cgh.require(tmp); - cgh.require(in3); - cgh.require(out); cgh.depends_on(kernel3); cgh.parallel_for( dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); @@ -85,11 +73,15 @@ int main() { q1.wait(); q2.wait(); - sycl::host_accessor out_host{out_buf}; // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out_host[i] == (40 * i * i) && "Computation error"); + assert(out[i] == (40 * i * i) && "Computation error"); } + sycl::free(in1, q1); + sycl::free(in2, q1); + sycl::free(in3, q1); + sycl::free(tmp, q1); + sycl::free(out, q1); return 0; } diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp index 1728ca8dd6002..a2899ff41d375 100644 --- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp @@ -1,6 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s -// REQUIRES: aspect-usm_shared_allocations // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows @@ -16,21 +15,24 @@ int main() { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - int *in1 = sycl::malloc_shared(dataSize, q); - int *in2 = sycl::malloc_shared(dataSize, q); - int *in3 = sycl::malloc_shared(dataSize, q); - int *tmp = sycl::malloc_shared(dataSize, q); - int *out = sycl::malloc_shared(dataSize, q); + int *in1 = sycl::malloc_device(dataSize, q); + int *in2 = sycl::malloc_device(dataSize, q); + int *in3 = sycl::malloc_device(dataSize, q); + int *tmp = sycl::malloc_device(dataSize, q); + int *out = sycl::malloc_device(dataSize, q); int dst[dataSize]; - for (size_t i = 0; i < dataSize; ++i) { - in1[i] = i * 2; - in2[i] = i * 3; - in3[i] = i * 4; - tmp[i] = -1; - out[i] = -1; dst[i] = -1; } + q.single_task([=]() { + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + }).wait(); ext::codeplay::experimental::fusion_wrapper fw{q}; fw.start_fusion(); From 6d25416fb67ab2e2e93896a4ecafb49a03628d8f Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Thu, 8 Feb 2024 08:24:10 -0800 Subject: [PATCH 14/19] Fix access of device USM ptr from host --- sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp | 5 ++++- sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 13b199c2bc0ba..79347114ec2a4 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -73,9 +73,12 @@ int main() { q1.wait(); q2.wait(); + int host_out[dataSize]; + q1.memcpy(host_out, out, dataSize * sizeof(int)); + q1.wait(); // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (40 * i * i) && "Computation error"); + assert(host_out[i] == (40 * i * i) && "Computation error"); } sycl::free(in1, q1); sycl::free(in2, q1); diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp index a2899ff41d375..f3d2cc97b9ae5 100644 --- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp @@ -61,14 +61,16 @@ int main() { "Queue should not be in fusion mode anymore"); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); - + int host_out[dataSize]; + q.memcpy(host_out, out, dataSize * sizeof(int)); + q.wait(); for (size_t i = 0; i < dataSize; ++i) { - std::cout << out[i] << ", "; + std::cout << host_out[i] << ", "; } std::cout << "\n"; // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (20 * i * i) && "Computation error"); + assert(host_out[i] == (20 * i * i) && "Computation error"); assert(dst[i] == (5 * i) && "Computation error"); } From f0d325f56892730803eeeb817421c4d3c8e6ff68 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Thu, 8 Feb 2024 11:52:28 -0800 Subject: [PATCH 15/19] Finish off rewriting regression tests --- sycl/test-e2e/Regression/pf-wg-atomic64.cpp | 5 +++-- .../Regression/range-rounding-this-id.cpp | 18 +++++++++++------- .../Regression/reduction_64bit_atomic64.cpp | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp index f2985b5a33b1d..d7adc4e6c153b 100644 --- a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp +++ b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp @@ -10,11 +10,12 @@ using AtomicRefT = int main() { queue q; - auto *p = malloc_shared(1, q); + sycl::buffer p_buf{sycl::range{1}}; try { q.submit([&](sycl::handler &cgh) { + sycl::accessor p{p_buf, cgh}; cgh.parallel_for_work_group(range{1}, range{1}, [=](group<1>) { - AtomicRefT feature(*p); + AtomicRefT feature(p[0]); feature += 42; }); }).wait(); diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp index 33fa41c60cc68..5e3baf2355084 100644 --- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp +++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp @@ -30,13 +30,17 @@ template void test(queue &q) { id this_id; id ref_id; }; - std::vector> vec(range.size(), q); - auto *p = vec.data(); - q.parallel_for(range, [=](auto it) { - p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id(), - it.get_id()}; - }).wait_and_throw(); - + std::vector vec(range.size()); + { + sycl::buffer p_buf{vec}; + q.submit([&](sycl::handler &h) { + sycl::accessor p{p_buf, h}; + q.parallel_for(range, [=](auto it) { + p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id(), + it.get_id()}; + }); + }).wait_and_throw(); + } // p_buf goes out of scope here and writed back to vec for (const auto &[this_item, ref_item] : vec) { if (this_item != ref_item) { std::cout << "fail: " << this_item << " != " << ref_item << "\n"; diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp index dc138d9b79da8..11affac9df7b9 100644 --- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp +++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp @@ -2,7 +2,7 @@ // RUN: %{build} -o %t.out // // RUN: %{run} %t.out - +// REQUIRES: aspect-usm_shared_allocations // Tests that a previously known case for reduction doesn't cause a requirement // for atomic64. // TODO: When aspect requirements are added to testing, this test could be set From ceff46aaf3a120d0a8e0f27ce7c7098b517cfffe Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Thu, 8 Feb 2024 13:18:47 -0800 Subject: [PATCH 16/19] Fix runtime error --- sycl/test-e2e/Regression/range-rounding-this-id.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp index 5e3baf2355084..a5970adf46e41 100644 --- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp +++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp @@ -35,7 +35,7 @@ template void test(queue &q) { sycl::buffer p_buf{vec}; q.submit([&](sycl::handler &h) { sycl::accessor p{p_buf, h}; - q.parallel_for(range, [=](auto it) { + h.parallel_for(range, [=](auto it) { p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id(), it.get_id()}; }); From 971e87545adad9f3378787aa2de8e26e98e0ff0f Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Fri, 9 Feb 2024 15:49:12 -0800 Subject: [PATCH 17/19] Rewrite tests using malloc_device and fix casing --- sycl/test-e2e/DiscardEvents/invalid_event.cpp | 5 ++-- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 12 ++++---- .../InOrderEventsExt/set_external_event.cpp | 6 ++-- .../test-e2e/Reduction/reduction_internal.cpp | 6 ++-- .../Reduction/reduction_range_item.cpp | 25 +++++++++------- sycl/test-e2e/Reduction/reduction_span.cpp | 9 +++--- .../Reduction/reduction_span_pack.cpp | 29 ++++++++++--------- .../Regression/reduction_64bit_atomic64.cpp | 3 +- 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp index 4fc0ee0f1495b..17c6a492a10c2 100644 --- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp +++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp @@ -2,7 +2,6 @@ // https://github.com/intel/llvm/issues/7330. // UNSUPPORTED: opencl && gpu // RUN: %{build} -o %t.out -// REQUIRES: aspect-usm_shared_allocations // RUN: %{run} %t.out // The test checks that each PI call to the queue returns a discarded event @@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) { sycl::range<1> range(BUFFER_SIZE); auto Dev = Q.get_device(); - int *x = sycl::malloc_shared(BUFFER_SIZE, Q); + int *x = sycl::malloc_device(BUFFER_SIZE, Q); assert(x != nullptr); - int *y = sycl::malloc_shared(BUFFER_SIZE, Q); + int *y = sycl::malloc_device(BUFFER_SIZE, Q); assert(y != nullptr); sycl::event DiscardedEvent; diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index 584ecfd22fe0b..e346ea142b759 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -41,10 +41,10 @@ void testRootGroup() { max_num_work_group_sync>(q); const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; - sycl::buffer data_buf{sycl::range{maxWGs * WorkGroupSize}}; + sycl::buffer dataBuf{sycl::range{maxWGs * WorkGroupSize}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; q.submit([&](sycl::handler &h) { - sycl::accessor data{data_buf, h}; + sycl::accessor data{dataBuf, h}; h.parallel_for( range, props, [=](sycl::nd_item<1> it) { auto root = it.ext_oneapi_get_root_group(); @@ -59,7 +59,7 @@ void testRootGroup() { data[root.get_local_id()] = sum; }); }); - sycl::host_accessor data{data_buf}; + sycl::host_accessor data{dataBuf}; const int workItemCount = static_cast(range.get_global_range().size()); for (int i = 0; i < workItemCount; i++) { assert(data[i] == (workItemCount - 1)); @@ -78,10 +78,10 @@ void testRootGroupFunctions() { sycl::ext::oneapi::experimental::use_root_sync}; constexpr int testCount = 10; - sycl::buffer testResults_buf{sycl::range{testCount}}; + sycl::buffer testResultsBuf{sycl::range{testCount}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; q.submit([&](sycl::handler &h) { - sycl::accessor testResults{testResults_buf, h}; + sycl::accessor testResults{testResultsBuf, h}; h.parallel_for( range, props, [=](sycl::nd_item<1> it) { const auto root = it.ext_oneapi_get_root_group(); @@ -113,7 +113,7 @@ void testRootGroupFunctions() { } }); }); - sycl::host_accessor testResults{testResults_buf}; + sycl::host_accessor testResults{testResultsBuf}; for (int i = 0; i < testCount; i++) { assert(testResults[i]); } diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp index 4f2a409fa3d46..61987d5b5b3cb 100644 --- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp +++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp @@ -15,8 +15,8 @@ int main() { sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}}; sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}}; - sycl::buffer DevDatabuf{sycl::range{N}}; - sycl::accessor DevData{DevDatabuf}; + sycl::buffer DevDataBuf{sycl::range{N}}; + sycl::accessor DevData{DevDataBuf}; int *HostData = (int *)malloc(N * sizeof(int) * 10); for (size_t I = 0; I < 10; ++I) { @@ -52,6 +52,6 @@ int main() { } } } - + free(HostData); return Failures; } diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp index e16f8d750a154..39c64932802da 100644 --- a/sycl/test-e2e/Reduction/reduction_internal.cpp +++ b/sycl/test-e2e/Reduction/reduction_internal.cpp @@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) { cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl, [=](auto Item, auto &Red) { Red.combine(T{1}); }); }).wait(); - sycl::buffer Result_buf{sycl::range{1}}; + sycl::buffer ResultBuf{sycl::range{1}}; q.submit([&](handler &cgh) { - sycl::accessor Result{Result_buf, cgh}; + sycl::accessor Result{ResultBuf, cgh}; auto RedAcc = GetRedAcc(cgh); cgh.single_task([=]() { Result[0] = RedAcc[0]; }); }); - sycl::host_accessor Result{Result_buf}; + sycl::host_accessor Result{ResultBuf}; auto N = get_global_range(Range).size(); int Expected = InitToIdentity ? N : Init + N; #if defined(__PRETTY_FUNCTION__) diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp index 485959860456d..06d4f33280f06 100644 --- a/sycl/test-e2e/Reduction/reduction_range_item.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp @@ -3,15 +3,14 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows -// REQUIRES: aspect-usm_shared_allocations #include using namespace sycl; int main() { queue q; - auto *RedMem = malloc_shared(1, q); - auto *Success = malloc_shared(1, q); + auto *RedMem = malloc_device(1, q); + auto *Success = malloc_device(1, q); *Success = true; *RedMem = 0; @@ -24,11 +23,15 @@ int main() { *Success = false; }) .wait(); - - assert(*RedMem == 7); - assert(*Success); - - *RedMem = 0; + int RedMemHost; + bool SuccessHost; + q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait(); + q.memcpy(&SuccessHost, Success, sizeof(bool)).wait(); + assert(RedMemHost == 7); + assert(SuccessHost); + + RedMemHost = 0; + q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait(); q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), [=](item<2> Item, auto &Red) { Red += 1; @@ -44,8 +47,10 @@ int main() { }) .wait(); - assert(*RedMem == 1030 * 7); - assert(*Success); + q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait(); + q.memcpy(&SuccessHost, Success, sizeof(bool)).wait(); + assert(RedMemHost == 1030 * 7); + assert(SuccessHost); free(RedMem, q); free(Success, q); diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp index 63f5ec1ece9be..23ee61e805f9b 100644 --- a/sycl/test-e2e/Reduction/reduction_span.cpp +++ b/sycl/test-e2e/Reduction/reduction_span.cpp @@ -3,7 +3,6 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows -// REQUIRES: aspect-usm_shared_allocations // This test performs basic checks of reductions initialized with a sycl::span #include @@ -44,7 +43,7 @@ template (N, Q); + T *Output = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -70,11 +69,13 @@ void test(queue Q, Range Rng, T Identity, T Value) { } bool Passed = true; + T OutputHost[N]; + Q.memcpy(OutputHost, Output, N * sizeof(T)).wait(); for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output[I] == Expected); + Passed &= (OutputHost[I] == Expected); } else { - Passed &= (Output[I] == ExpectedRemainder); + Passed &= (OutputHost[I] == ExpectedRemainder); } } diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp index 706dca1363dd3..b6e118c081cb0 100644 --- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp +++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp @@ -6,7 +6,6 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows -// REQUIRES: aspect-usm_shared_allocations // This test performs basic checks of reductions initialized with a pack // containing at least one sycl::span @@ -49,9 +48,9 @@ template (1, Q); + int *Sum = malloc_device(1, Q); Q.single_task([=]() { *Sum = 0; }).wait(); - T *Output = malloc_shared(N, Q); + T *Output = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -81,11 +80,13 @@ void test1(queue Q, Range Rng, T Identity, T Value) { } bool Passed = true; + T OutputHost[N]; + Q.memcpy(OutputHost, Output, N * sizeof(T)).wait(); for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output[I] == Expected); + Passed &= (OutputHost[I] == Expected); } else { - Passed &= (Output[I] == ExpectedRemainder); + Passed &= (OutputHost[I] == ExpectedRemainder); } } Passed &= (*Sum == Size); @@ -101,9 +102,9 @@ template (N, Q); + int *Output1 = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait(); - T *Output2 = malloc_shared(N, Q); + T *Output2 = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -121,7 +122,10 @@ void test2(queue Q, Range Rng, T Identity, T Value) { } else /*if (SubmissionMode == submission_mode::queue) */ { Q.parallel_for(Rng, Redu1, Redu2, Kern).wait(); } - + int Output1Host[N]; + T Output2Host[N]; + Q.memcpy(Output1Host, Output1, N * sizeof(int)).wait(); + Q.memcpy(Output2Host, Output2, N * sizeof(T)).wait(); size_t Size = getLinearSize(Rng); bool Passed = true; // Span1 @@ -132,12 +136,11 @@ void test2(queue Q, Range Rng, T Identity, T Value) { ExpectedRemainder = Expected; Expected += 1; } - for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output1[I] == Expected); + Passed &= (Output1Host[I] == Expected); } else { - Passed &= (Output1[I] == ExpectedRemainder); + Passed &= (Output1Host[I] == ExpectedRemainder); } } } @@ -153,9 +156,9 @@ void test2(queue Q, Range Rng, T Identity, T Value) { for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output2[I] == Expected); + Passed &= (Output2Host[I] == Expected); } else { - Passed &= (Output2[I] == ExpectedRemainder); + Passed &= (Output2Host[I] == ExpectedRemainder); } } } diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp index 11affac9df7b9..c2d0d3e84ca7c 100644 --- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp +++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp @@ -2,7 +2,6 @@ // RUN: %{build} -o %t.out // // RUN: %{run} %t.out -// REQUIRES: aspect-usm_shared_allocations // Tests that a previously known case for reduction doesn't cause a requirement // for atomic64. // TODO: When aspect requirements are added to testing, this test could be set @@ -19,7 +18,7 @@ using namespace sycl; int main() { queue Q; - long long *Out = malloc_shared(1, Q); + long long *Out = malloc_device(1, Q); // Case 1: nd_range reduction with 64-bit integer and either sycl::plus, // sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy From d7803db15dc5cfd231ee0ee11d7d339e0dff7cf7 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Fri, 9 Feb 2024 15:50:57 -0800 Subject: [PATCH 18/19] Rewrite tests using malloc_device and fix casing --- sycl/test-e2e/KernelAndProgram/disable-caching.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index cd937a5adf207..c50e23b1c17c4 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -6,7 +6,6 @@ // RUN: | FileCheck %s // RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 %{run} %t.out \ // RUN: | FileCheck %s --check-prefixes=CHECK-CACHE -// REQUIRES: aspect-usm_shared_allocations #include using namespace sycl; @@ -75,7 +74,7 @@ int main() { // CHECK-CACHE: piKernelRelease // CHECK-CACHE: piProgramRelease // CHECK-CACHE: piEventsWait - auto *p = malloc_shared(1, q); + auto *p = malloc_device(1, q); for (int i = 0; i < 2; ++i) q.submit([&](handler &cgh) { cgh.set_specialization_constant(i); From fd26740d033551f6b6b7b91984452ceebbf1fae2 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Fri, 9 Feb 2024 16:24:04 -0800 Subject: [PATCH 19/19] Fix failures --- sycl/test-e2e/Reduction/reduction_range_item.cpp | 11 ++++++----- sycl/test-e2e/Reduction/reduction_span_pack.cpp | 4 +++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp index 06d4f33280f06..93fe915ac0254 100644 --- a/sycl/test-e2e/Reduction/reduction_range_item.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp @@ -11,9 +11,12 @@ int main() { queue q; auto *RedMem = malloc_device(1, q); auto *Success = malloc_device(1, q); - *Success = true; - - *RedMem = 0; + int RedMemHost; + bool SuccessHost; + RedMemHost = 0; + SuccessHost = true; + q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait(); + q.memcpy(Success, &SuccessHost, sizeof(bool)).wait(); q.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), [=](item<1> Item, auto &Red) { Red += 1; @@ -23,8 +26,6 @@ int main() { *Success = false; }) .wait(); - int RedMemHost; - bool SuccessHost; q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait(); q.memcpy(&SuccessHost, Success, sizeof(bool)).wait(); assert(RedMemHost == 7); diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp index b6e118c081cb0..4bc4cfb5f9f14 100644 --- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp +++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp @@ -89,7 +89,9 @@ void test1(queue Q, Range Rng, T Identity, T Value) { Passed &= (OutputHost[I] == ExpectedRemainder); } } - Passed &= (*Sum == Size); + int SumHost; + Q.memcpy(&SumHost, Sum, sizeof(int)).wait(); + Passed &= (SumHost == Size); free(Output, Q); free(Sum, Q);