From 56d6e0334d3e8a73bad89679353012511be7a67c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 22 Apr 2024 12:17:29 +0100
Subject: [PATCH 1/3] [SYCL][E2E] Fix some tests in multi-device mode

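The atomic tests can be run on multiple SYCL devices, in which case the
compiler needs to be told which target the `--cuda-gpu-arch` option
applies to. Pass it via `-Xsycl-target-backend=nvptx64-nvidia-cuda` so
that it is applied to the CUDA target only.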

The changes to the bfloat16 tests try to restore the earlier behaviour,
in which the `aspect-ext_oneapi_bfloat16_math_functions` requirement
prevented other devices from running the tests. This patch also removes
the RUN lines and related lit directives from a shared header file
(which we don't consider a valid test file), as they may mislead people
into thinking that the header is itself a runnable test.
---
 sycl/test-e2e/BFloat16/bfloat16_builtins.hpp      | 15 ---------------
 .../BFloat16/bfloat16_builtins_cuda_generic.cpp   |  1 +
 sycl/test-e2e/syclcompat/atomic/atomic_class.cpp  |  2 +-
 .../syclcompat/atomic/atomic_memory_acq_rel.cpp   |  2 +-
 4 files changed, 3 insertions(+), 17 deletions(-)
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
index df0716a4c3b5a..0452129d439c9 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
@@ -4,21 +4,6 @@
 // + sm_80 and above uses some native bfloat16 math instructions
 // + below sm_80 always uses generic impls
 
-// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// REQUIRES: aspect-ext_oneapi_bfloat16_math_functions
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_80 %} %s -o %t.out %{mathflags}
-// RUN: %{run} %t.out
-
-// Test "new" (ABI breaking) for all platforms ( sm_80/native if CUDA )
-// RUN:  %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_80 %} %s -o %t2.out %{mathflags} %}
-// RUN:  %if preview-breaking-changes-supported %{  %{run} %t2.out  %}
-
-// If CUDA, test "new" again for sm_75/generic
-// RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple}  -Xsycl-target-backend --cuda-gpu-arch=sm_75  %s -o %t3.out %{mathflags} %} %}
-// RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %{run} %t3.out  %} %}
-
-// Currently the feature isn't supported on FPGA.
-// UNSUPPORTED: accelerator
 #include <sycl/sycl.hpp>
 
 #include <cmath>
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
index 3e2b6b1aae433..ced9093b6b075 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
@@ -5,6 +5,7 @@
 // + below sm_80 always uses generic impls
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
+// REQUIRES: aspect-ext_oneapi_bfloat16_math_functions
 
 // If CUDA, test "new" again for sm_75/generic
 // RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple}  -Xsycl-target-backend --cuda-gpu-arch=sm_75  %s -o %t3.out %{mathflags} %} %}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
index d004a37bcc610..4e574a556887c 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
@@ -32,7 +32,7 @@
 
 // UNSUPPORTED: hip || (windows && level_zero)
 
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_70 %} %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <sycl/sycl.hpp>
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
index 2acaa85022f0b..047d490140760 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
@@ -32,7 +32,7 @@
 
 // UNSUPPORTED: hip
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_70 %} %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <iostream>

From 7ce174aa86ebeaa49dbccbdf7e1b0c6cc0fdcab3 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 25 Apr 2024 15:32:48 +0100
Subject: [PATCH 2/3] feedback: remove aspect; unsupported cpu

---
 sycl/test-e2e/BFloat16/bfloat16_builtins.cpp              | 4 ++--
 sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
index 72eda9978f434..6a4010aed17f5 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
@@ -5,7 +5,6 @@
 // + below sm_80 always uses generic impls
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// REQUIRES: aspect-ext_oneapi_bfloat16_math_functions
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_80 %} %s -o %t.out %{mathflags}
 // RUN: %{run} %t.out
 
@@ -14,7 +13,8 @@
 // RUN:  %if preview-breaking-changes-supported %{  %{run} %t2.out  %}
 
 // Currently the feature isn't supported on FPGA.
-// UNSUPPORTED: accelerator
+// FIXME: enable opaque pointers support on CPU.
+// UNSUPPORTED: accelerator, cpu
 #include "bfloat16_builtins.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
index ced9093b6b075..844c5b3c69684 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
@@ -5,14 +5,14 @@
 // + below sm_80 always uses generic impls
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// REQUIRES: aspect-ext_oneapi_bfloat16_math_functions
 
 // If CUDA, test "new" again for sm_75/generic
 // RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple}  -Xsycl-target-backend --cuda-gpu-arch=sm_75  %s -o %t3.out %{mathflags} %} %}
 // RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %{run} %t3.out  %} %}
 
 // Currently the feature isn't supported on FPGA.
-// UNSUPPORTED: accelerator
+// FIXME: enable opaque pointers support on CPU.
+// UNSUPPORTED: accelerator, cpu
 #include "bfloat16_builtins.hpp"
 
 int main() {

From 4dd6916f4c78d3fd25d7975f73278f0a2a5fc6ed Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 10 Jul 2024 18:39:08 +0100
Subject: [PATCH 3/3] update: xfail another test

---
 sycl/test-e2e/BFloat16/bfloat16_builtins.cpp              | 3 ++-
 sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp | 3 ++-
 sycl/test-e2e/BFloat16/bfloat16_type.cpp                  | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
index 6a4010aed17f5..b96882aa86851 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
@@ -13,8 +13,9 @@
 // RUN:  %if preview-breaking-changes-supported %{  %{run} %t2.out  %}
 
 // Currently the feature isn't supported on FPGA.
+// UNSUPPORTED: accelerator
 // FIXME: enable opaque pointers support on CPU.
-// UNSUPPORTED: accelerator, cpu
+// XFAIL: cpu
 #include "bfloat16_builtins.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
index 844c5b3c69684..06621c8d001ba 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
@@ -11,8 +11,9 @@
 // RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %{run} %t3.out  %} %}
 
 // Currently the feature isn't supported on FPGA.
+// UNSUPPORTED: accelerator
 // FIXME: enable opaque pointers support on CPU.
-// UNSUPPORTED: accelerator, cpu
+// XFAIL: cpu
 #include "bfloat16_builtins.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/BFloat16/bfloat16_type.cpp b/sycl/test-e2e/BFloat16/bfloat16_type.cpp
index 73936eb5dc4e7..0aef086807f73 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_type.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_type.cpp
@@ -7,6 +7,7 @@
 // UNSUPPORTED: accelerator
 
 // FIXME: enable opaque pointers support on CPU.
+// XFAIL: cpu
 
 //==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==//
 //